├── requirements.txt
├── CovertCastAnalysis
│   ├── dataset_gen.py
│   ├── ParseCaptures.py
│   ├── KL_classifier.py
│   ├── EMD_classifier.py
│   ├── xgboost_classifier.py
│   └── X2_classifier.py
├── DeltaShaperAnalysis
│   ├── dataset_gen.py
│   ├── parseCaptures.py
│   ├── KL_classifier.py
│   ├── EMD_classifier.py
│   ├── xgboost_classifier.py
│   ├── X2_classifier.py
│   └── IsolationForest.py
├── FacetAnalysis
│   ├── dataset_gen.py
│   ├── ParseCaptures.py
│   ├── KL_classifier.py
│   ├── EMD_classifier.py
│   ├── xgboost_classifier.py
│   └── autoencoder.py
└── README.md

/requirements.txt:
--------------------------------------------------------------------------------
dpkt == 1.8.8
numpy == 1.13.1
scipy == 0.14.0
matplotlib == 1.5.3
scikit-learn == 0.19.0
xgboost == 0.6a2
tensorflow == 0.12.1
pyemd == 0.3.0
--------------------------------------------------------------------------------
/CovertCastAnalysis/dataset_gen.py:
--------------------------------------------------------------------------------
#!/usr/bin/env python
import sys
import csv
import glob
import os

def MergeDatasets(data_folder):
    if(os.path.exists(data_folder + '/full_dataset.csv')):
        os.remove(data_folder + '/full_dataset.csv')

    features_files = glob.glob(data_folder + "/*_dataset.csv")

    print "Merging full dataset..."
    header_saved = False
    with open(data_folder + '/full_dataset.csv','wb') as fout:
        for filename in features_files:
            print "merging " + filename
            with open(filename) as fin:
                header = next(fin)
                if not header_saved:
                    fout.write(header)
                    header_saved = True
                for line in fin:
                    fout.write(line)
    print "Dataset merged!"


def MergeSamples(data_folder):
    #Generate training dataset
    youtube_files = glob.glob(data_folder + "/YouTubeTraffic_*.csv")

    header_saved = False
    with open(data_folder + '/youtube_dataset.csv','wb') as fout:
        for filename in youtube_files:
            with open(filename) as fin:
                header = next(fin)
                if not header_saved:
                    fout.write(header)
                    header_saved = True
                for line in fin:
                    fout.write(line)

    covertcast_files = glob.glob(data_folder + "/CovertCastTraffic_*.csv")

    header_saved = False
    with open(data_folder + '/covertcast_dataset.csv','wb') as fout:
        for filename in covertcast_files:
            with open(filename) as fin:
                header = next(fin)
                if not header_saved:
                    fout.write(header)
                    header_saved = True
                for line in fin:
                    fout.write(line)

--------------------------------------------------------------------------------
/DeltaShaperAnalysis/dataset_gen.py:
--------------------------------------------------------------------------------
#!/usr/bin/env python
import sys
import csv
import glob
import os


def MergeDatasets(data_folder):
    if(os.path.exists(data_folder + '/full_dataset.csv')):
        os.remove(data_folder + '/full_dataset.csv')

    features_files = [data_folder + "deltashaper_dataset.csv", data_folder + "RegularTraffic_dataset.csv"]

    print "Merging full dataset..."
    header_saved = False
    with open(data_folder + '/full_dataset.csv','wb') as fout:
        for filename in features_files:
            print "merging " + filename
            with open(filename) as fin:
                header = next(fin)
                if not header_saved:
                    fout.write(header)
                    header_saved = True
                for line in fin:
                    fout.write(line)
    print "Dataset merged!"
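# CombinedMerging (below) builds one dataset per DeltaShaper configuration by
# pairing each DeltaShaperTraffic_{320,160} samples CSV with the RegularTraffic
# CSV, producing regular_320_dataset.csv and regular_160_dataset.csv.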
27 | 28 | 29 | def CombinedMerging(data_folder): 30 | if(os.path.exists(data_folder + '/regular_320_dataset.csv')): 31 | os.remove(data_folder + '/regular_320_dataset.csv') 32 | if(os.path.exists(data_folder + '/regular_160_dataset.csv')): 33 | os.remove(data_folder + '/regular_160_dataset.csv') 34 | 35 | features_files = [data_folder + "DeltaShaperTraffic_320_dataset.csv", data_folder + "RegularTraffic_dataset.csv"] 36 | 37 | print "Merging dataset..." 38 | header_saved = False 39 | with open(data_folder + '/regular_320_dataset.csv','wb') as fout: 40 | for filename in features_files: 41 | print "merging " + filename 42 | with open(filename) as fin: 43 | header = next(fin) 44 | if not header_saved: 45 | fout.write(header) 46 | header_saved = True 47 | for line in fin: 48 | fout.write(line) 49 | print "Dataset merged!" 50 | 51 | features_files = [data_folder + "DeltaShaperTraffic_160_dataset.csv", data_folder + "RegularTraffic_dataset.csv"] 52 | 53 | print "Merging dataset..." 54 | header_saved = False 55 | with open(data_folder + '/regular_160_dataset.csv','wb') as fout: 56 | for filename in features_files: 57 | print "merging " + filename 58 | with open(filename) as fin: 59 | header = next(fin) 60 | if not header_saved: 61 | fout.write(header) 62 | header_saved = True 63 | for line in fin: 64 | fout.write(line) 65 | print "Dataset merged!" 66 | 67 | 68 | 69 | def MergeSamples(data_folder): 70 | #Generate training dataset 71 | deltashaper_files = glob.glob(data_folder + "/DeltaShaperTraffic_*.csv") 72 | 73 | header_saved = False 74 | with open(data_folder + 'deltashaper_dataset.csv','wb') as fout: 75 | for filename in deltashaper_files: 76 | with open(filename) as fin: 77 | header = next(fin) 78 | if not header_saved: 79 | fout.write(header) 80 | header_saved = True 81 | for line in fin: 82 | fout.write(line) 83 | 84 | 85 | def GenerateDatasets(data_folder): 86 | MergeSamples(data_folder) 87 | CombinedMerging(data_folder) 88 | MergeDatasets(data_folder) 89 | -------------------------------------------------------------------------------- /FacetAnalysis/dataset_gen.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | import sys 3 | import csv 4 | import glob 5 | import os 6 | 7 | 8 | def MergeDatasets(data_folder): 9 | if(os.path.exists(data_folder + '/full_dataset.csv')): 10 | os.remove(data_folder + '/full_dataset.csv') 11 | 12 | features_files = [data_folder + "facet_dataset.csv", data_folder + "RegularTraffic_Christmas_dataset.csv"] 13 | 14 | print "Merging full dataset..." 15 | header_saved = False 16 | with open(data_folder + '/full_dataset.csv','wb') as fout: 17 | for filename in features_files: 18 | print "merging " + filename 19 | with open(filename) as fin: 20 | header = next(fin) 21 | if not header_saved: 22 | fout.write(header) 23 | header_saved = True 24 | for line in fin: 25 | fout.write(line) 26 | print "Dataset merged!" 27 | 28 | 29 | 30 | def CombinedMerging(data_folder): 31 | if(os.path.exists(data_folder + '/regular_12.5_dataset.csv')): 32 | os.remove(data_folder + '/regular_12.5_dataset.csv') 33 | if(os.path.exists(data_folder + '/regular_25_dataset.csv')): 34 | os.remove(data_folder + '/regular_25_dataset.csv') 35 | if(os.path.exists(data_folder + '/regular_50_dataset.csv')): 36 | os.remove(data_folder + '/regular_50_dataset.csv') 37 | 38 | features_files = [data_folder + "FacetTraffic_12.5_Christmas_dataset.csv", data_folder + "RegularTraffic_Christmas_dataset.csv"] 39 | 40 | print "Merging dataset..." 
41 | header_saved = False 42 | with open(data_folder + '/regular_12.5_dataset.csv','wb') as fout: 43 | for filename in features_files: 44 | print "merging " + filename 45 | with open(filename) as fin: 46 | header = next(fin) 47 | if not header_saved: 48 | fout.write(header) 49 | header_saved = True 50 | for line in fin: 51 | fout.write(line) 52 | print "Dataset merged!" 53 | 54 | features_files = [data_folder + "FacetTraffic_25_Christmas_dataset.csv", data_folder + "RegularTraffic_Christmas_dataset.csv"] 55 | 56 | print "Merging dataset..." 57 | header_saved = False 58 | with open(data_folder + '/regular_25_dataset.csv','wb') as fout: 59 | for filename in features_files: 60 | print "merging " + filename 61 | with open(filename) as fin: 62 | header = next(fin) 63 | if not header_saved: 64 | fout.write(header) 65 | header_saved = True 66 | for line in fin: 67 | fout.write(line) 68 | print "Dataset merged!" 69 | 70 | features_files = [data_folder + "FacetTraffic_50_Christmas_dataset.csv", data_folder + "RegularTraffic_Christmas_dataset.csv"] 71 | 72 | print "Merging dataset..." 73 | header_saved = False 74 | with open(data_folder + '/regular_50_dataset.csv','wb') as fout: 75 | for filename in features_files: 76 | print "merging " + filename 77 | with open(filename) as fin: 78 | header = next(fin) 79 | if not header_saved: 80 | fout.write(header) 81 | header_saved = True 82 | for line in fin: 83 | fout.write(line) 84 | print "Dataset merged!" 85 | 86 | 87 | 88 | def MergeSamples(data_folder): 89 | #Generate training dataset 90 | facet_files = glob.glob(data_folder + "/FacetTraffic_*.csv") 91 | 92 | header_saved = False 93 | with open(data_folder + '/facet_dataset.csv','wb') as fout: 94 | for filename in facet_files: 95 | with open(filename) as fin: 96 | header = next(fin) 97 | if not header_saved: 98 | fout.write(header) 99 | header_saved = True 100 | for line in fin: 101 | fout.write(line) 102 | 103 | 104 | def GenerateDatasets(data_folder): 105 | MergeSamples(data_folder) 106 | CombinedMerging(data_folder) 107 | #MergeDatasets(data_folder) 108 | 109 | -------------------------------------------------------------------------------- /DeltaShaperAnalysis/parseCaptures.py: -------------------------------------------------------------------------------- 1 | import dpkt 2 | import os 3 | from matplotlib import pyplot as plt 4 | from matplotlib.pyplot import cm 5 | import numpy as np 6 | import socket 7 | 8 | BIN_WIDTH = [15,20,50] 9 | 10 | InterPacketBins = [5000,2500,1000] 11 | 12 | 13 | auxFolder = 'auxFolder/' 14 | 15 | def RoundToNearest(n, m): 16 | r = n % m 17 | return n + m - r if r + r >= m else n - r 18 | 19 | def CreateBigrams(capsFolder, sampleFolder): 20 | for sample in os.listdir(capsFolder + sampleFolder): 21 | for binWidth in BIN_WIDTH: 22 | faux = open(auxFolder + os.path.dirname(capsFolder) + "/" + sampleFolder + "/" + sample + '/bigrams_' + str(binWidth), 'w') 23 | f = open(auxFolder + os.path.dirname(capsFolder) + "/" + sampleFolder + "/" + sample + "/packetCount_" + str(binWidth), 'r') 24 | 25 | lines = f.readlines() 26 | for index, line in enumerate(lines): 27 | try: 28 | faux.write(line.rstrip('\n') + "," + lines[index+1]) 29 | except IndexError: 30 | break #Reached last index, stop processing 31 | faux.close() 32 | f.close() 33 | 34 | 35 | def ComputeDelta(capsFolder, sampleFolder): 36 | for sample in os.listdir(capsFolder + sampleFolder): 37 | for binWidth in InterPacketBins: 38 | faux = open(auxFolder + os.path.dirname(capsFolder) + "/" + sampleFolder + "/" + sample + 
'/deltaT_' + str(binWidth), 'w') 39 | f = open(auxFolder + os.path.dirname(capsFolder) + "/" + sampleFolder + "/" + sample + "/timestamps", 'r') 40 | 41 | lines = f.readlines() 42 | for index, line in enumerate(lines): 43 | try: 44 | delta = "%0.6f" % (float(lines[index+1]) - (float(line.rstrip('\n')))) 45 | delta = float(delta) * 1000000 46 | faux.write("%s\n" % RoundToNearest(int(delta), int(binWidth))) 47 | except IndexError: 48 | break #Reached last index, stop processing 49 | 50 | faux.close() 51 | f.close() 52 | 53 | def ParseCapture(capsFolder, sampleFolder): 54 | 55 | fig = plt.figure() 56 | ax1 = fig.add_subplot(111) 57 | 58 | for sample in os.listdir(capsFolder + sampleFolder): 59 | descriptors = [] 60 | f = open(capsFolder + sampleFolder + "/" + sample + "/" + sample) 61 | print sample 62 | if not os.path.exists(auxFolder + os.path.dirname(capsFolder) + "/" + sampleFolder + "/" + sample): 63 | os.makedirs(auxFolder + os.path.dirname(capsFolder) + "/" + sampleFolder + "/" + sample) 64 | packet_count = open(auxFolder + os.path.dirname(capsFolder) + "/" + sampleFolder + "/" + sample + '/packetCount_0', 'w') 65 | timestamps = open(auxFolder + os.path.dirname(capsFolder) + "/" + sampleFolder + "/" + sample + "/timestamps", 'w') 66 | for binWidth in BIN_WIDTH: 67 | descriptors.append(open(auxFolder + os.path.dirname(capsFolder) + "/" + sampleFolder + "/" + sample + '/packetCount_' + str(binWidth), 'w')) 68 | 69 | 70 | pcap = dpkt.pcap.Reader(f) 71 | 72 | for ts, buf in pcap: 73 | eth = dpkt.ethernet.Ethernet(buf) 74 | ip_hdr = eth.data 75 | try: 76 | if (eth.type != dpkt.ethernet.ETH_TYPE_IP and eth.type != dpkt.ethernet.ETH_TYPE_IP6): 77 | continue 78 | if eth.type != dpkt.ethernet.ETH_TYPE_IP6: 79 | src_ip_addr_str = socket.inet_ntoa(ip_hdr.src) 80 | else: 81 | src_ip_addr_str = socket.inet_ntop(socket.AF_INET6, ip_hdr.src) 82 | 83 | if (ip_hdr.p == 17 and src_ip_addr_str == '172.31.0.19'): 84 | for i, descript in enumerate(BIN_WIDTH): 85 | descriptors[i].write("%s\n" % RoundToNearest(len(buf), BIN_WIDTH[i])) 86 | timestamps.write("{0:.6f}".format(ts) + "\n") 87 | packet_count.write("%s\n" % len(buf)) 88 | 89 | except Exception as e: 90 | print "[Exception]" + str(e) 91 | packet_count.close() 92 | timestamps.close() 93 | for i, descript in enumerate(BIN_WIDTH): 94 | descriptors[i].close() 95 | f.close() 96 | 97 | 98 | if __name__ == "__main__": 99 | sampleFolders = ["TrafficCaptures/480Resolution/"] 100 | modeFolders = ["RegularTraffic","DeltaShaperTraffic_320", "DeltaShaperTraffic_160"] 101 | 102 | for sampleFolder in sampleFolders: 103 | for modeFolder in modeFolders: 104 | if not os.path.exists(auxFolder + sampleFolder + modeFolder): 105 | os.makedirs(auxFolder + sampleFolder + modeFolder) 106 | ParseCapture(sampleFolder, modeFolder) 107 | CreateBigrams(sampleFolder, modeFolder) 108 | #ComputeDelta(sampleFolder, modeFolder) 109 | -------------------------------------------------------------------------------- /FacetAnalysis/ParseCaptures.py: -------------------------------------------------------------------------------- 1 | import dpkt 2 | import os 3 | from matplotlib import pyplot as plt 4 | from matplotlib.pyplot import cm 5 | import numpy as np 6 | import socket 7 | 8 | BIN_WIDTH = [15,20,50] 9 | 10 | InterPacketBins = [5000,2500,1000] 11 | 12 | 13 | auxFolder = 'auxFolder/' 14 | 15 | def RoundToNearest(n, m): 16 | r = n % m 17 | return n + m - r if r + r >= m else n - r 18 | 19 | def CreateBigrams(capsFolder, sampleFolder): 20 | for sample in os.listdir(capsFolder + 
sampleFolder): 21 | for binWidth in BIN_WIDTH: 22 | faux = open(auxFolder + os.path.dirname(capsFolder) + "/" + sampleFolder + "/" + sample + '/bigrams_' + str(binWidth), 'w') 23 | f = open(auxFolder + os.path.dirname(capsFolder) + "/" + sampleFolder + "/" + sample + "/packetCount_" + str(binWidth), 'r') 24 | 25 | lines = f.readlines() 26 | for index, line in enumerate(lines): 27 | try: 28 | faux.write(line.rstrip('\n') + "," + lines[index+1]) 29 | except IndexError: 30 | break #Reached last index, stop processing 31 | faux.close() 32 | f.close() 33 | 34 | 35 | def ComputeDelta(capsFolder, sampleFolder): 36 | for sample in os.listdir(capsFolder + sampleFolder): 37 | for binWidth in InterPacketBins: 38 | faux = open(auxFolder + os.path.dirname(capsFolder) + "/" + sampleFolder + "/" + sample + '/deltaT_' + str(binWidth), 'w') 39 | f = open(auxFolder + os.path.dirname(capsFolder) + "/" + sampleFolder + "/" + sample + "/timestamps", 'r') 40 | 41 | lines = f.readlines() 42 | for index, line in enumerate(lines): 43 | try: 44 | delta = "%0.6f" % (float(lines[index+1]) - (float(line.rstrip('\n')))) 45 | delta = float(delta) * 1000000 46 | faux.write("%s\n" % RoundToNearest(int(delta), int(binWidth))) 47 | except IndexError: 48 | break #Reached last index, stop processing 49 | 50 | faux.close() 51 | f.close() 52 | 53 | def ParseCapture(capsFolder, sampleFolder): 54 | 55 | fig = plt.figure() 56 | ax1 = fig.add_subplot(111) 57 | 58 | for sample in os.listdir(capsFolder + sampleFolder): 59 | descriptors = [] 60 | f = open(capsFolder + sampleFolder + "/" + sample + "/" + sample) 61 | print sample 62 | if not os.path.exists(auxFolder + os.path.dirname(capsFolder) + "/" + sampleFolder + "/" + sample): 63 | os.makedirs(auxFolder + os.path.dirname(capsFolder) + "/" + sampleFolder + "/" + sample) 64 | packet_count = open(auxFolder + os.path.dirname(capsFolder) + "/" + sampleFolder + "/" + sample + '/packetCount_0', 'w') 65 | timestamps = open(auxFolder + os.path.dirname(capsFolder) + "/" + sampleFolder + "/" + sample + "/timestamps", 'w') 66 | for binWidth in BIN_WIDTH: 67 | descriptors.append(open(auxFolder + os.path.dirname(capsFolder) + "/" + sampleFolder + "/" + sample + '/packetCount_' + str(binWidth), 'w')) 68 | 69 | 70 | pcap = dpkt.pcap.Reader(f) 71 | 72 | for ts, buf in pcap: 73 | eth = dpkt.ethernet.Ethernet(buf) 74 | ip_hdr = eth.data 75 | try: 76 | if (eth.type != dpkt.ethernet.ETH_TYPE_IP and eth.type != dpkt.ethernet.ETH_TYPE_IP6): 77 | continue 78 | if eth.type != dpkt.ethernet.ETH_TYPE_IP6: 79 | src_ip_addr_str = socket.inet_ntoa(ip_hdr.src) 80 | else: 81 | src_ip_addr_str = socket.inet_ntop(socket.AF_INET6, ip_hdr.src) 82 | 83 | if (ip_hdr.p == 17 and src_ip_addr_str == '172.31.0.19'): 84 | for i, descript in enumerate(BIN_WIDTH): 85 | descriptors[i].write("%s\n" % RoundToNearest(len(buf), BIN_WIDTH[i])) 86 | timestamps.write("{0:.6f}".format(ts) + "\n") 87 | packet_count.write("%s\n" % len(buf)) 88 | 89 | except Exception as e: 90 | print "[Exception]" + str(e) 91 | packet_count.close() 92 | timestamps.close() 93 | for i, descript in enumerate(BIN_WIDTH): 94 | descriptors[i].close() 95 | f.close() 96 | 97 | 98 | if __name__ == "__main__": 99 | sampleFolders = ["TrafficCaptures/240Resolution/"] 100 | modeFolders = ["RegularTraffic_Christmas","FacetTraffic_12.5_Christmas","FacetTraffic_25_Christmas","FacetTraffic_50_Christmas"] #"CensoredTraffic_Christmas" 101 | 102 | for sampleFolder in sampleFolders: 103 | for modeFolder in modeFolders: 104 | if not os.path.exists(auxFolder + sampleFolder 
+ modeFolder): 105 | os.makedirs(auxFolder + sampleFolder + modeFolder) 106 | ParseCapture(sampleFolder, modeFolder) 107 | CreateBigrams(sampleFolder, modeFolder) 108 | #ComputeDelta(sampleFolder, modeFolder) 109 | -------------------------------------------------------------------------------- /CovertCastAnalysis/ParseCaptures.py: -------------------------------------------------------------------------------- 1 | import dpkt 2 | import os 3 | from matplotlib import pyplot as plt 4 | from matplotlib.pyplot import cm 5 | import numpy as np 6 | import socket 7 | 8 | BIN_WIDTH = [15, 20, 50] 9 | 10 | InterPacketBins = [5000,2500,1000] 11 | 12 | 13 | auxFolder = 'auxFolder/' 14 | 15 | def RoundToNearest(n, m): 16 | r = n % m 17 | return n + m - r if r + r >= m else n - r 18 | 19 | def CreateBigrams(capsFolder, sampleFolder): 20 | for sample in os.listdir(capsFolder + sampleFolder): 21 | for binWidth in BIN_WIDTH: 22 | faux = open(auxFolder + os.path.dirname(capsFolder) + "/" + sampleFolder + "/" + sample + '/bigrams_' + str(binWidth), 'w') 23 | f = open(auxFolder + os.path.dirname(capsFolder) + "/" + sampleFolder + "/" + sample + "/packetCount_" + str(binWidth), 'r') 24 | 25 | lines = f.readlines() 26 | for index, line in enumerate(lines): 27 | try: 28 | faux.write(line.rstrip('\n') + "," + lines[index+1]) 29 | except IndexError: 30 | break #Reached last index, stop processing 31 | faux.close() 32 | f.close() 33 | 34 | 35 | def ComputeDelta(capsFolder, sampleFolder): 36 | for sample in os.listdir(capsFolder + sampleFolder): 37 | for binWidth in InterPacketBins: 38 | faux = open(auxFolder + os.path.dirname(capsFolder) + "/" + sampleFolder + "/" + sample + '/deltaT_' + str(binWidth), 'w') 39 | f = open(auxFolder + os.path.dirname(capsFolder) + "/" + sampleFolder + "/" + sample + "/timestamps", 'r') 40 | 41 | lines = f.readlines() 42 | for index, line in enumerate(lines): 43 | try: 44 | delta = "%0.6f" % (float(lines[index+1]) - (float(line.rstrip('\n')))) 45 | delta = float(delta) * 1000000 46 | faux.write("%s\n" % RoundToNearest(int(delta), int(binWidth))) 47 | except IndexError: 48 | break #Reached last index, stop processing 49 | 50 | faux.close() 51 | f.close() 52 | 53 | def ParseCapture(capsFolder, sampleFolder): 54 | 55 | fig = plt.figure() 56 | ax1 = fig.add_subplot(111) 57 | 58 | for sample in os.listdir(capsFolder + sampleFolder): 59 | descriptors = [] 60 | f = open(capsFolder + sampleFolder + "/" + sample) 61 | print sample 62 | if not os.path.exists(auxFolder + os.path.dirname(capsFolder) + "/" + sampleFolder + "/" + sample): 63 | os.makedirs(auxFolder + os.path.dirname(capsFolder) + "/" + sampleFolder + "/" + sample) 64 | packet_count = open(auxFolder + os.path.dirname(capsFolder) + "/" + sampleFolder + "/" + sample + '/packetCount_0', 'w') 65 | timestamps = open(auxFolder + os.path.dirname(capsFolder) + "/" + sampleFolder + "/" + sample + "/timestamps", 'w') 66 | for binWidth in BIN_WIDTH: 67 | print auxFolder + os.path.dirname(capsFolder) + "/" + sampleFolder + "/" + sample + '/packetCount_' + str(binWidth) 68 | descriptors.append(open(auxFolder + os.path.dirname(capsFolder) + "/" + sampleFolder + "/" + sample + '/packetCount_' + str(binWidth), 'w')) 69 | 70 | 71 | pcap = dpkt.pcapng.Reader(f) 72 | 73 | for ts, buf in pcap: 74 | eth = dpkt.ethernet.Ethernet(buf) 75 | ip_hdr = eth.data 76 | try: 77 | if (eth.type != dpkt.ethernet.ETH_TYPE_IP and eth.type != dpkt.ethernet.ETH_TYPE_IP6): 78 | continue 79 | if eth.type != dpkt.ethernet.ETH_TYPE_IP6: 80 | src_ip_addr_str = 
socket.inet_ntoa(ip_hdr.src) 81 | else: 82 | src_ip_addr_str = socket.inet_ntop(socket.AF_INET6, ip_hdr.src) 83 | 84 | if (ip_hdr.p == 17 and (ip_hdr.data.sport == 443 or ip_hdr.data.dport == 443)): 85 | if(ip_hdr.data.sport == 443): 86 | for i, descript in enumerate(BIN_WIDTH): 87 | descriptors[i].write("%s\n" % RoundToNearest(len(buf), BIN_WIDTH[i])) 88 | timestamps.write("{0:.6f}".format(ts) + "\n") 89 | packet_count.write("%s\n" % len(buf)) 90 | 91 | except Exception as e: 92 | print "[Exception]" + str(e) 93 | packet_count.close() 94 | timestamps.close() 95 | for i, descript in enumerate(BIN_WIDTH): 96 | descriptors[i].close() 97 | f.close() 98 | 99 | 100 | if __name__ == "__main__": 101 | sampleFolders = ["TrafficCaptures/"] 102 | 103 | modeFolders = ["YouTube_home_world_live","CovertCast_home_world"] 104 | 105 | for sampleFolder in sampleFolders: 106 | for modeFolder in modeFolders: 107 | if not os.path.exists(auxFolder + sampleFolder + modeFolder): 108 | os.makedirs(auxFolder + sampleFolder + modeFolder) 109 | #ParseCapture(sampleFolder, modeFolder) 110 | CreateBigrams(sampleFolder, modeFolder) 111 | #ComputeDelta(sampleFolder, modeFolder) 112 | -------------------------------------------------------------------------------- /CovertCastAnalysis/KL_classifier.py: -------------------------------------------------------------------------------- 1 | import dpkt 2 | import os 3 | from matplotlib import pyplot as plt 4 | from matplotlib.pyplot import cm 5 | import numpy as np 6 | import socket 7 | import collections 8 | from scipy.stats import entropy 9 | 10 | auxFolder = 'auxFolder/' 11 | 12 | cfgs = [ 13 | ["YouTube_home_world_live", 14 | "CovertCast_home_world"] 15 | ] 16 | 17 | 18 | BIN_WIDTH = [15] 19 | 20 | 21 | def ComputeFrequencyDistributions(sampleFolder, cfg, binWidth): 22 | freq_dists = [] 23 | 24 | for mode in cfg: 25 | #Compute frequency distribution for A and B 26 | freq_dist = [] 27 | for sample in os.listdir(sampleFolder + mode): 28 | 29 | f = open(auxFolder + os.path.dirname(sampleFolder) + "/" + mode + "/" + sample + '/packetCount_' + str(binWidth), 'r') 30 | 31 | bin_dict = {} 32 | bins=[] 33 | #Generate the set of all possible bins 34 | for i in range(0,1500, binWidth): 35 | bin_dict[str(i).replace(" ", "")] = 1 36 | 37 | 38 | lines = f.readlines() 39 | for line in lines: 40 | try: 41 | bins.append(line.rstrip('\n')) 42 | except IndexError: 43 | break #Reached last index, stop processing 44 | f.close() 45 | 46 | #Account for each bin elem 47 | for i in bins: 48 | bin_dict[str(i)]+=1 49 | 50 | #Order bin_key : num_packets 51 | od_dict = collections.OrderedDict(sorted(bin_dict.items(), key=lambda t: float(t[0]))) 52 | bin_list = [] 53 | for i in od_dict: 54 | bin_list.append(float(od_dict[i])) 55 | 56 | #Build up the list of a distribution samples freq dist 57 | freq_dist.append(bin_list) 58 | #Build up the list of all freq dists for different sample folders 59 | freq_dists.append(freq_dist) 60 | 61 | return freq_dists 62 | 63 | def KL_Classify(freq_dists): 64 | # A vs A 65 | AvsA_matrix = [] 66 | for i in range(0, len(freq_dists[0])): 67 | AxVsAy = [] 68 | for j in range(0, len(freq_dists[0])): 69 | d = entropy(freq_dists[0][i],freq_dists[0][j]) 70 | AxVsAy.append(d) 71 | AvsA_matrix.append(AxVsAy) 72 | 73 | # A vs B 74 | AvsB_matrix = [] 75 | for i in range(0,len(freq_dists[0])): 76 | AxVsBy = [] 77 | for j in range(0, len(freq_dists[1])): 78 | d = entropy(freq_dists[0][i],freq_dists[1][j]) 79 | AxVsBy.append(d) 80 | AvsB_matrix.append(AxVsBy) 81 | 82 | # B vs B 83 | 
BvsB_matrix = [] 84 | for i in range(0, len(freq_dists[1])): 85 | BxVsBy = [] 86 | for j in range(0, len(freq_dists[1])): 87 | d = entropy(freq_dists[1][i],freq_dists[1][j]) 88 | BxVsBy.append(d) 89 | BvsB_matrix.append(BxVsBy) 90 | 91 | # B vs A 92 | BvsA_matrix = [] 93 | for i in range(0,len(freq_dists[1])): 94 | BxVsAy = [] 95 | for j in range(0, len(freq_dists[0])): 96 | d = entropy(freq_dists[1][i],freq_dists[0][j]) 97 | BxVsAy.append(d) 98 | BvsA_matrix.append(BxVsAy) 99 | 100 | ########################## 101 | #Compute success metric 102 | #Set A - YouTube 103 | #Set B - CovertCast 104 | #TP = Correctly identify CovertCast 105 | #TN = Correctly identify YouTube 106 | ########################## 107 | 108 | total_KL_distances = 0 109 | success = 0 110 | TrueNegatives = 0 111 | TruePositives = 0 112 | 113 | #A - B 114 | for i in range(0,len(freq_dists[0])): 115 | for j in range(0, len(AvsA_matrix[i])): 116 | for k in range(0, len(AvsB_matrix[i])): 117 | total_KL_distances+=1 118 | if(AvsA_matrix[i][j] < AvsB_matrix[i][k]): 119 | success += 1 120 | TrueNegatives += 1 121 | # B - A 122 | for i in range(0,len(freq_dists[1])): 123 | for j in range(0, len(BvsB_matrix[i])): 124 | for k in range(0, len(BvsA_matrix[i])): 125 | total_KL_distances +=1 126 | if(BvsB_matrix[i][j] < BvsA_matrix[i][k]): 127 | success += 1 128 | TruePositives += 1 129 | 130 | print "Total Accuracy: " + str(success / float(total_KL_distances)) 131 | print "TruePositives: " + str(TruePositives / float(total_KL_distances/2.0)) 132 | print "TrueNegatives: " + str(TrueNegatives / float(total_KL_distances/2.0)) 133 | 134 | 135 | if __name__ == "__main__": 136 | 137 | sampleFolders = ['TrafficCaptures/'] 138 | 139 | for sampleFolder in sampleFolders: 140 | print "###########################" 141 | print os.path.dirname(sampleFolder) 142 | print "###########################" 143 | for cfg in cfgs: 144 | 145 | print "KL classifier - Regular vs " + cfg[1] 146 | for binWidth in BIN_WIDTH: 147 | print "Bin Width: " + str(binWidth) 148 | KL_Classify(ComputeFrequencyDistributions(sampleFolder, cfg, binWidth)) 149 | -------------------------------------------------------------------------------- /DeltaShaperAnalysis/KL_classifier.py: -------------------------------------------------------------------------------- 1 | import dpkt 2 | import os 3 | from matplotlib import pyplot as plt 4 | from matplotlib.pyplot import cm 5 | import numpy as np 6 | import socket 7 | import collections 8 | from scipy.stats import entropy 9 | 10 | auxFolder = 'auxFolder/' 11 | 12 | cfgs = [ 13 | ["RegularTraffic", 14 | "DeltaShaperTraffic_320"], 15 | ["RegularTraffic", 16 | "DeltaShaperTraffic_160"]] 17 | 18 | 19 | BIN_WIDTH = [15] 20 | 21 | def ComputeFrequencyDistributions(sampleFolder, cfg, binWidth): 22 | freq_dists = [] 23 | 24 | for mode in cfg: 25 | #Compute frequency distribution for A and B 26 | freq_dist = [] 27 | for sample in os.listdir(sampleFolder + mode): 28 | 29 | f = open(auxFolder + os.path.dirname(sampleFolder) + "/" + mode + "/" + sample + '/packetCount_' + str(binWidth), 'r') 30 | 31 | bin_dict = {} 32 | bins=[] 33 | #Generate the set of all possible bins 34 | for i in range(0,1500, binWidth): 35 | bin_dict[str(i).replace(" ", "")] = 1 36 | 37 | 38 | lines = f.readlines() 39 | for line in lines: 40 | try: 41 | bins.append(line.rstrip('\n')) 42 | except IndexError: 43 | break #Reached last index, stop processing 44 | f.close() 45 | 46 | #Account for each bin elem 47 | for i in bins: 48 | bin_dict[str(i)]+=1 49 | 50 | #Order bin_key : 
num_packets 51 | od_dict = collections.OrderedDict(sorted(bin_dict.items(), key=lambda t: float(t[0]))) 52 | bin_list = [] 53 | for i in od_dict: 54 | bin_list.append(float(od_dict[i])) 55 | 56 | #Build up the list of a distribution samples freq dist 57 | freq_dist.append(bin_list) 58 | #Build up the list of all freq dists for different sample folders 59 | freq_dists.append(freq_dist) 60 | 61 | return freq_dists 62 | 63 | def KL_Classify(freq_dists): 64 | # A vs A 65 | AvsA_matrix = [] 66 | for i in range(0, len(freq_dists[0])): 67 | AxVsAy = [] 68 | for j in range(0, len(freq_dists[0])): 69 | d = entropy(freq_dists[0][i],freq_dists[0][j]) 70 | AxVsAy.append(d) 71 | AvsA_matrix.append(AxVsAy) 72 | 73 | # A vs B 74 | AvsB_matrix = [] 75 | for i in range(0,len(freq_dists[0])): 76 | AxVsBy = [] 77 | for j in range(0, len(freq_dists[1])): 78 | d = entropy(freq_dists[0][i],freq_dists[1][j]) 79 | AxVsBy.append(d) 80 | AvsB_matrix.append(AxVsBy) 81 | 82 | # B vs B 83 | BvsB_matrix = [] 84 | for i in range(0, len(freq_dists[1])): 85 | BxVsBy = [] 86 | for j in range(0, len(freq_dists[1])): 87 | d = entropy(freq_dists[1][i],freq_dists[1][j]) 88 | BxVsBy.append(d) 89 | BvsB_matrix.append(BxVsBy) 90 | 91 | # B vs A 92 | BvsA_matrix = [] 93 | for i in range(0,len(freq_dists[1])): 94 | BxVsAy = [] 95 | for j in range(0, len(freq_dists[0])): 96 | d = entropy(freq_dists[1][i],freq_dists[0][j]) 97 | BxVsAy.append(d) 98 | BvsA_matrix.append(BxVsAy) 99 | 100 | ########################## 101 | #Compute success metric 102 | #Set A - YouTube 103 | #Set B - CovertCast 104 | #TP = Correctly identify CovertCast 105 | #TN = Correctly identify YouTube 106 | ########################## 107 | 108 | total_KL_distances = 0 109 | success = 0 110 | TrueNegatives = 0 111 | TruePositives = 0 112 | 113 | #A - B 114 | for i in range(0,len(freq_dists[0])): 115 | for j in range(0, len(AvsA_matrix[i])): 116 | for k in range(0, len(AvsB_matrix[i])): 117 | total_KL_distances+=1 118 | if(AvsA_matrix[i][j] < AvsB_matrix[i][k]): 119 | success += 1 120 | TrueNegatives += 1 121 | # B - A 122 | for i in range(0,len(freq_dists[1])): 123 | for j in range(0, len(BvsB_matrix[i])): 124 | for k in range(0, len(BvsA_matrix[i])): 125 | total_KL_distances +=1 126 | if(BvsB_matrix[i][j] < BvsA_matrix[i][k]): 127 | success += 1 128 | TruePositives += 1 129 | 130 | print "Total Accuracy: " + str(success / float(total_KL_distances)) 131 | print "TruePositives: " + str(TruePositives / float(total_KL_distances/2.0)) 132 | print "TrueNegatives: " + str(TrueNegatives / float(total_KL_distances/2.0)) 133 | 134 | 135 | if __name__ == "__main__": 136 | 137 | sampleFolders = ['TrafficCaptures/480Resolution/'] 138 | 139 | for sampleFolder in sampleFolders: 140 | print "###########################" 141 | print os.path.dirname(sampleFolder) 142 | print "###########################" 143 | for cfg in cfgs: 144 | print "KL classifier - " + cfg[0] + " vs " + cfg[1] 145 | for binWidth in BIN_WIDTH: 146 | print "Bin Width: " + str(binWidth) 147 | KL_Classify(ComputeFrequencyDistributions(sampleFolder, cfg, binWidth)) 148 | -------------------------------------------------------------------------------- /FacetAnalysis/KL_classifier.py: -------------------------------------------------------------------------------- 1 | import dpkt 2 | import os 3 | from matplotlib import pyplot as plt 4 | from matplotlib.pyplot import cm 5 | import numpy as np 6 | import socket 7 | import time 8 | import collections 9 | from scipy.stats import entropy 10 | 11 | auxFolder = 
'auxFolder/' 12 | 13 | cfgs = [ 14 | ["RegularTraffic_Christmas", 15 | "FacetTraffic_12.5_Christmas"], 16 | ["RegularTraffic_Christmas", 17 | "FacetTraffic_25_Christmas"], 18 | ["RegularTraffic_Christmas", 19 | "FacetTraffic_50_Christmas"] 20 | ] 21 | 22 | 23 | BIN_WIDTH = [15] 24 | 25 | def ComputeFrequencyDistributions(sampleFolder, cfg, binWidth): 26 | freq_dists = [] 27 | 28 | for mode in cfg: 29 | #Compute frequency distribution for A and B 30 | freq_dist = [] 31 | for sample in os.listdir(sampleFolder + mode): 32 | 33 | f = open(auxFolder + os.path.dirname(sampleFolder) + "/" + mode + "/" + sample + '/packetCount_' + str(binWidth), 'r') 34 | 35 | bin_dict = {} 36 | bins=[] 37 | #Generate the set of all possible bins 38 | for i in range(0,1500, binWidth): 39 | bin_dict[str(i).replace(" ", "")] = 1 40 | 41 | 42 | lines = f.readlines() 43 | for line in lines: 44 | try: 45 | bins.append(line.rstrip('\n')) 46 | except IndexError: 47 | break #Reached last index, stop processing 48 | f.close() 49 | 50 | #Account for each bin elem 51 | for i in bins: 52 | bin_dict[str(i)]+=1 53 | 54 | #Order bin_key : num_packets 55 | od_dict = collections.OrderedDict(sorted(bin_dict.items(), key=lambda t: float(t[0]))) 56 | bin_list = [] 57 | for i in od_dict: 58 | bin_list.append(float(od_dict[i])) 59 | 60 | #Build up the list of a distribution samples freq dist 61 | freq_dist.append(bin_list) 62 | #Build up the list of all freq dists for different sample folders 63 | freq_dists.append(freq_dist) 64 | 65 | return freq_dists 66 | 67 | def KL_Classify(freq_dists): 68 | 69 | #Time measurement - avg single KL 70 | """times = [] 71 | for j in range(0, len(freq_dists[1])): 72 | start_time = time.time() 73 | d = entropy(freq_dists[0][0],freq_dists[1][j]) 74 | end_time = time.time() 75 | times.append(end_time - start_time) 76 | print "Avg KL: " + "{0:.5f}".format(np.mean(times,axis=0))""" 77 | 78 | 79 | #time measurement - avg classification 80 | times = [] 81 | start_time = time.time() 82 | for j in range(0, len(freq_dists[1])): 83 | d = entropy(freq_dists[0][0],freq_dists[1][j]) 84 | for j in range(0, len(freq_dists[1])): 85 | d = entropy(freq_dists[0][1],freq_dists[1][j]) 86 | 87 | 88 | end_time = time.time() 89 | times.append(end_time - start_time) 90 | #print "Avg sample classification time: " + "{0:.5f}".format(end_time - start_time) 91 | 92 | 93 | ############################### 94 | #Model Building 95 | ############################### 96 | start_time = time.time() 97 | # A vs A 98 | AvsA_matrix = [] 99 | for i in range(0, len(freq_dists[0])): 100 | AxVsAy = [] 101 | for j in range(0, len(freq_dists[0])): 102 | d = entropy(freq_dists[0][i],freq_dists[0][j]) 103 | AxVsAy.append(d) 104 | AvsA_matrix.append(AxVsAy) 105 | 106 | 107 | 108 | 109 | # A vs B 110 | AvsB_matrix = [] 111 | for i in range(0,len(freq_dists[0])): 112 | AxVsBy = [] 113 | start_time = time.time() 114 | for j in range(0, len(freq_dists[1])): 115 | d = entropy(freq_dists[0][i],freq_dists[1][j]) 116 | AxVsBy.append(d) 117 | AvsB_matrix.append(AxVsBy) 118 | 119 | 120 | 121 | # B vs B 122 | BvsB_matrix = [] 123 | for i in range(0, len(freq_dists[1])): 124 | BxVsBy = [] 125 | for j in range(0, len(freq_dists[1])): 126 | d = entropy(freq_dists[1][i],freq_dists[1][j]) 127 | BxVsBy.append(d) 128 | BvsB_matrix.append(BxVsBy) 129 | 130 | # B vs A 131 | BvsA_matrix = [] 132 | for i in range(0,len(freq_dists[1])): 133 | BxVsAy = [] 134 | for j in range(0, len(freq_dists[0])): 135 | d = entropy(freq_dists[1][i],freq_dists[0][j]) 136 | BxVsAy.append(d) 
137 | BvsA_matrix.append(BxVsAy) 138 | 139 | end_time = time.time() 140 | print "Model Building Time: " + "{0:.5f}".format(end_time - start_time) 141 | ########################## 142 | #Compute success metric 143 | #Set A - YouTube 144 | #Set B - CovertCast 145 | #TP = Correctly identify CovertCast 146 | #TN = Correctly identify YouTube 147 | ########################## 148 | 149 | total_KL_distances = 0 150 | success = 0 151 | TrueNegatives = 0 152 | TruePositives = 0 153 | 154 | #A - B 155 | for i in range(0,len(freq_dists[0])): 156 | for j in range(0, len(AvsA_matrix[i])): 157 | for k in range(0, len(AvsB_matrix[i])): 158 | total_KL_distances+=1 159 | if(AvsA_matrix[i][j] < AvsB_matrix[i][k]): 160 | success += 1 161 | TrueNegatives += 1 162 | # B - A 163 | for i in range(0,len(freq_dists[1])): 164 | for j in range(0, len(BvsB_matrix[i])): 165 | for k in range(0, len(BvsA_matrix[i])): 166 | total_KL_distances +=1 167 | if(BvsB_matrix[i][j] < BvsA_matrix[i][k]): 168 | success += 1 169 | TruePositives += 1 170 | 171 | 172 | print "Total Accuracy: " + str(success / float(total_KL_distances)) 173 | print "TruePositives: " + str(TruePositives / float(total_KL_distances/2.0)) 174 | print "TrueNegatives: " + str(TrueNegatives / float(total_KL_distances/2.0)) 175 | 176 | 177 | if __name__ == "__main__": 178 | 179 | sampleFolders = ['TrafficCaptures/240Resolution/'] 180 | 181 | for sampleFolder in sampleFolders: 182 | print "###########################" 183 | print os.path.dirname(sampleFolder) 184 | print "###########################" 185 | for cfg in cfgs: 186 | print "KL classifier - " + cfg[0] + " vs " + cfg[1] 187 | for binWidth in BIN_WIDTH: 188 | print "Bin Width: " + str(binWidth) 189 | KL_Classify(ComputeFrequencyDistributions(sampleFolder, cfg, binWidth)) 190 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | ## Index 2 | 3 | 1. Classifiers description 4 | 2. Extracting features for feeding classifiers 5 | 3. Running the classifiers 6 | 4. Watching some more figures 7 | 5. Full example for Facet analysis 8 | 9 | #### Python 2.7 package requirements 10 | Install the required Python packages by running: 11 | `pip install -r requirements.txt` 12 | 13 | [**Note**] Not using virtualenvs, packages will be installed system-wide. 14 | 15 | #### Traffic Captures Location 16 | Traffic captures are available [here](https://turbina.gsd.inesc-id.pt/resources/resources.html). 17 | 18 | Copy each `TrafficCaptures` folder into the respective path in **MPTAnalysis** repo: 19 | 20 | MPTAnalysis/FacetAnalysis/TrafficCaptures 21 | MPTAnalysis/DeltaShaperAnalysis/TrafficCaptures 22 | MPTAnalysis/CovertCastAnalysis/TrafficCaptures 23 | 24 | ## 1- Classifiers Description 25 | ### Similarity-based classifiers 26 | *EMD classifier:* 27 | `EMD_classifier.py` -- This file includes the threshold-based EMD classifier as proposed in DeltaShaper. 28 | 29 | *Chi-Square classifier:* 30 | `X2_classifier.py`-- This file includes the Chi-Square test-based classifier as proposed in Facet. 31 | 32 | *Kullback-Leibler classifier:* 33 | `KL_classifier.py`-- This file includes the Kullback-Leibler-divergence classifier as proposed in CovertCast. 34 | 35 | ### Decision Tree-based classifiers 36 | *Decision Tree, Random Forest, and XGBoost:* 37 | `xgboost_classifier.py` -- This file includes the three decision tree-based classifiers used in our paper. 
### Semi-Supervised / Unsupervised
*Autoencoder:*
`autoencoder.py` -- This file contains the TensorFlow code required to run our semi-supervised autoencoder.

*One-Class SVM:*
`OCSVM.py` -- This file includes the One-Class SVM classifier.

*Isolation Forests:*
`IsolationForests.py` -- This file includes the Isolation Forests classifier.

## 2 - Extracting features for feeding classifiers
### Similarity-based classifiers
To use our similarity-based classifiers, raw packet captures must first be processed in order to extract binned packet sizes / bi-grams of packet sizes. `ParseCaptures.py` includes the code for parsing the raw packet captures into packet length bins of size [15, 20, 50], which are used by the [KL, X2, EMD] classifiers, respectively. Extracted features are placed in a newly generated folder called `auxFolder`.

Although `ParseCaptures.py` can also extract inter-packet timing features, we do not use these with our similarity-based classifiers.

[**Disclaimer**] Extraction can take a while; this code was not parallelized since it is only a one-time execution.

### Remaining classifiers
For the remaining classifiers, we extract features and build datasets stored in `.csv` files.

`extractFeatures.py` contains the code for extracting our two different sets of features (binned packet lengths / summary statistics) from existing packet captures. It defines one function per feature set, respectively `FeatureExtractionPLBenchmark` and `FeatureExtractionStatsBenchmark`; either can be called in the main code. `GenerateDatasets` then combines the extracted sets of features and builds the datasets.

Feature datasets are stored in the `FeatureSets` folder. For instance, `PL_60` stores the datasets obtained by extracting binned packet lengths over an interval of 60 seconds of the whole packet trace.

## 3 - Running the classifiers
### Similarity-based classifiers
`X2_classifier.py` provides two main analysis functions, which can be selected interchangeably in its `main`. `Prepare_X_RatioReproduction` reproduces the results of Facet's paper, outputting the results of a classifier with changing deltas (and enabling us to plot a ROC curve). `Prepare_X_Fixed` obtains fixed classification results for comparison with the Kullback-Leibler classifier, which only outputs fixed classification rates.

The script creates a folder called `X2` holding AUC plots and serialized TPR/FPR rates, used later to produce the figures included in the paper.

[**Warning**] This code is not parallelized. Building the models for this classifier is an overnight effort (at least for the Facet data).

----
`EMD_classifier.py` can simply be executed to output the classifier's results with changing deltas. The script prints the delta threshold at which the classifier reaches its maximum accuracy, in order to compare with the Kullback-Leibler classifier, which only outputs fixed classification rates.

The script creates a folder called `EMD` holding AUC plots and serialized TPR/FPR rates, used later to produce the figures included in the paper.

----
`KL_classifier.py` outputs fixed classification results.
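
The decision rule behind `KL_classifier.py` is a pairwise Kullback-Leibler test over binned packet-length frequency distributions, computed with `scipy.stats.entropy`. The snippet below is a minimal illustration of that computation; the two histograms are made up for the example, whereas the real script builds them from the `packetCount_15` files in `auxFolder`.

    import numpy as np
    from scipy.stats import entropy

    bin_width = 15
    n_bins = 1500 / bin_width          # packet lengths are binned up to 1500 bytes

    # Toy frequency distributions; every bin starts at 1 (as in the script) so the
    # KL divergence is always defined.
    regular_hist = np.ones(n_bins)
    regular_hist[:10] += 40            # regular traffic: mostly small packets
    covert_hist = np.ones(n_bins)
    covert_hist[-10:] += 40            # covert traffic: mostly full-size packets

    # entropy(p, q) normalizes both histograms and returns D(p || q); a sample is
    # attributed to the class whose samples it is closest to under this divergence.
    print "D(regular || covert) = " + str(entropy(regular_hist, covert_hist))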
### Decision Tree-based classifiers
`xgboost_classifier.py` outputs the classification results of our three decision tree-based classifiers at different True Positive / False Positive rate trade-offs. For training these classifiers, data is assumed to be fully labeled.

The script creates a folder called `xgBoost` for storing the ROC AUC figure of each classification effort, along with serialized data for building our paper's ROC figures and feature importance data.

[**Note**] In the `main` function, the variable `data_folder` must point to the folder containing the dataset extracted with the desired feature set. For our paper results, `FeatureSets/Stats_60` and `FeatureSets/PL_60` correspond to our feature sets based on summary statistics and binned packet lengths, respectively.

### Semi-Supervised / Unsupervised
`OCSVM.py` runs a grid search on the parameter space of (nu, gamma) for OCSVM. It outputs the average and maximum AUC obtained when classifying data points against a representation learned from legitimate video transmissions only. In its `main` function, the variable `data_folder` must point to the folder containing the dataset extracted with the desired feature set.

`autoencoder.py` runs a grid search on the parameter space of (neurons in the hidden layer, size of the compressed representation layer) for our autoencoder. It outputs the average and maximum AUC obtained when classifying data points against a representation learned from legitimate video transmissions only. In its `main` function, the variable `data_folder` must point to the folder containing the dataset extracted with the desired feature set.

`IsolationForests.py` runs a grid search on the parameter space of (number of trees, samples per tree) for our Isolation Forest. It outputs the average and maximum AUC obtained after attempting to classify unlabeled data points (a rough sketch of this kind of grid search is shown at the end of Section 4 below). In its `main` function, the variable `data_folder` must point to the folder containing the dataset extracted with the desired feature set.

The script creates a folder called `Isolation` for storing ROC AUC figures.

[**Note**] In the `main` function, the variable `data_folder` must point to the folder containing the dataset extracted with the desired feature set. For our paper results, `FeatureSets/Stats_60` and `FeatureSets/PL_60` correspond to our feature sets based on summary statistics and binned packet lengths, respectively.

## 4 - Watching some more figures
In the case of the Facet / DeltaShaper analysis, there is a folder called `Figures`. This folder includes `generateFigures.py`, which generates the figures used in our paper, plus some more detail about feature analysis.
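
To make the grid-search description in Section 3 concrete, here is a rough, self-contained sketch of the (number of trees, samples per tree) search that `IsolationForests.py` performs. It is an illustration rather than an excerpt: the stand-in data, the parameter grids, and the 0/1 labeling convention are assumptions made for the example, whereas the real script reads the `FeatureSets` datasets and also saves ROC figures.

    import numpy as np
    from sklearn.ensemble import IsolationForest
    from sklearn.metrics import roc_auc_score

    # Stand-in data so the snippet runs on its own: 200 "regular" samples plus
    # 20 "covert" outliers drawn from a shifted distribution.
    rng = np.random.RandomState(1)
    X = np.vstack([rng.normal(0, 1, (200, 5)), rng.normal(4, 1, (20, 5))])
    y = np.array([0] * 200 + [1] * 20)          # 1 marks the covert (anomalous) class

    aucs = []
    for n_trees in [50, 100, 200]:              # "number of trees"
        for samples_per_tree in [64, 128, 256]: # "samples per tree"
            forest = IsolationForest(n_estimators=n_trees,
                                     max_samples=samples_per_tree,
                                     random_state=1).fit(X)
            scores = -forest.decision_function(X)   # higher score = more anomalous
            aucs.append(roc_auc_score(y, scores))

    print "Avg AUC: " + str(np.mean(aucs)) + "  Max AUC: " + str(np.max(aucs))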
100 | 101 | ## 5 - Full example for Facet analysis 102 | 103 | #Parse raw .pcap files for generating features for similarity-based classifiers 104 | $ cd FacetAnalysis 105 | $ python ParseCaptures.py 106 | 107 | #Run any similarity-based classifier 108 | $ python [EMD_classifier.py, KL_classifier.py, X2_classifier.py] 109 | 110 | #Parse raw .pcap files for generating features for state-of-the-art ML algorithms 111 | $ python extractFeatures.py 112 | 113 | #Run any ML classifier 114 | $ python [xgboost_classifier.py, OCSVM.py, autoencoder.py, IsolationForests.py] 115 | 116 | #Generate paper figures 117 | $ cd Figures 118 | $ python generateFigures.py 119 | 120 | 121 | -------------------------------------------------------------------------------- /CovertCastAnalysis/EMD_classifier.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | import dpkt 3 | import subprocess 4 | import socket 5 | import os 6 | from random import randint 7 | import math 8 | from itertools import product 9 | import datetime 10 | from matplotlib import pyplot as plt 11 | from matplotlib.pyplot import cm 12 | from matplotlib.ticker import MultipleLocator, FormatStrFormatter 13 | import numpy as np 14 | from pyemd import emd 15 | import collections 16 | 17 | BIN_WIDTH = [50] 18 | folder = "auxFolder/" 19 | 20 | 21 | cfgs = [ 22 | ["YouTube_home_world_live", 23 | "CovertCast_home_world"] 24 | ] 25 | 26 | 27 | 28 | 29 | def GatherChatSamples(sampleFolder, baselines, binWidth): 30 | Samples = [] 31 | for baseline in baselines: 32 | for cap in os.listdir(folder + sampleFolder + baseline): 33 | Samples.append(folder + sampleFolder + baseline + "/" + cap + "/" + 'packetCount_' + str(binWidth)) 34 | return Samples 35 | 36 | def ComputeRate(sampleFolder, emdResults, num_irregular_samples, num_regular_samples, binWidth): 37 | deltas = np.arange(0.001, 1, 0.001) 38 | 39 | Sensitivity = [] 40 | Specificity = [] 41 | 42 | 43 | max_acc = 0 44 | max_delta = 0 45 | max_tpr = 0 46 | max_tnr = 0 47 | max_fpr = 0 48 | 49 | accuracy = 0 50 | for delta in deltas: 51 | FPositives = 0 52 | FNegatives = 0 53 | TPositives = 0 54 | TNegatives = 0 55 | 56 | 57 | #Positives are Facet samples classified as Facet 58 | for i, capEMD in enumerate(emdResults): 59 | if(capEMD > delta and i < num_regular_samples): # Regular baselines 60 | FPositives += 1 61 | if (capEMD < delta and i < num_regular_samples): 62 | TNegatives += 1 63 | if(capEMD <= delta and i >= num_regular_samples): #Irregular baseline 64 | FNegatives += 1 65 | if(capEMD > delta and i >= num_regular_samples): 66 | TPositives += 1 67 | """ 68 | #NEGATED 69 | for i, capEMD in enumerate(emdResults): 70 | if(capEMD > delta and i < num_regular_samples): # Regular baselines 71 | TNegatives += 1 72 | if (capEMD < delta and i < num_regular_samples): 73 | FPositives += 1 74 | if(capEMD <= delta and i >= num_regular_samples): #Irregular baseline 75 | TPositives += 1 76 | if(capEMD > delta and i >= num_regular_samples): 77 | FNegatives += 1 78 | """ 79 | Sensitivity.append(TPositives/(TPositives+float(FNegatives))) 80 | Specificity.append(TNegatives/(TNegatives+float(FPositives))) 81 | 82 | accuracy = (TPositives + TNegatives)/float(num_irregular_samples + num_regular_samples) 83 | if(accuracy > max_acc): 84 | max_acc = accuracy 85 | max_delta = delta 86 | max_tpr = TPositives/(TPositives+float(FNegatives)) 87 | max_tnr = TNegatives/(TNegatives+float(FPositives)) 88 | max_fpr = 1 - max_tnr 89 | 90 | fig = plt.figure() 91 | ax1 = 
fig.add_subplot(111) 92 | 93 | print "AUC" 94 | auc = np.trapz(np.array(Sensitivity), 1 - np.array(Specificity)) 95 | print auc 96 | #ROC Curve 97 | ax1.plot(1 - np.array(Specificity), np.array(Sensitivity), 'k.-', color='black', label = 'ROC (AUC = %0.2f)' % (auc)) 98 | ax1.plot([0, 1], [0, 1], 'k--', lw=2, color='orange', label = 'Random Guess') 99 | ax1.grid(color='black', linestyle='dotted') 100 | 101 | plt.title('Receiver Operating Characteristic (ROC)') 102 | plt.xlabel('False Positive Rate', fontsize='x-large') 103 | plt.ylabel('True Positive Rate', fontsize='x-large') 104 | plt.legend(loc='lower right', fontsize='large') 105 | 106 | plt.setp(ax1.get_xticklabels(), fontsize=14) 107 | plt.setp(ax1.get_yticklabels(), fontsize=14) 108 | 109 | max_stats = "Max acc: " + str(max_acc) + " Max TPR:" + str(max_tpr) + " Max TNR:" + str(max_tnr) + " Max FPR:" + str(max_fpr) + " delta:" + str(max_delta) 110 | 111 | fig.savefig('EMD/' + sampleFolder + baselines[1] + '/Rate_' + str(binWidth) + '.pdf') # save the figure to file 112 | plt.close(fig) 113 | 114 | return max_stats 115 | 116 | 117 | def GenerateDists(samples, binWidth): 118 | dists = [] 119 | print "Building distributions" 120 | 121 | for sample in samples: 122 | #print sample 123 | f = open(sample, 'r') 124 | 125 | Gk = {} 126 | bins=[] 127 | #Generate the set of all possible bins 128 | for i in range(0,1500, binWidth): 129 | Gk[str(i).replace(" ", "")] = 0 130 | 131 | 132 | lines = f.readlines() 133 | for line in lines: 134 | try: 135 | bins.append(line.rstrip('\n')) 136 | except IndexError: 137 | break #Reached last index, stop processing 138 | 139 | #Account for each bin elem 140 | for i in bins: 141 | Gk[str(i)]+=1 142 | 143 | od = collections.OrderedDict(sorted(Gk.items())) 144 | Gklist = [] 145 | for i in od: 146 | Gklist.append(float(od[i])) 147 | Gklist = np.array(Gklist) 148 | 149 | dists.append(Gklist) 150 | f.close() 151 | print "End - Building distributions" 152 | 153 | #Build distance matrix 154 | Gk = {} 155 | bins=[] 156 | #Generate the set of all possible bins 157 | for i in range(0,1500, binWidth): 158 | Gk[str(i).replace(" ", "")] = 0 159 | 160 | #Generate distance matrix 161 | distance_matrix = [] 162 | for i in range(0,len(Gk)): 163 | line =[] 164 | for j in range(0,len(Gk)): 165 | if(i==j): 166 | line.append(0.0) 167 | else: 168 | line.append(1.0) 169 | 170 | distance_matrix.append(np.array(line)) 171 | distance_matrix = np.array(distance_matrix) 172 | 173 | return dists, distance_matrix 174 | 175 | 176 | def Classifier(toClassify, allSamples, baseSamples, distance_matrix, binWidth): 177 | emdResults = [] 178 | emdSum = 0 179 | ################################## 180 | #Read first element in combination 181 | ################################## 182 | Gk_corelist = toClassify 183 | 184 | for n, sample in enumerate(baseSamples): 185 | 186 | Gklist = sample 187 | ############################ 188 | ###### NORMALIZATION ####### 189 | ############################ 190 | ground1 = max(Gk_corelist) 191 | ground2 = max(Gklist) 192 | if(ground1 > ground2): 193 | MAX = ground1 194 | else: 195 | MAX = ground2 196 | 197 | if(max(np.cumsum(Gk_corelist)) > max(np.cumsum(Gklist))): 198 | cSum = max(np.cumsum(Gk_corelist)) 199 | else: 200 | cSum = max(np.cumsum(Gklist)) 201 | 202 | dtm = distance_matrix/cSum 203 | 204 | emdR = float(emd(Gk_corelist, Gklist, dtm)) 205 | emdSum += emdR 206 | emdResults.append(emdR) 207 | 208 | avgEMD = emdSum / len(emdResults) 209 | #print str(avgEMD) 210 | return avgEMD 211 | 212 | def 
plotEMD(sampleFolder, baselines, binWidth): 213 | regularSamples = GatherChatSamples(sampleFolder,baselines[:-1], binWidth) 214 | allSamples = GatherChatSamples(sampleFolder, baselines, binWidth) 215 | 216 | allSamplesDists, distance_matrix = GenerateDists(allSamples, binWidth) 217 | 218 | emdResults = [] 219 | for n, bs in enumerate(allSamplesDists): 220 | #print allSamples[n] 221 | emdResults.append(Classifier(bs, allSamples, allSamplesDists[:len(regularSamples)], distance_matrix, binWidth)) 222 | 223 | acc = float(0) 224 | for i in range(0,len(regularSamples)): 225 | acc += emdResults[i] 226 | acc = acc/len(regularSamples) 227 | print "AVG Regular " + str(acc) 228 | 229 | max_stat = ComputeRate(sampleFolder, emdResults, len(allSamples) - len(regularSamples), len(regularSamples), binWidth) 230 | print max_stat 231 | 232 | 233 | if __name__ == "__main__": 234 | 235 | sampleFolders = ["TrafficCaptures/"] 236 | 237 | if not os.path.exists('EMD'): 238 | os.makedirs('EMD') 239 | 240 | for sampleFolder in sampleFolders: 241 | for baselines in cfgs: 242 | print "===========================================" 243 | print "Analyzing " + baselines[0] + " - " + baselines[1] 244 | for binWidth in BIN_WIDTH: 245 | print "##############" 246 | print "BinWidth: " + str(binWidth) 247 | if not os.path.exists('EMD/' + sampleFolder + baselines[1]): 248 | os.makedirs('EMD/' + sampleFolder + baselines[1]) 249 | plotEMD(sampleFolder, baselines, binWidth) 250 | -------------------------------------------------------------------------------- /CovertCastAnalysis/xgboost_classifier.py: -------------------------------------------------------------------------------- 1 | import matplotlib.pyplot as plt 2 | import os 3 | import csv 4 | import numpy as np 5 | from scipy import interp 6 | import random 7 | from random import shuffle 8 | import math 9 | #Classifiers 10 | from xgboost import XGBClassifier 11 | from sklearn.ensemble import RandomForestClassifier 12 | from sklearn.tree import DecisionTreeClassifier 13 | from sklearn.neighbors import KNeighborsClassifier 14 | #Eval Metrics 15 | from sklearn.model_selection import train_test_split, KFold 16 | from sklearn.metrics import accuracy_score, roc_auc_score, roc_curve, auc 17 | from sklearn.model_selection import cross_val_score 18 | 19 | np.random.seed(1) 20 | random.seed(1) 21 | 22 | 23 | def gatherHoldoutData(data_folder, cfg): 24 | SPLIT_FACTOR = 0.7 25 | #Load Datasets 26 | f = open(data_folder + cfg[0] + "_dataset.csv", 'r') 27 | reader = csv.reader(f, delimiter=',') 28 | reg = list(reader) 29 | 30 | f = open(data_folder + cfg[1] + "_dataset.csv", 'r') 31 | reader = csv.reader(f, delimiter=',') 32 | fac = list(reader) 33 | print "###########################################" 34 | print "Configuration " + cfg[1] 35 | print "###########################################" 36 | 37 | #Convert data to floats (and labels to integers) 38 | reg_data = [] 39 | for i in reg[1:]: 40 | int_array = [] 41 | for pl in i[:-1]: 42 | int_array.append(float(pl)) 43 | int_array.append(0) 44 | reg_data.append(int_array) 45 | 46 | fac_data = [] 47 | for i in fac[1:]: 48 | int_array = [] 49 | for pl in i[:-1]: 50 | int_array.append(float(pl)) 51 | int_array.append(1) 52 | fac_data.append(int_array) 53 | 54 | 55 | #Shuffle both datasets 56 | shuffled_reg_data = random.sample(reg_data, len(reg_data)) 57 | shuffled_fac_data = random.sample(fac_data, len(fac_data)) 58 | 59 | #Build label tensors 60 | reg_labels = [] 61 | for i in shuffled_reg_data: 62 | 
reg_labels.append(int(i[len(reg_data[0])-1])) 63 | 64 | fac_labels = [] 65 | for i in shuffled_fac_data: 66 | fac_labels.append(int(i[len(reg_data[0])-1])) 67 | 68 | #Take label out of data tensors 69 | for i in range(0, len(shuffled_reg_data)): 70 | shuffled_reg_data[i].pop() 71 | 72 | for i in range(0, len(shuffled_fac_data)): 73 | shuffled_fac_data[i].pop() 74 | 75 | 76 | #Build training and testing datasets 77 | #Split each class data in the appropriate proportion for training 78 | reg_proportion_index = int(len(reg_labels)* SPLIT_FACTOR) 79 | reg_train_x = shuffled_reg_data[:reg_proportion_index] 80 | reg_train_y = reg_labels[:reg_proportion_index] 81 | 82 | fac_proportion_index = int(len(fac_labels)*SPLIT_FACTOR) 83 | fac_train_x = shuffled_fac_data[:fac_proportion_index] 84 | fac_train_y = fac_labels[:fac_proportion_index] 85 | 86 | #Create training sets by combining the randomly selected samples from each class 87 | train_x = reg_train_x + fac_train_x 88 | train_y = reg_train_y + fac_train_y 89 | 90 | #Make the split for the testing data 91 | reg_test_x = shuffled_reg_data[reg_proportion_index:] 92 | reg_test_y = reg_labels[reg_proportion_index:] 93 | 94 | fac_test_x = shuffled_fac_data[fac_proportion_index:] 95 | fac_test_y = fac_labels[fac_proportion_index:] 96 | 97 | #Create testing set by combining the holdout samples 98 | test_x = reg_test_x + fac_test_x 99 | test_y = reg_test_y + fac_test_y 100 | 101 | return train_x, train_y, test_x, test_y 102 | 103 | def gatherAllData(data_folder, cfg): 104 | #Load Datasets 105 | f = open(data_folder + cfg[0] + "_dataset.csv", 'r') 106 | reader = csv.reader(f, delimiter=',') 107 | reg = list(reader) 108 | 109 | f = open(data_folder + cfg[1] + "_dataset.csv", 'r') 110 | reader = csv.reader(f, delimiter=',') 111 | fac = list(reader) 112 | print "###########################################" 113 | print "Configuration " + cfg[1] 114 | print "###########################################" 115 | 116 | #Convert data to floats (and labels to integers) 117 | reg_data = [] 118 | for i in reg[1:]: 119 | int_array = [] 120 | for pl in i[:-1]: 121 | int_array.append(float(pl)) 122 | int_array.append(0) 123 | reg_data.append(int_array) 124 | 125 | fac_data = [] 126 | for i in fac[1:]: 127 | int_array = [] 128 | for pl in i[:-1]: 129 | int_array.append(float(pl)) 130 | int_array.append(1) 131 | fac_data.append(int_array) 132 | 133 | 134 | #Shuffle both datasets 135 | shuffled_reg_data = random.sample(reg_data, len(reg_data)) 136 | shuffled_fac_data = random.sample(fac_data, len(fac_data)) 137 | 138 | #Build label tensors 139 | reg_labels = [] 140 | for i in shuffled_reg_data: 141 | reg_labels.append(int(i[len(reg_data[0])-1])) 142 | 143 | fac_labels = [] 144 | for i in shuffled_fac_data: 145 | fac_labels.append(int(i[len(reg_data[0])-1])) 146 | 147 | #Take label out of data tensors 148 | for i in range(0, len(shuffled_reg_data)): 149 | shuffled_reg_data[i].pop() 150 | 151 | for i in range(0, len(shuffled_fac_data)): 152 | shuffled_fac_data[i].pop() 153 | 154 | #Create training sets by combining the randomly selected samples from each class 155 | train_x = shuffled_reg_data + shuffled_fac_data 156 | train_y = reg_labels + fac_labels 157 | 158 | #Shuffle positive/negative samples for CV purposes 159 | x_shuf = [] 160 | y_shuf = [] 161 | index_shuf = range(len(train_x)) 162 | shuffle(index_shuf) 163 | for i in index_shuf: 164 | x_shuf.append(train_x[i]) 165 | y_shuf.append(train_y[i]) 166 | 167 | return x_shuf, y_shuf 168 | 169 | 170 | def 
runXGBoost(data_folder, cfg): 171 | #Gather the dataset 172 | print "Gather dataset" 173 | train_x, train_y, test_x, test_y = gatherHoldoutData(data_folder, cfg) 174 | 175 | 176 | param = {'max_depth': 2, 'eta': 1, 'silent': 1, 'objective': 'binary:logistic'} 177 | param['nthread'] = 4 178 | param['eval_metric'] = 'auc' 179 | 180 | 181 | model = XGBClassifier() 182 | model.fit(np.asarray(train_x), np.asarray(train_y)) 183 | 184 | y_pred = model.predict(np.asarray(test_x)) 185 | predictions = [round(value) for value in y_pred] 186 | 187 | # evaluate predictions 188 | accuracy = accuracy_score(np.asarray(test_y), predictions) 189 | print("Accuracy: %.2f%%" % (accuracy * 100.0)) 190 | 191 | y_pred = model.predict_proba(np.asarray(test_x))[:,1] 192 | print 'Area under ROC:', roc_auc_score(np.asarray(test_y),y_pred) 193 | 194 | 195 | def runClassification_CV(data_folder,cfg,classifier): 196 | print "Gather dataset" 197 | train_x, train_y= gatherAllData(data_folder, cfg) 198 | 199 | model = classifier[0] 200 | clf_name = classifier[1] 201 | 202 | #Report Cross-Validation Accuracy 203 | scores = cross_val_score(model, np.asarray(train_x), np.asarray(train_y), cv=10) 204 | print clf_name 205 | print "Avg. Accuracy: " + str(sum(scores)/float(len(scores))) 206 | 207 | cv = KFold(n_splits=10) 208 | tprs = [] 209 | aucs = [] 210 | mean_fpr = np.linspace(0, 1, 100) 211 | 212 | 213 | #Split the data in k-folds, perform classification, and report ROC 214 | i = 0 215 | for train, test in cv.split(train_x, train_y): 216 | probas_ = model.fit(np.asarray(train_x)[train], np.asarray(train_y)[train]).predict_proba(np.asarray(train_x)[test]) 217 | 218 | # Compute ROC curve and area under the curve 219 | fpr, tpr, thresholds = roc_curve(np.asarray(train_y)[test], probas_[:, 1]) 220 | tprs.append(interp(mean_fpr, fpr, tpr)) 221 | tprs[-1][0] = 0.0 222 | roc_auc = auc(fpr, tpr) 223 | aucs.append(roc_auc) 224 | #plt.plot(fpr, tpr, lw=1, alpha=0.3, label='ROC fold %d (AUC = %0.2f)' % (i, roc_auc)) 225 | i += 1 226 | 227 | plt.plot([0, 1], [0, 1], linestyle='--', lw=2, color='r', label='Random Guess', alpha=.8) 228 | 229 | 230 | mean_tpr = np.mean(tprs, axis=0) 231 | mean_tpr[-1] = 1.0 232 | mean_auc = auc(mean_fpr, mean_tpr) 233 | 234 | unblock70 = True 235 | unblock80 = True 236 | unblock90 = True 237 | unblock95 = True 238 | for n, i in enumerate(mean_tpr): 239 | if(i >= 0.7 and unblock70): 240 | print '70% TPR = ' + str(mean_fpr[n]) 241 | unblock70 = False 242 | if(i >= 0.8 and unblock80): 243 | print '80% TPR = ' + str(mean_fpr[n]) 244 | unblock80 = False 245 | if(i >= 0.9 and unblock90): 246 | print '90% TPR = ' + str(mean_fpr[n]) 247 | unblock90 = False 248 | if(i >= 0.95 and unblock95): 249 | print '95% TPR = ' + str(mean_fpr[n]) 250 | unblock95 = False 251 | 252 | #Figure properties 253 | fig = plt.figure() 254 | ax1 = fig.add_subplot(111) 255 | 256 | std_auc = np.std(aucs) 257 | plt.plot(mean_fpr, mean_tpr, color='b', label=r'Mean ROC (AUC = %0.2f $\pm$ %0.3f)' % (mean_auc, std_auc), lw=2, alpha=.8) 258 | 259 | #Compute Standard Deviation between folds 260 | std_tpr = np.std(tprs, axis=0) 261 | tprs_upper = np.minimum(mean_tpr + std_tpr, 1) 262 | tprs_lower = np.maximum(mean_tpr - std_tpr, 0) 263 | plt.fill_between(mean_fpr, tprs_lower, tprs_upper, color='grey', alpha=.3, label=r'$\pm$ ROC Std. 
Dev.') 264 | 265 | 266 | 267 | ax1.plot([0, 1], [0, 1], 'k--', lw=2, color='orange', label = 'Random Guess') 268 | ax1.grid(color='black', linestyle='dotted') 269 | 270 | plt.title('Receiver Operating Characteristic (ROC)') 271 | plt.xlabel('False Positive Rate', fontsize='x-large') 272 | plt.ylabel('True Positive Rate', fontsize='x-large') 273 | plt.legend(loc='lower right', fontsize='large') 274 | 275 | plt.setp(ax1.get_xticklabels(), fontsize=14) 276 | plt.setp(ax1.get_yticklabels(), fontsize=14) 277 | 278 | fig.savefig('xgBoost/' + "ROC_" + clf_name + "_" + cfg[1] + ".pdf") # save the figure to file 279 | plt.close(fig) 280 | 281 | if __name__ == "__main__": 282 | data_folder = 'TrafficCaptures/' 283 | 284 | cfgs = [ 285 | ["YouTube_home_world_live", 286 | "CovertCast_home_world"]] 287 | 288 | 289 | classifiers = [ 290 | [RandomForestClassifier(n_estimators=100, max_features=None), "RandomForest"], 291 | [DecisionTreeClassifier(), "Decision Tree"], 292 | [XGBClassifier(),"XGBoost"] 293 | ] 294 | 295 | 296 | if not os.path.exists('xgBoost'): 297 | os.makedirs('xgBoost') 298 | 299 | for cfg in cfgs: 300 | for classifier in classifiers: 301 | print "Running classifiers for " + cfg[0] + " and " + cfg[1] 302 | runClassification_CV(data_folder, cfg, classifier) 303 | -------------------------------------------------------------------------------- /DeltaShaperAnalysis/EMD_classifier.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | import dpkt 3 | import subprocess 4 | import socket 5 | import os 6 | from random import randint 7 | import math 8 | from itertools import product 9 | import datetime 10 | from matplotlib import pyplot as plt 11 | from matplotlib.pyplot import cm 12 | from matplotlib.ticker import MultipleLocator, FormatStrFormatter 13 | import numpy as np 14 | from pyemd import emd 15 | import collections 16 | 17 | BIN_WIDTH = [50] #20,50,100 18 | folder = "auxFolder/" 19 | 20 | cfgs = [ 21 | ["RegularTraffic", 22 | "DeltaShaperTraffic_320"], 23 | ["RegularTraffic", 24 | "DeltaShaperTraffic_160"]] 25 | 26 | 27 | def GatherChatSamples(sampleFolder, baselines, binWidth): 28 | Samples = [] 29 | for baseline in baselines: 30 | for cap in os.listdir(folder + sampleFolder + baseline): 31 | Samples.append(folder + sampleFolder + baseline + "/" + cap + "/" + 'packetCount_' + str(binWidth)) 32 | return Samples 33 | 34 | def ComputeRate(sampleFolder, emdResults, num_irregular_samples, num_regular_samples, binWidth): 35 | deltas = np.arange(0.001, 1, 0.001) 36 | 37 | Sensitivity = [] 38 | Specificity = [] 39 | 40 | holding90 = True 41 | holding80 = True 42 | holding70 = True 43 | 44 | thresh90 = 0 45 | thresh80 = 0 46 | thresh70 = 0 47 | 48 | val90 = 0 49 | val80 = 0 50 | val70 = 0 51 | 52 | max_acc = 0 53 | max_delta = 0 54 | max_tpr = 0 55 | max_tnr = 0 56 | max_fpr = 0 57 | 58 | found_conservative_threshold = False 59 | conservative_acc = 0 60 | conservative_delta = 0 61 | conservative_tnr = 0 62 | 63 | accuracy = 0 64 | for delta in deltas: 65 | FPositives = 0 66 | FNegatives = 0 67 | TPositives = 0 68 | TNegatives = 0 69 | 70 | 71 | #Positives are DS samples classified as DS 72 | for i, capEMD in enumerate(emdResults): 73 | if(capEMD > delta and i < num_regular_samples): # Regular baselines 74 | FPositives += 1 75 | if (capEMD < delta and i < num_regular_samples): 76 | TNegatives += 1 77 | if(capEMD <= delta and i >= num_regular_samples): #Irregular baseline 78 | FNegatives += 1 79 | if(capEMD > delta and i >= 
num_regular_samples): 80 | TPositives += 1 81 | """ 82 | #NEGATED 83 | for i, capEMD in enumerate(emdResults): 84 | if(capEMD > delta and i < num_regular_samples): # Regular baselines 85 | TNegatives += 1 86 | if (capEMD < delta and i < num_regular_samples): 87 | FPositives += 1 88 | if(capEMD <= delta and i >= num_regular_samples): #Irregular baseline 89 | TPositives += 1 90 | if(capEMD > delta and i >= num_regular_samples): 91 | FNegatives += 1 92 | """ 93 | Sensitivity.append(TPositives/(TPositives+float(FNegatives))) 94 | Specificity.append(TNegatives/(TNegatives+float(FPositives))) 95 | 96 | accuracy = (TPositives + TNegatives)/float(num_irregular_samples + num_regular_samples) 97 | TNR = TNegatives/(TNegatives+float(FPositives)) 98 | FNR = FNegatives/(TPositives+float(FNegatives)) 99 | TPR = TPositives/(TPositives+float(FNegatives)) 100 | FPR = FPositives/(FPositives+float(TNegatives)) 101 | #print delta 102 | #print TNegatives/(TNegatives+float(FPositives)) 103 | 104 | if(holding90): 105 | if(FNR >= 0.1): 106 | holding90 = False 107 | thresh90 = delta 108 | val90 = FPR 109 | 110 | if(holding80): 111 | if(FNR >= 0.2): 112 | holding80 = False 113 | thresh80 = delta 114 | val80 = FPR 115 | 116 | if(holding70): 117 | if(FNR >= 0.3): 118 | holding70 = False 119 | thresh70 = delta 120 | val70 = FPR 121 | 122 | #Conservative threshold - the delta where all legitimate videos are classified as such 123 | if(TNegatives/(TNegatives+float(FPositives)) >= 1 and not found_conservative_threshold): 124 | conservative_acc = accuracy 125 | conservative_delta = delta 126 | conservative_tpr = TPositives/(TPositives+float(FNegatives)) 127 | found_conservative_threshold = True 128 | 129 | if(accuracy > max_acc): 130 | max_acc = accuracy 131 | max_delta = delta 132 | max_tpr = TPositives/(TPositives+float(FNegatives)) 133 | max_tnr = TNegatives/(TNegatives+float(FPositives)) 134 | max_fpr = 1 - max_tnr 135 | 136 | fig = plt.figure() 137 | ax1 = fig.add_subplot(111) 138 | 139 | print "TPR90 = " + str(val90) 140 | print "TPR80 = " + str(val80) 141 | print "TPR70 = " + str(val70) 142 | 143 | print "AUC" 144 | auc = np.trapz(np.array(Sensitivity)[::-1], (1-np.array(Specificity))[::-1]) 145 | print auc 146 | #ROC Curve 147 | np.save('EMD/' + sampleFolder + baselines[1] + '/Rate_' + str(binWidth) + '_Sensitivity', np.array(Sensitivity)) 148 | np.save('EMD/' + sampleFolder + baselines[1] + '/Rate_' + str(binWidth) + '_Specificity', np.array(Specificity)) 149 | 150 | ax1.plot(1 - np.array(Specificity), np.array(Sensitivity), 'k.-', color='black', label = 'ROC (AUC = %0.2f)' % (auc)) 151 | ax1.plot([0, 1], [0, 1], 'k--', lw=2, color='orange', label = 'Random Guess') 152 | ax1.grid(color='black', linestyle='dotted') 153 | 154 | plt.title('Receiver Operating Characteristic (ROC)') 155 | plt.xlabel('False Positive Rate', fontsize='x-large') 156 | plt.ylabel('True Positive Rate', fontsize='x-large') 157 | plt.legend(loc='lower right', fontsize='large') 158 | 159 | plt.setp(ax1.get_xticklabels(), fontsize=14) 160 | plt.setp(ax1.get_yticklabels(), fontsize=14) 161 | 162 | max_stats = "Max acc: " + str(max_acc) + " Max TPR:" + str(max_tpr) + " Max TNR:" + str(max_tnr) + " Max FPR:" + str(max_fpr) + " delta:" + str(max_delta) 163 | 164 | fig.savefig('EMD/' + sampleFolder + baselines[1] + '/Rate_' + str(binWidth) + '.pdf') # save the figure to file 165 | plt.close(fig) 166 | 167 | conservative_stats = "Con. acc: " + str(conservative_acc) + " Con. 
TPR:" + str(conservative_tnr) + " delta:" + str(conservative_delta) 168 | print conservative_stats 169 | return max_stats 170 | 171 | 172 | def GenerateDists(samples, binWidth): 173 | dists = [] 174 | print "Building distributions" 175 | 176 | for sample in samples: 177 | #print sample 178 | f = open(sample, 'r') 179 | 180 | Gk = {} 181 | bins=[] 182 | #Generate the set of all possible bins 183 | for i in range(0,1500, binWidth): 184 | Gk[str(i).replace(" ", "")] = 0 185 | 186 | 187 | lines = f.readlines() 188 | for line in lines: 189 | try: 190 | bins.append(line.rstrip('\n')) 191 | except IndexError: 192 | break #Reached last index, stop processing 193 | 194 | #Account for each bin elem 195 | for i in bins: 196 | Gk[str(i)]+=1 197 | 198 | od = collections.OrderedDict(sorted(Gk.items())) 199 | Gklist = [] 200 | for i in od: 201 | Gklist.append(float(od[i])) 202 | Gklist = np.array(Gklist) 203 | 204 | dists.append(Gklist) 205 | f.close() 206 | print "End - Building distributions" 207 | 208 | #Build distance matrix 209 | Gk = {} 210 | bins=[] 211 | #Generate the set of all possible bins 212 | for i in range(0,1500, binWidth): 213 | Gk[str(i).replace(" ", "")] = 0 214 | 215 | #Generate distance matrix 216 | distance_matrix = [] 217 | for i in range(0,len(Gk)): 218 | line =[] 219 | for j in range(0,len(Gk)): 220 | if(i==j): 221 | line.append(0.0) 222 | else: 223 | line.append(1.0) 224 | 225 | distance_matrix.append(np.array(line)) 226 | distance_matrix = np.array(distance_matrix) 227 | 228 | return dists, distance_matrix 229 | 230 | 231 | def Classifier(toClassify, allSamples, baseSamples, distance_matrix, binWidth): 232 | emdResults = [] 233 | emdSum = 0 234 | ################################## 235 | #Read first element in combination 236 | ################################## 237 | Gk_corelist = toClassify 238 | 239 | for n, sample in enumerate(baseSamples): 240 | 241 | Gklist = sample 242 | ############################ 243 | ###### NORMALIZATION ####### 244 | ############################ 245 | ground1 = max(Gk_corelist) 246 | ground2 = max(Gklist) 247 | if(ground1 > ground2): 248 | MAX = ground1 249 | else: 250 | MAX = ground2 251 | 252 | if(max(np.cumsum(Gk_corelist)) > max(np.cumsum(Gklist))): 253 | cSum = max(np.cumsum(Gk_corelist)) 254 | else: 255 | cSum = max(np.cumsum(Gklist)) 256 | 257 | dtm = distance_matrix/cSum 258 | 259 | emdR = float(emd(Gk_corelist, Gklist, dtm)) 260 | emdSum += emdR 261 | emdResults.append(emdR) 262 | 263 | avgEMD = emdSum / len(emdResults) 264 | #print str(avgEMD) 265 | return avgEMD 266 | 267 | def plotEMD(sampleFolder, baselines, binWidth): 268 | regularSamples = GatherChatSamples(sampleFolder,baselines[:-1], binWidth) 269 | allSamples = GatherChatSamples(sampleFolder, baselines, binWidth) 270 | 271 | allSamplesDists, distance_matrix = GenerateDists(allSamples, binWidth) 272 | 273 | emdResults = [] 274 | for n, bs in enumerate(allSamplesDists): 275 | #print allSamples[n] 276 | emdResults.append(Classifier(bs, allSamples, allSamplesDists[:len(regularSamples)], distance_matrix, binWidth)) 277 | 278 | acc = float(0) 279 | for i in range(0,len(regularSamples)): 280 | acc += emdResults[i] 281 | acc = acc/len(regularSamples) 282 | print "AVG Regular " + str(acc) 283 | 284 | max_stat = ComputeRate(sampleFolder, emdResults, len(allSamples) - len(regularSamples), len(regularSamples), binWidth) 285 | print max_stat 286 | 287 | fig = plt.figure() 288 | ax1 = fig.add_subplot(111) 289 | 290 | means = [np.mean(x) for x in emdResults] 291 | 
plt.scatter(range(1,len(emdResults)+1), means) 292 | 293 | 294 | minor_ticks = np.arange(0, len(emdResults)+1, 1) 295 | ax1.set_xticks(minor_ticks, minor=True) 296 | majorLocator = MultipleLocator(5) 297 | majorFormatter = FormatStrFormatter('%d') 298 | ax1.xaxis.set_major_locator(majorLocator) 299 | ax1.xaxis.set_major_formatter(majorFormatter) 300 | 301 | for label in (ax1.get_xticklabels() + ax1.get_yticklabels()): 302 | label.set_fontsize(14) 303 | 304 | start, end = ax1.get_xlim() 305 | ax1.yaxis.set_ticks(np.arange(start, end, 0.025)) 306 | ax1.yaxis.set_major_formatter(FormatStrFormatter('%0.3f')) 307 | plt.xlim(xmin=0, xmax=len(allSamples)) 308 | plt.ylim(ymin=0,ymax=0.5) 309 | 310 | plt.setp(ax1.get_xticklabels(), fontsize=14) 311 | plt.setp(ax1.get_yticklabels(), fontsize=14) 312 | plt.title(max_stat, fontsize='xx-small') 313 | plt.xlabel('Stream i', fontsize='x-large') 314 | plt.ylabel('EMD Cost', fontsize='x-large') 315 | fig.savefig('EMD/' + sampleFolder + baselines[1] + '/EMD_' + str(binWidth) + '.pdf') # save the figure to file 316 | plt.close(fig) 317 | 318 | 319 | if __name__ == "__main__": 320 | 321 | sampleFolders = ["TrafficCaptures/480Resolution/"] 322 | 323 | if not os.path.exists('EMD'): 324 | os.makedirs('EMD') 325 | 326 | for sampleFolder in sampleFolders: 327 | for baselines in cfgs: 328 | print "===========================================" 329 | print "Analyzing " + baselines[0] + " - " + baselines[1] 330 | for binWidth in BIN_WIDTH: 331 | print "##############" 332 | print "BinWidth: " + str(binWidth) 333 | if not os.path.exists('EMD/' + sampleFolder + baselines[1]): 334 | os.makedirs('EMD/' + sampleFolder + baselines[1]) 335 | plotEMD(sampleFolder, baselines, binWidth) 336 | -------------------------------------------------------------------------------- /FacetAnalysis/EMD_classifier.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | import dpkt 3 | import subprocess 4 | import socket 5 | import os 6 | from random import randint 7 | import math 8 | from itertools import product 9 | import time 10 | import datetime 11 | from matplotlib import pyplot as plt 12 | from matplotlib.pyplot import cm 13 | from matplotlib.ticker import MultipleLocator, FormatStrFormatter 14 | import numpy as np 15 | from pyemd import emd 16 | import collections 17 | 18 | BIN_WIDTH = [50] 19 | folder = "auxFolder/" 20 | 21 | cfgs = [ 22 | ["RegularTraffic_Christmas", 23 | "FacetTraffic_12.5_Christmas"], 24 | ["RegularTraffic_Christmas", 25 | "FacetTraffic_25_Christmas"], 26 | ["RegularTraffic_Christmas", 27 | "FacetTraffic_50_Christmas"]] 28 | 29 | 30 | 31 | def GatherChatSamples(sampleFolder, baselines, binWidth): 32 | Samples = [] 33 | for baseline in baselines: 34 | for cap in os.listdir(folder + sampleFolder + baseline): 35 | Samples.append(folder + sampleFolder + baseline + "/" + cap + "/" + 'packetCount_' + str(binWidth)) 36 | return Samples 37 | 38 | def ComputeRate(sampleFolder, emdResults, num_irregular_samples, num_regular_samples, binWidth): 39 | deltas = np.arange(0.001, 0.5, 0.001) 40 | 41 | Sensitivity = [] 42 | Specificity = [] 43 | 44 | holding90 = True 45 | holding80 = True 46 | holding70 = True 47 | 48 | thresh90 = 0 49 | thresh80 = 0 50 | thresh70 = 0 51 | 52 | val90 = 0 53 | val80 = 0 54 | val70 = 0 55 | 56 | max_acc = 0 57 | max_delta = 0 58 | max_tpr = 0 59 | max_tnr = 0 60 | max_fpr = 0 61 | 62 | accuracy = 0 63 | for delta in deltas: 64 | FPositives = 0 65 | FNegatives = 0 66 | TPositives = 0 67 | 
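#Note: unlike the DeltaShaper ComputeRate above, this Facet variant disables the direct
#labelling block (left inside the triple-quoted string below) and runs the #NEGATED
#block instead: streams whose average EMD to the regular baselines is <= delta are
#counted as Facet positives, and regular streams above delta count as true negatives.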
TNegatives = 0 68 | 69 | """ 70 | #Positives are Facet samples classified as Facet 71 | for i, capEMD in enumerate(emdResults): 72 | if(capEMD > delta and i < num_regular_samples): # Regular baselines 73 | FPositives += 1 74 | if (capEMD < delta and i < num_regular_samples): 75 | TNegatives += 1 76 | if(capEMD <= delta and i >= num_regular_samples): #Irregular baseline 77 | FNegatives += 1 78 | if(capEMD > delta and i >= num_regular_samples): 79 | TPositives += 1 80 | """ 81 | #NEGATED 82 | for i, capEMD in enumerate(emdResults): 83 | if(capEMD > delta and i < num_regular_samples): # Regular baselines 84 | TNegatives += 1 85 | if (capEMD < delta and i < num_regular_samples): 86 | FPositives += 1 87 | if(capEMD <= delta and i >= num_regular_samples): #Irregular baseline 88 | TPositives += 1 89 | if(capEMD > delta and i >= num_regular_samples): 90 | FNegatives += 1 91 | 92 | Sensitivity.append(TPositives/(TPositives+float(FNegatives))) 93 | Specificity.append(TNegatives/(TNegatives+float(FPositives))) 94 | 95 | accuracy = (TPositives + TNegatives)/float(num_irregular_samples + num_regular_samples) 96 | TNR = TNegatives/(TNegatives+float(FPositives)) 97 | FNR = FNegatives/(TPositives+float(FNegatives)) 98 | TPR = TPositives/(TPositives+float(FNegatives)) 99 | FPR = FPositives/(FPositives+float(TNegatives)) 100 | 101 | if(holding90): 102 | if(FPR >= 0.1): 103 | holding90 = False 104 | thresh90 = delta 105 | val90 = FNR 106 | 107 | if(holding80): 108 | if(FPR >= 0.2): 109 | holding80 = False 110 | thresh80 = delta 111 | val80 = FNR 112 | 113 | if(holding70): 114 | if(FPR >= 0.3): 115 | holding70 = False 116 | thresh70 = delta 117 | val70 = FNR 118 | 119 | if(accuracy > max_acc): 120 | max_acc = accuracy 121 | max_delta = delta 122 | max_tpr = TPositives/(TPositives+float(FNegatives)) 123 | max_tnr = TNegatives/(TNegatives+float(FPositives)) 124 | max_fpr = 1 - max_tnr 125 | 126 | fig = plt.figure() 127 | ax1 = fig.add_subplot(111) 128 | 129 | print "TPR90 = " + str(val90) 130 | print "TPR80 = " + str(val80) 131 | print "TPR70 = " + str(val70) 132 | 133 | print "AUC" 134 | auc = np.trapz(np.array(Sensitivity), 1 - np.array(Specificity)) 135 | print auc 136 | #ROC Curve 137 | 138 | np.save('EMD/' + sampleFolder + baselines[1] + '/Rate_' + str(binWidth) + '_Sensitivity', np.array(Sensitivity)) 139 | np.save('EMD/' + sampleFolder + baselines[1] + '/Rate_' + str(binWidth) + '_Specificity', np.array(Specificity)) 140 | ax1.plot(1 - np.array(Specificity), np.array(Sensitivity), 'k.-', color='black', label = 'ROC (AUC = %0.2f)' % (auc)) 141 | ax1.plot([0, 1], [0, 1], 'k--', lw=2, color='orange', label = 'Random Guess') 142 | ax1.grid(color='black', linestyle='dotted') 143 | 144 | plt.title('Receiver Operating Characteristic (ROC)') 145 | plt.xlabel('False Positive Rate', fontsize='x-large') 146 | plt.ylabel('True Positive Rate', fontsize='x-large') 147 | plt.legend(loc='lower right', fontsize='large') 148 | 149 | plt.setp(ax1.get_xticklabels(), fontsize=14) 150 | plt.setp(ax1.get_yticklabels(), fontsize=14) 151 | 152 | max_stats = "Max acc: " + str(max_acc) + " Max TPR:" + str(max_tpr) + " Max TNR:" + str(max_tnr) + " Max FPR:" + str(max_fpr) + " delta:" + str(max_delta) 153 | 154 | fig.savefig('EMD/' + sampleFolder + baselines[1] + '/Rate_' + str(binWidth) + '.pdf') # save the figure to file 155 | plt.close(fig) 156 | 157 | return max_stats 158 | 159 | 160 | def GenerateDists(samples, binWidth): 161 | dists = [] 162 | print "Building distributions" 163 | 164 | for sample in samples: 165 | #print 
sample 166 | f = open(sample, 'r') 167 | 168 | Gk = {} 169 | bins=[] 170 | #Generate the set of all possible bins 171 | for i in range(0,1500, binWidth): 172 | Gk[str(i).replace(" ", "")] = 0 173 | 174 | 175 | lines = f.readlines() 176 | for line in lines: 177 | try: 178 | bins.append(line.rstrip('\n')) 179 | except IndexError: 180 | break #Reached last index, stop processing 181 | 182 | #Account for each bin elem 183 | for i in bins: 184 | Gk[str(i)]+=1 185 | 186 | od = collections.OrderedDict(sorted(Gk.items())) 187 | Gklist = [] 188 | for i in od: 189 | Gklist.append(float(od[i])) 190 | Gklist = np.array(Gklist) 191 | 192 | dists.append(Gklist) 193 | f.close() 194 | print "End - Building distributions" 195 | 196 | #Build distance matrix 197 | Gk = {} 198 | bins=[] 199 | #Generate the set of all possible bins 200 | for i in range(0,1500, binWidth): 201 | Gk[str(i).replace(" ", "")] = 0 202 | 203 | #Generate distance matrix 204 | distance_matrix = [] 205 | for i in range(0,len(Gk)): 206 | line =[] 207 | for j in range(0,len(Gk)): 208 | if(i==j): 209 | line.append(0.0) 210 | else: 211 | line.append(1.0) 212 | 213 | distance_matrix.append(np.array(line)) 214 | distance_matrix = np.array(distance_matrix) 215 | 216 | return dists, distance_matrix 217 | 218 | 219 | def Classifier(toClassify, allSamples, baseSamples, distance_matrix, binWidth): 220 | emdResults = [] 221 | emdSum = 0 222 | ################################## 223 | #Read first element in combination 224 | ################################## 225 | Gk_corelist = toClassify 226 | 227 | times = [] 228 | #start_time = time.time() 229 | for n, sample in enumerate(baseSamples): 230 | 231 | Gklist = sample 232 | ############################ 233 | ###### NORMALIZATION ####### 234 | ############################ 235 | ground1 = max(Gk_corelist) 236 | ground2 = max(Gklist) 237 | if(ground1 > ground2): 238 | MAX = ground1 239 | else: 240 | MAX = ground2 241 | 242 | if(max(np.cumsum(Gk_corelist)) > max(np.cumsum(Gklist))): 243 | cSum = max(np.cumsum(Gk_corelist)) 244 | else: 245 | cSum = max(np.cumsum(Gklist)) 246 | 247 | dtm = distance_matrix/cSum 248 | 249 | #start_time = time.time() 250 | emdR = float(emd(Gk_corelist, Gklist, dtm)) 251 | #end_time = time.time() 252 | #times.append(end_time - start_time) 253 | emdSum += emdR 254 | emdResults.append(emdR) 255 | #end_time = time.time() 256 | #print "Sample classification time: " + "{0:.5f}".format(end_time - start_time) 257 | #print "Avg. 
EMD time: " + "{0:.5f}".format(np.mean(times,axis=0)) 258 | avgEMD = emdSum / len(emdResults) 259 | #print str(avgEMD) 260 | return avgEMD 261 | 262 | def plotEMD(sampleFolder, baselines, binWidth): 263 | regularSamples = GatherChatSamples(sampleFolder,baselines[:-1], binWidth) 264 | allSamples = GatherChatSamples(sampleFolder, baselines, binWidth) 265 | 266 | allSamplesDists, distance_matrix = GenerateDists(allSamples, binWidth) 267 | 268 | start_time = time.time() 269 | emdResults = [] 270 | times = [] 271 | for n, bs in enumerate(allSamplesDists): 272 | #print allSamples[n] 273 | start_sample_time = time.time() 274 | emdResults.append(Classifier(bs, allSamples, allSamplesDists[:len(regularSamples)], distance_matrix, binWidth)) 275 | end_sample_time = time.time() 276 | times.append(end_sample_time - start_sample_time) 277 | print "Avg Sample Classification: " + "{0:.5f}".format(np.mean(times,axis=0)) 278 | end_time = time.time() 279 | print "Time Elapsed: " + "{0:.5f}".format(end_time - start_time) 280 | 281 | acc = float(0) 282 | for i in range(0,len(regularSamples)): 283 | acc += emdResults[i] 284 | acc = acc/len(regularSamples) 285 | print "AVG Regular " + str(acc) 286 | 287 | max_stat = ComputeRate(sampleFolder, emdResults, len(allSamples) - len(regularSamples), len(regularSamples), binWidth) 288 | print max_stat 289 | """ 290 | fig = plt.figure() 291 | ax1 = fig.add_subplot(111) 292 | 293 | means = [np.mean(x) for x in emdResults] 294 | plt.scatter(range(1,len(emdResults)+1), means) 295 | 296 | 297 | minor_ticks = np.arange(0, len(emdResults)+1, 1) 298 | ax1.set_xticks(minor_ticks, minor=True) 299 | majorLocator = MultipleLocator(5) 300 | majorFormatter = FormatStrFormatter('%d') 301 | ax1.xaxis.set_major_locator(majorLocator) 302 | ax1.xaxis.set_major_formatter(majorFormatter) 303 | 304 | for label in (ax1.get_xticklabels() + ax1.get_yticklabels()): 305 | label.set_fontsize(14) 306 | 307 | start, end = ax1.get_xlim() 308 | ax1.yaxis.set_ticks(np.arange(start, end, 0.025)) 309 | ax1.yaxis.set_major_formatter(FormatStrFormatter('%0.3f')) 310 | plt.xlim(xmin=0, xmax=len(allSamples)) 311 | plt.ylim(ymin=0,ymax=0.5) 312 | 313 | plt.setp(ax1.get_xticklabels(), fontsize=14) 314 | plt.setp(ax1.get_yticklabels(), fontsize=14) 315 | plt.title(max_stat, fontsize='xx-small') 316 | plt.xlabel('Stream i', fontsize='x-large') 317 | plt.ylabel('EMD Cost', fontsize='x-large') 318 | fig.savefig('EMD/' + sampleFolder + baselines[1] + '/EMD_' + str(binWidth) + '.pdf') # save the figure to file 319 | plt.close(fig) 320 | """ 321 | 322 | if __name__ == "__main__": 323 | 324 | sampleFolders = ["TrafficCaptures/240Resolution/"] 325 | 326 | if not os.path.exists('EMD'): 327 | os.makedirs('EMD') 328 | 329 | for sampleFolder in sampleFolders: 330 | for baselines in cfgs: 331 | print "===========================================" 332 | print "Analyzing " + baselines[0] + " - " + baselines[1] 333 | for binWidth in BIN_WIDTH: 334 | print "##############" 335 | print "BinWidth: " + str(binWidth) 336 | if not os.path.exists('EMD/' + sampleFolder + baselines[1]): 337 | os.makedirs('EMD/' + sampleFolder + baselines[1]) 338 | plotEMD(sampleFolder, baselines, binWidth) 339 | -------------------------------------------------------------------------------- /DeltaShaperAnalysis/xgboost_classifier.py: -------------------------------------------------------------------------------- 1 | import matplotlib.pyplot as plt 2 | import os 3 | import csv 4 | import numpy as np 5 | from scipy import interp 6 | import random 7 | from 
random import shuffle 8 | import math 9 | import time 10 | #Classifiers 11 | from xgboost import XGBClassifier 12 | from sklearn.ensemble import RandomForestClassifier 13 | from sklearn.tree import DecisionTreeClassifier 14 | from sklearn.neighbors import KNeighborsClassifier 15 | #Eval Metrics 16 | from sklearn.model_selection import train_test_split, KFold 17 | from sklearn.metrics import accuracy_score, roc_auc_score, roc_curve, auc 18 | from sklearn.model_selection import cross_val_score 19 | 20 | np.random.seed(1) 21 | random.seed(1) 22 | 23 | 24 | def gatherHoldoutData(data_folder, cfg): 25 | SPLIT_FACTOR = 0.7 26 | #Load Datasets 27 | f = open(data_folder + cfg[0] + "_dataset.csv", 'r') 28 | reader = csv.reader(f, delimiter=',') 29 | reg = list(reader) 30 | 31 | f = open(data_folder + cfg[1] + "_dataset.csv", 'r') 32 | reader = csv.reader(f, delimiter=',') 33 | fac = list(reader) 34 | print "###########################################" 35 | print "Configuration " + cfg[1] 36 | print "###########################################" 37 | 38 | #Convert data to floats (and labels to integers) 39 | features_id = reg[0] 40 | reg_data = [] 41 | for i in reg[1:]: 42 | int_array = [] 43 | for pl in i[:-1]: 44 | int_array.append(float(pl)) 45 | int_array.append(0) 46 | reg_data.append(int_array) 47 | 48 | fac_data = [] 49 | for i in fac[1:]: 50 | int_array = [] 51 | for pl in i[:-1]: 52 | int_array.append(float(pl)) 53 | int_array.append(1) 54 | fac_data.append(int_array) 55 | 56 | 57 | #Shuffle both datasets 58 | shuffled_reg_data = random.sample(reg_data, len(reg_data)) 59 | shuffled_fac_data = random.sample(fac_data, len(fac_data)) 60 | 61 | #Build label tensors 62 | reg_labels = [] 63 | for i in shuffled_reg_data: 64 | reg_labels.append(int(i[len(reg_data[0])-1])) 65 | 66 | fac_labels = [] 67 | for i in shuffled_fac_data: 68 | fac_labels.append(int(i[len(reg_data[0])-1])) 69 | 70 | #Take label out of data tensors 71 | for i in range(0, len(shuffled_reg_data)): 72 | shuffled_reg_data[i].pop() 73 | 74 | for i in range(0, len(shuffled_fac_data)): 75 | shuffled_fac_data[i].pop() 76 | 77 | 78 | #Build training and testing datasets 79 | #Split each class data in the appropriate proportion for training 80 | reg_proportion_index = int(len(reg_labels)* SPLIT_FACTOR) 81 | reg_train_x = shuffled_reg_data[:reg_proportion_index] 82 | reg_train_y = reg_labels[:reg_proportion_index] 83 | 84 | fac_proportion_index = int(len(fac_labels)*SPLIT_FACTOR) 85 | fac_train_x = shuffled_fac_data[:fac_proportion_index] 86 | fac_train_y = fac_labels[:fac_proportion_index] 87 | 88 | #Create training sets by combining the randomly selected samples from each class 89 | train_x = reg_train_x + fac_train_x 90 | train_y = reg_train_y + fac_train_y 91 | 92 | #Make the split for the testing data 93 | reg_test_x = shuffled_reg_data[reg_proportion_index:] 94 | reg_test_y = reg_labels[reg_proportion_index:] 95 | 96 | fac_test_x = shuffled_fac_data[fac_proportion_index:] 97 | fac_test_y = fac_labels[fac_proportion_index:] 98 | 99 | #Create testing set by combining the holdout samples 100 | test_x = reg_test_x + fac_test_x 101 | test_y = reg_test_y + fac_test_y 102 | 103 | return train_x, train_y, test_x, test_y 104 | 105 | def gatherAllData(data_folder, cfg, dataset_fraction): 106 | #Load Datasets 107 | f = open(data_folder + cfg[0] + "_dataset.csv", 'r') 108 | reader = csv.reader(f, delimiter=',') 109 | reg = list(reader) 110 | reg = reg[:int(dataset_fraction*len(reg))] 111 | 112 | 113 | f = open(data_folder + cfg[1] + 
"_dataset.csv", 'r') 114 | reader = csv.reader(f, delimiter=',') 115 | fac = list(reader) 116 | fac = fac[:int(dataset_fraction*len(fac))] 117 | 118 | print "###########################################" 119 | print "Configuration " + cfg[1] 120 | print "###########################################" 121 | 122 | #Convert data to floats (and labels to integers) 123 | features_id = reg[0] 124 | reg_data = [] 125 | for i in reg[1:]: 126 | int_array = [] 127 | for pl in i[:-1]: 128 | int_array.append(float(pl)) 129 | int_array.append(0) 130 | reg_data.append(int_array) 131 | 132 | fac_data = [] 133 | for i in fac[1:]: 134 | int_array = [] 135 | for pl in i[:-1]: 136 | int_array.append(float(pl)) 137 | int_array.append(1) 138 | fac_data.append(int_array) 139 | 140 | 141 | #Shuffle both datasets 142 | shuffled_reg_data = random.sample(reg_data, len(reg_data)) 143 | shuffled_fac_data = random.sample(fac_data, len(fac_data)) 144 | 145 | #Build label tensors 146 | reg_labels = [] 147 | for i in shuffled_reg_data: 148 | reg_labels.append(int(i[len(reg_data[0])-1])) 149 | 150 | fac_labels = [] 151 | for i in shuffled_fac_data: 152 | fac_labels.append(int(i[len(reg_data[0])-1])) 153 | 154 | #Take label out of data tensors 155 | for i in range(0, len(shuffled_reg_data)): 156 | shuffled_reg_data[i].pop() 157 | 158 | for i in range(0, len(shuffled_fac_data)): 159 | shuffled_fac_data[i].pop() 160 | 161 | #Create training sets by combining the randomly selected samples from each class 162 | train_x = shuffled_reg_data + shuffled_fac_data 163 | train_y = reg_labels + fac_labels 164 | 165 | #Shuffle positive/negative samples for CV purposes 166 | x_shuf = [] 167 | y_shuf = [] 168 | index_shuf = range(len(train_x)) 169 | shuffle(index_shuf) 170 | for i in index_shuf: 171 | x_shuf.append(train_x[i]) 172 | y_shuf.append(train_y[i]) 173 | 174 | return x_shuf, y_shuf, features_id 175 | 176 | 177 | def runXGBoost(data_folder, cfg): 178 | #Gather the dataset 179 | print "Gather dataset" 180 | train_x, train_y, test_x, test_y = gatherHoldoutData(data_folder, cfg) 181 | 182 | 183 | param = {'max_depth': 2, 'eta': 1, 'silent': 1, 'objective': 'binary:logistic'} 184 | param['nthread'] = 4 185 | param['eval_metric'] = 'auc' 186 | 187 | 188 | model = XGBClassifier() 189 | model.fit(np.asarray(train_x), np.asarray(train_y)) 190 | 191 | y_pred = model.predict(np.asarray(test_x)) 192 | predictions = [round(value) for value in y_pred] 193 | 194 | # evaluate predictions 195 | accuracy = accuracy_score(np.asarray(test_y), predictions) 196 | print("Accuracy: %.2f%%" % (accuracy * 100.0)) 197 | 198 | y_pred = model.predict_proba(np.asarray(test_x))[:,1] 199 | print 'Area under ROC:', roc_auc_score(np.asarray(test_y),y_pred) 200 | 201 | 202 | def runClassification_CV(data_folder, feature_set, cfg,classifier): 203 | print "Gather dataset" 204 | dataset_fraction = 1.0 205 | train_x, train_y, features_id = gatherAllData(data_folder, cfg, dataset_fraction) 206 | 207 | model = classifier[0] 208 | clf_name = classifier[1] 209 | 210 | #Report Cross-Validation Accuracy 211 | #scores = cross_val_score(model, np.asarray(train_x), np.asarray(train_y), cv=10) 212 | print clf_name 213 | #print "Avg. 
Accuracy: " + str(sum(scores)/float(len(scores))) 214 | 215 | cv = KFold(n_splits=10) 216 | tprs = [] 217 | aucs = [] 218 | mean_fpr = np.linspace(0, 1, 100) 219 | train_times = [] 220 | test_times = [] 221 | importances = [] 222 | 223 | #Split the data in k-folds, perform classification, and report ROC 224 | i = 0 225 | for train, test in cv.split(train_x, train_y): 226 | 227 | start_train = time.time() 228 | model = model.fit(np.asarray(train_x)[train], np.asarray(train_y)[train]) 229 | end_train = time.time() 230 | train_times.append(end_train - start_train) 231 | 232 | start_test = time.time() 233 | probas_ = model.predict_proba(np.asarray(train_x)[test]) 234 | end_test = time.time() 235 | test_times.append(end_test - start_test) 236 | 237 | """ 238 | c=[] 239 | for value in np.asarray(train_x)[test]: 240 | a = np.reshape(value,(1, -1)) 241 | c.append(a) 242 | 243 | load = [] 244 | for v in c: 245 | start_test = time.time() 246 | a = model.predict_proba(v) 247 | end_test = time.time() 248 | load.append(end_test - start_test) 249 | print "Individual prediction avg: " + "{0:.5f}".format(np.mean(load)) 250 | """ 251 | fpr, tpr, thresholds = roc_curve(np.asarray(train_y)[test], probas_[:, 1]) 252 | tprs.append(interp(mean_fpr, fpr, tpr)) 253 | tprs[-1][0] = 0.0 254 | roc_auc = auc(fpr, tpr) 255 | aucs.append(roc_auc) 256 | 257 | #Check feature importance in this fold 258 | f_imp = model.feature_importances_ 259 | importances.append(f_imp) 260 | #plt.plot(fpr, tpr, lw=1, alpha=0.3, label='ROC fold %d (AUC = %0.2f)' % (i, roc_auc)) 261 | i += 1 262 | 263 | plt.plot([0, 1], [0, 1], linestyle='--', lw=2, color='r', label='Random Guess', alpha=.8) 264 | 265 | 266 | mean_tpr = np.mean(tprs, axis=0) 267 | mean_tpr[-1] = 1.0 268 | mean_auc = auc(mean_fpr, mean_tpr) 269 | print "Model AUC: " + "{0:.3f}".format(mean_auc) 270 | print "Training time (Avg. fold): " + str(np.mean(train_times, axis=0)) 271 | print "Test time (Avg. fold): " + str(np.mean(test_times, axis=0)) 272 | 273 | unblock70 = True 274 | unblock80 = True 275 | unblock90 = True 276 | unblock95 = True 277 | for n, i in enumerate(mean_tpr): 278 | if(i >= 0.7 and unblock70): 279 | print '70% TPR = ' + "{0:.3f}".format(mean_fpr[n]) 280 | unblock70 = False 281 | if(i >= 0.8 and unblock80): 282 | print '80% TPR = ' + "{0:.3f}".format(mean_fpr[n]) 283 | unblock80 = False 284 | if(i >= 0.9 and unblock90): 285 | print '90% TPR = ' + "{0:.3f}".format(mean_fpr[n]) 286 | unblock90 = False 287 | if(i >= 0.95 and unblock95): 288 | print '95% TPR = ' + "{0:.3f}".format(mean_fpr[n]) 289 | unblock95 = False 290 | 291 | #Figure properties 292 | fig = plt.figure() 293 | ax1 = fig.add_subplot(111) 294 | 295 | std_auc = np.std(aucs) 296 | np.save('xgBoost/' + feature_set + "/ROC_" + clf_name + "_" + cfg[1] + "_Sensitivity", np.array(mean_tpr)) 297 | np.save('xgBoost/' + feature_set + "/ROC_" + clf_name + "_" + cfg[1] + "_Specificity", np.array(mean_fpr)) 298 | 299 | plt.plot(mean_fpr, mean_tpr, color='b', label=r'Mean ROC (AUC = %0.2f $\pm$ %0.3f)' % (mean_auc, std_auc), lw=2, alpha=.8) 300 | 301 | #Compute Standard Deviation between folds 302 | std_tpr = np.std(tprs, axis=0) 303 | tprs_upper = np.minimum(mean_tpr + std_tpr, 1) 304 | tprs_lower = np.maximum(mean_tpr - std_tpr, 0) 305 | plt.fill_between(mean_fpr, tprs_lower, tprs_upper, color='grey', alpha=.3, label=r'$\pm$ ROC Std. 
Dev.') 306 | 307 | 308 | 309 | ax1.plot([0, 1], [0, 1], 'k--', lw=2, color='orange', label = 'Random Guess') 310 | ax1.grid(color='black', linestyle='dotted') 311 | 312 | plt.title('Receiver Operating Characteristic (ROC)') 313 | plt.xlabel('False Positive Rate', fontsize='x-large') 314 | plt.ylabel('True Positive Rate', fontsize='x-large') 315 | plt.legend(loc='lower right', fontsize='large') 316 | 317 | plt.setp(ax1.get_xticklabels(), fontsize=14) 318 | plt.setp(ax1.get_yticklabels(), fontsize=14) 319 | 320 | #fig.savefig('xgBoost/' + feature_set + "/ROC_" + clf_name + "_" + cfg[1] + ".pdf") # save the figure to file 321 | plt.close(fig) 322 | 323 | mean_importances = [] 324 | for n in range(0,len(importances[0])): 325 | mean_imp = (importances[0][n] + importances[1][n] + importances[2][n] + importances[3][n] + importances[4][n] + importances[5][n] + importances[6][n] + importances[7][n] + importances[8][n] + importances[9][n])/10.0 326 | mean_importances.append(mean_imp) 327 | f_imp = zip(mean_importances,features_id) 328 | f_imp.sort(key = lambda t: t[0], reverse=True) 329 | 330 | np.save('xgBoost/' + feature_set + "/FeatureImportance_" + clf_name + "_" + cfg[1], np.array(f_imp)) 331 | 332 | #for f in f_imp[:20]: 333 | # print "Importance: %f, Feature: %s" % (f[0], f[1]) 334 | 335 | 336 | def runClassification_adhocCV(data_folder,feature_set, cfg,classifier): 337 | print "Gather dataset" 338 | dataset_fraction = 1.0 339 | train_x, train_y, features_id = gatherAllData(data_folder, cfg, dataset_fraction) 340 | 341 | model = classifier[0] 342 | clf_name = classifier[1] 343 | 344 | #Report Cross-Validation Accuracy 345 | #scores = cross_val_score(model, np.asarray(train_x), np.asarray(train_y), cv=10) 346 | print clf_name 347 | #print "Avg. Accuracy: " + str(sum(scores)/float(len(scores))) 348 | 349 | cv = KFold(n_splits=10) 350 | tprs = [] 351 | aucs = [] 352 | mean_fpr = np.linspace(0, 1, 100) 353 | train_times = [] 354 | test_times = [] 355 | importances = [] 356 | 357 | #Split the data in k-folds, perform classification, and report ROC 358 | 359 | for i in range(0,10): 360 | X_train, X_test, y_train, y_test = train_test_split(train_x, train_y, test_size=0.9) 361 | start_train = time.time() 362 | model = model.fit(np.asarray(X_train), np.asarray(y_train)) 363 | end_train = time.time() 364 | train_times.append(end_train - start_train) 365 | 366 | start_test = time.time() 367 | probas_ = model.predict_proba(np.asarray(X_test)) 368 | end_test = time.time() 369 | test_times.append(end_test - start_test) 370 | 371 | # Compute ROC curve and area under the curve 372 | fpr, tpr, thresholds = roc_curve(np.asarray(y_test), probas_[:, 1], pos_label=1) 373 | #print "Accuracy " + str(accuracy_score(np.asarray(train_y)[test], probas_[:, 1])) 374 | tprs.append(interp(mean_fpr, fpr, tpr)) 375 | tprs[-1][0] = 0.0 376 | roc_auc = auc(fpr, tpr) 377 | aucs.append(roc_auc) 378 | 379 | #Check feature importance in this fold 380 | f_imp = model.feature_importances_ 381 | importances.append(f_imp) 382 | 383 | 384 | plt.plot([0, 1], [0, 1], linestyle='--', lw=2, color='r', label='Random Guess', alpha=.8) 385 | 386 | 387 | mean_tpr = np.mean(tprs, axis=0) 388 | mean_tpr[-1] = 1.0 389 | mean_auc = auc(mean_fpr, mean_tpr) 390 | print "Model AUC: " + "{0:.3f}".format(mean_auc) 391 | print "Training time (Avg. fold): " + str(np.mean(train_times, axis=0)) 392 | print "Test time (Avg. 
fold): " + str(np.mean(test_times, axis=0)) 393 | 394 | 395 | unblock70 = True 396 | unblock80 = True 397 | unblock90 = True 398 | unblock95 = True 399 | for n, i in enumerate(mean_tpr): 400 | if(i >= 0.7 and unblock70): 401 | print '70% TPR = ' + "{0:.3f}".format(mean_fpr[n]) 402 | unblock70 = False 403 | if(i >= 0.8 and unblock80): 404 | print '80% TPR = ' + "{0:.3f}".format(mean_fpr[n]) 405 | unblock80 = False 406 | if(i >= 0.9 and unblock90): 407 | print '90% TPR = ' + "{0:.3f}".format(mean_fpr[n]) 408 | unblock90 = False 409 | if(i >= 0.95 and unblock95): 410 | print '95% TPR = ' + "{0:.3f}".format(mean_fpr[n]) 411 | unblock95 = False 412 | 413 | #Figure properties 414 | fig = plt.figure() 415 | ax1 = fig.add_subplot(111) 416 | 417 | std_auc = np.std(aucs) 418 | 419 | #np.save('xgBoost/' + feature_set + "/ROC_" + clf_name + "_" + cfg[1] + "_Sensitivity", np.array(mean_tpr)) 420 | #np.save('xgBoost/' + feature_set + "/ROC_" + clf_name + "_" + cfg[1] + "_Specificity", np.array(mean_fpr)) 421 | plt.plot(mean_fpr, mean_tpr, color='b', label=r'Mean ROC (AUC = %0.2f $\pm$ %0.3f)' % (mean_auc, std_auc), lw=2, alpha=.8) 422 | 423 | #Compute Standard Deviation between folds 424 | std_tpr = np.std(tprs, axis=0) 425 | tprs_upper = np.minimum(mean_tpr + std_tpr, 1) 426 | tprs_lower = np.maximum(mean_tpr - std_tpr, 0) 427 | plt.fill_between(mean_fpr, tprs_lower, tprs_upper, color='grey', alpha=.3, label=r'$\pm$ ROC Std. Dev.') 428 | 429 | 430 | 431 | ax1.plot([0, 1], [0, 1], 'k--', lw=2, color='orange', label = 'Random Guess') 432 | ax1.grid(color='black', linestyle='dotted') 433 | 434 | plt.title('Receiver Operating Characteristic (ROC)') 435 | plt.xlabel('False Positive Rate', fontsize='x-large') 436 | plt.ylabel('True Positive Rate', fontsize='x-large') 437 | plt.legend(loc='lower right', fontsize='large') 438 | 439 | plt.setp(ax1.get_xticklabels(), fontsize=14) 440 | plt.setp(ax1.get_yticklabels(), fontsize=14) 441 | 442 | #fig.savefig('xgBoost/' + feature_set + "/ROC_" + clf_name + "_" + cfg[1] + ".pdf") # save the figure to file 443 | plt.close(fig) 444 | 445 | #Compute mean importance of feature accross CV folds 446 | bin_number = list(range(len(train_x[0]))) 447 | mean_importances = [] 448 | for n in range(0,len(importances[0])): 449 | mean_imp = (importances[0][n] + importances[1][n] + importances[2][n] + importances[3][n] + importances[4][n] + importances[5][n] + importances[6][n] + importances[7][n] + importances[8][n] + importances[9][n])/10.0 450 | mean_importances.append(mean_imp) 451 | #print mean_importances 452 | f_imp = zip(bin_number,mean_importances,features_id) 453 | f_imp.sort(key = lambda t: t[1], reverse=True) 454 | 455 | #np.save('xgBoost/' + feature_set + "/FeatureImportance_" + clf_name + "_" + cfg[1], np.array(f_imp)) 456 | 457 | #for f in f_imp[:20]: 458 | # print "Importance: %f, Feature: %s" % (f[1], f[2]) 459 | 460 | 461 | if __name__ == "__main__": 462 | 463 | cfgs = [ 464 | ["RegularTraffic", 465 | "DeltaShaperTraffic_320"], 466 | ["RegularTraffic", 467 | "DeltaShaperTraffic_160"]] 468 | 469 | if not os.path.exists('xgBoost'): 470 | os.makedirs('xgBoost') 471 | 472 | classifiers = [ 473 | [DecisionTreeClassifier(), "DecisionTree"], 474 | [RandomForestClassifier(n_estimators=100, max_features='auto',n_jobs=1), "RandomForest"], 475 | [XGBClassifier(),"XGBoost"] 476 | ] 477 | 478 | feature_set = 'Stats_60' 479 | data_folder = 'FeatureSets/' + feature_set + '/' 480 | if not os.path.exists('xgBoost/' + feature_set): 481 | os.makedirs('xgBoost/' + feature_set) 482 | 483 
| print "\n=================================================" 484 | print "One-class SVM - Summary Statistic Features - Set1" 485 | print "=================================================" 486 | for cfg in cfgs: 487 | for classifier in classifiers: 488 | print "Running classifiers for " + cfg[0] + " and " + cfg[1] 489 | runClassification_CV(data_folder, feature_set, cfg, classifier) 490 | print "#####################################\n" 491 | 492 | 493 | feature_set = 'PL_60' 494 | data_folder = 'FeatureSets/' + feature_set + '/' 495 | if not os.path.exists('xgBoost/' + feature_set): 496 | os.makedirs('xgBoost/' + feature_set) 497 | 498 | print "\n=================================================" 499 | print "One-class SVM - Packet Length Features - Set2" 500 | print "=================================================" 501 | for cfg in cfgs: 502 | for classifier in classifiers: 503 | print "Running classifiers for " + cfg[0] + " and " + cfg[1] 504 | runClassification_CV(data_folder, feature_set, cfg, classifier) 505 | 506 | 507 | 508 | 509 | 510 | -------------------------------------------------------------------------------- /FacetAnalysis/xgboost_classifier.py: -------------------------------------------------------------------------------- 1 | import matplotlib.pyplot as plt 2 | import os 3 | import csv 4 | import numpy as np 5 | from scipy import interp 6 | import random 7 | from random import shuffle 8 | import math 9 | import time 10 | 11 | import sklearn 12 | from sklearn import preprocessing 13 | from sklearn.model_selection import ParameterGrid 14 | #Classifiers 15 | from xgboost import XGBClassifier 16 | import xgboost as xgb 17 | from sklearn.ensemble import RandomForestClassifier 18 | from sklearn.tree import DecisionTreeClassifier 19 | from sklearn.svm import SVC 20 | #Eval Metrics 21 | import sys 22 | from sklearn.model_selection import train_test_split, KFold 23 | from sklearn.metrics import accuracy_score, roc_auc_score, roc_curve, auc 24 | from sklearn.model_selection import cross_val_score 25 | from sklearn.decomposition import PCA 26 | from sklearn.model_selection import GridSearchCV 27 | from sklearn.metrics import classification_report 28 | 29 | sklearn.set_config(assume_finite=True) 30 | np.random.seed(1) 31 | random.seed(1) 32 | 33 | 34 | def gatherHoldoutData(data_folder, cfg): 35 | SPLIT_FACTOR = 0.7 36 | #Load Datasets 37 | f = open(data_folder + cfg[0] + "_dataset.csv", 'r') 38 | reader = csv.reader(f, delimiter=',') 39 | reg = list(reader) 40 | 41 | f = open(data_folder + cfg[1] + "_dataset.csv", 'r') 42 | reader = csv.reader(f, delimiter=',') 43 | fac = list(reader) 44 | print "###########################################" 45 | print "Configuration " + cfg[1] 46 | print "###########################################" 47 | 48 | #Convert data to floats (and labels to integers) 49 | reg_data = [] 50 | 51 | for i in reg[1:]: 52 | int_array = [] 53 | for pl in i[:-1]: 54 | int_array.append(float(pl)) 55 | int_array.append(0) 56 | reg_data.append(int_array) 57 | 58 | fac_data = [] 59 | for i in fac[1:]: 60 | int_array = [] 61 | for pl in i[:-1]: 62 | int_array.append(float(pl)) 63 | int_array.append(1) 64 | fac_data.append(int_array) 65 | 66 | 67 | #Shuffle both datasets 68 | shuffled_reg_data = random.sample(reg_data, len(reg_data)) 69 | shuffled_fac_data = random.sample(fac_data, len(fac_data)) 70 | 71 | #Build label tensors 72 | reg_labels = [] 73 | for i in shuffled_reg_data: 74 | reg_labels.append(int(i[len(reg_data[0])-1])) 75 | 76 | fac_labels = [] 77 | for i in 
shuffled_fac_data: 78 | fac_labels.append(int(i[len(reg_data[0])-1])) 79 | 80 | #Take label out of data tensors 81 | for i in range(0, len(shuffled_reg_data)): 82 | shuffled_reg_data[i].pop() 83 | 84 | for i in range(0, len(shuffled_fac_data)): 85 | shuffled_fac_data[i].pop() 86 | 87 | 88 | #Build training and testing datasets 89 | #Split each class data in the appropriate proportion for training 90 | reg_proportion_index = int(len(reg_labels)* SPLIT_FACTOR) 91 | reg_train_x = shuffled_reg_data[:reg_proportion_index] 92 | reg_train_y = reg_labels[:reg_proportion_index] 93 | 94 | fac_proportion_index = int(len(fac_labels)*SPLIT_FACTOR) 95 | fac_train_x = shuffled_fac_data[:fac_proportion_index] 96 | fac_train_y = fac_labels[:fac_proportion_index] 97 | 98 | #Create training sets by combining the randomly selected samples from each class 99 | train_x = reg_train_x + fac_train_x 100 | train_y = reg_train_y + fac_train_y 101 | 102 | #Make the split for the testing data 103 | reg_test_x = shuffled_reg_data[reg_proportion_index:] 104 | reg_test_y = reg_labels[reg_proportion_index:] 105 | 106 | fac_test_x = shuffled_fac_data[fac_proportion_index:] 107 | fac_test_y = fac_labels[fac_proportion_index:] 108 | 109 | #Create testing set by combining the holdout samples 110 | test_x = reg_test_x + fac_test_x 111 | test_y = reg_test_y + fac_test_y 112 | 113 | return train_x, train_y, test_x, test_y 114 | 115 | def gatherAllData(data_folder, cfg, dataset_fraction): 116 | #Load Datasets 117 | f = open(data_folder + cfg[0] + "_dataset.csv", 'r') 118 | reader = csv.reader(f, delimiter=',') 119 | reg = list(reader) 120 | reg = reg[:int(dataset_fraction*len(reg))] 121 | #print sys.getsizeof(reg) 122 | #print sys.getsizeof(reg[0]) 123 | #print sys.getsizeof(reg[1]) 124 | f = open(data_folder + cfg[1] + "_dataset.csv", 'r') 125 | reader = csv.reader(f, delimiter=',') 126 | fac = list(reader) 127 | fac = fac[:int(dataset_fraction*len(fac))] 128 | 129 | #print "Size regular dataset: " + str(len(reg)) 130 | #print "Size censored dataset: " + str(len(fac)) 131 | print "###########################################" 132 | print "Configuration " + cfg[1] 133 | print "###########################################" 134 | 135 | #Convert data to floats (and labels to integers) 136 | features_id = reg[0] 137 | reg_data = [] 138 | for i in reg[1:]: 139 | int_array = [] 140 | for pl in i[:-1]: 141 | int_array.append(float(pl)) 142 | int_array.append(0) 143 | reg_data.append(int_array) 144 | 145 | fac_data = [] 146 | for i in fac[1:]: 147 | int_array = [] 148 | for pl in i[:-1]: 149 | int_array.append(float(pl)) 150 | int_array.append(1) 151 | fac_data.append(int_array) 152 | 153 | 154 | #Shuffle both datasets 155 | shuffled_reg_data = random.sample(reg_data, len(reg_data)) 156 | shuffled_fac_data = random.sample(fac_data, len(fac_data)) 157 | #shuffled_reg_data = shuffled_reg_data[:int(dataset_fraction*len(shuffled_reg_data))] 158 | #shuffled_fac_data = shuffled_fac_data[:int(dataset_fraction*len(shuffled_fac_data))] 159 | #Build label tensors 160 | reg_labels = [] 161 | for i in shuffled_reg_data: 162 | reg_labels.append(int(i[len(reg_data[0])-1])) 163 | 164 | fac_labels = [] 165 | for i in shuffled_fac_data: 166 | fac_labels.append(int(i[len(reg_data[0])-1])) 167 | 168 | #Take label out of data tensors 169 | for i in range(0, len(shuffled_reg_data)): 170 | shuffled_reg_data[i].pop() 171 | 172 | for i in range(0, len(shuffled_fac_data)): 173 | shuffled_fac_data[i].pop() 174 | 175 | #Create training sets by combining the randomly 
selected samples from each class 176 | train_x = shuffled_reg_data + shuffled_fac_data 177 | train_y = reg_labels + fac_labels 178 | 179 | #Shuffle positive/negative samples for CV purposes 180 | x_shuf = [] 181 | y_shuf = [] 182 | index_shuf = range(len(train_x)) 183 | shuffle(index_shuf) 184 | for i in index_shuf: 185 | x_shuf.append(train_x[i]) 186 | y_shuf.append(train_y[i]) 187 | 188 | return x_shuf, y_shuf, features_id 189 | 190 | 191 | def runClassification_CV(data_folder,feature_set, cfg,classifier): 192 | print "Gather dataset" 193 | dataset_fraction = 1.0 194 | train_x, train_y, features_id = gatherAllData(data_folder, cfg, dataset_fraction) 195 | 196 | model = classifier[0] 197 | clf_name = classifier[1] 198 | 199 | #Report Cross-Validation Accuracy 200 | #scores = cross_val_score(model, np.asarray(train_x), np.asarray(train_y), cv=10) 201 | print clf_name 202 | #print "Avg. Accuracy: " + str(sum(scores)/float(len(scores))) 203 | 204 | cv = KFold(n_splits=10) 205 | tprs = [] 206 | aucs = [] 207 | mean_fpr = np.linspace(0, 1, 100) 208 | train_times = [] 209 | test_times = [] 210 | importances = [] 211 | 212 | #Split the data in k-folds, perform classification, and report ROC 213 | i = 0 214 | for train, test in cv.split(train_x, train_y): 215 | 216 | start_train = time.time() 217 | model = model.fit(np.asarray(train_x)[train], np.asarray(train_y)[train]) 218 | end_train = time.time() 219 | train_times.append(end_train - start_train) 220 | 221 | start_test = time.time() 222 | probas_ = model.predict_proba(np.asarray(train_x)[test]) 223 | end_test = time.time() 224 | test_times.append(end_test - start_test) 225 | 226 | """ 227 | #For time benchmarking 228 | c=[] 229 | for value in np.asarray(train_x)[test]: 230 | a = np.reshape(value,(1, -1)) 231 | c.append(a) 232 | 233 | load = [] 234 | for v in c: 235 | start_test = time.time() 236 | a = model.predict_proba(v) 237 | end_test = time.time() 238 | load.append(end_test - start_test) 239 | print "Individual prediction avg: " + "{0:.5f}".format(np.mean(load)) 240 | """ 241 | 242 | # Compute ROC curve and area under the curve 243 | fpr, tpr, thresholds = roc_curve(np.asarray(train_y)[test], probas_[:, 1], pos_label=1) 244 | #print "Accuracy " + str(accuracy_score(np.asarray(train_y)[test], probas_[:, 1])) 245 | tprs.append(interp(mean_fpr, fpr, tpr)) 246 | tprs[-1][0] = 0.0 247 | roc_auc = auc(fpr, tpr) 248 | aucs.append(roc_auc) 249 | 250 | #Check feature importance in this fold 251 | f_imp = model.feature_importances_ 252 | importances.append(f_imp) 253 | #plt.plot(fpr, tpr, lw=1, alpha=0.3, label='ROC fold %d (AUC = %0.2f)' % (i, roc_auc)) 254 | i += 1 255 | 256 | plt.plot([0, 1], [0, 1], linestyle='--', lw=2, color='r', label='Random Guess', alpha=.8) 257 | 258 | 259 | mean_tpr = np.mean(tprs, axis=0) 260 | mean_tpr[-1] = 1.0 261 | mean_auc = auc(mean_fpr, mean_tpr) 262 | print "Model AUC: " + "{0:.3f}".format(mean_auc) 263 | print "Training time (Avg. fold): " + str(np.mean(train_times, axis=0)) 264 | print "Test time (Avg. 
fold): " + str(np.mean(test_times, axis=0)) 265 | 266 | 267 | unblock70 = True 268 | unblock80 = True 269 | unblock90 = True 270 | unblock95 = True 271 | for n, i in enumerate(mean_tpr): 272 | if(i >= 0.7 and unblock70): 273 | print '70% TPR = ' + "{0:.3f}".format(mean_fpr[n]) 274 | unblock70 = False 275 | if(i >= 0.8 and unblock80): 276 | print '80% TPR = ' + "{0:.3f}".format(mean_fpr[n]) 277 | unblock80 = False 278 | if(i >= 0.9 and unblock90): 279 | print '90% TPR = ' + "{0:.3f}".format(mean_fpr[n]) 280 | unblock90 = False 281 | if(i >= 0.95 and unblock95): 282 | print '95% TPR = ' + "{0:.3f}".format(mean_fpr[n]) 283 | unblock95 = False 284 | 285 | #Figure properties 286 | fig = plt.figure() 287 | ax1 = fig.add_subplot(111) 288 | 289 | std_auc = np.std(aucs) 290 | 291 | np.save('xgBoost/' + feature_set + "/ROC_" + clf_name + "_" + cfg[1] + "_Sensitivity", np.array(mean_tpr)) 292 | np.save('xgBoost/' + feature_set + "/ROC_" + clf_name + "_" + cfg[1] + "_Specificity", np.array(mean_fpr)) 293 | plt.plot(mean_fpr, mean_tpr, color='b', label=r'Mean ROC (AUC = %0.2f $\pm$ %0.3f)' % (mean_auc, std_auc), lw=2, alpha=.8) 294 | 295 | #Compute Standard Deviation between folds 296 | std_tpr = np.std(tprs, axis=0) 297 | tprs_upper = np.minimum(mean_tpr + std_tpr, 1) 298 | tprs_lower = np.maximum(mean_tpr - std_tpr, 0) 299 | plt.fill_between(mean_fpr, tprs_lower, tprs_upper, color='grey', alpha=.3, label=r'$\pm$ ROC Std. Dev.') 300 | 301 | 302 | 303 | ax1.plot([0, 1], [0, 1], 'k--', lw=2, color='orange', label = 'Random Guess') 304 | ax1.grid(color='black', linestyle='dotted') 305 | 306 | plt.title('Receiver Operating Characteristic (ROC)') 307 | plt.xlabel('False Positive Rate', fontsize='x-large') 308 | plt.ylabel('True Positive Rate', fontsize='x-large') 309 | plt.legend(loc='lower right', fontsize='large') 310 | 311 | plt.setp(ax1.get_xticklabels(), fontsize=14) 312 | plt.setp(ax1.get_yticklabels(), fontsize=14) 313 | 314 | fig.savefig('xgBoost/' + feature_set + "/ROC_" + clf_name + "_" + cfg[1] + ".pdf") # save the figure to file 315 | plt.close(fig) 316 | 317 | #Compute mean importance of feature accross CV folds 318 | bin_number = list(range(len(train_x[0]))) 319 | mean_importances = [] 320 | for n in range(0,len(importances[0])): 321 | mean_imp = (importances[0][n] + importances[1][n] + importances[2][n] + importances[3][n] + importances[4][n] + importances[5][n] + importances[6][n] + importances[7][n] + importances[8][n] + importances[9][n])/10.0 322 | mean_importances.append(mean_imp) 323 | #print mean_importances 324 | f_imp = zip(bin_number,mean_importances,features_id) 325 | f_imp.sort(key = lambda t: t[1], reverse=True) 326 | 327 | np.save('xgBoost/' + feature_set + "/FeatureImportance_" + clf_name + "_" + cfg[1], np.array(f_imp)) 328 | 329 | 330 | 331 | def runClassification_adhocCV(data_folder,feature_set, cfg,classifier): 332 | print "Gather dataset" 333 | dataset_fraction = 1.0 334 | train_x, train_y, features_id = gatherAllData(data_folder, cfg, dataset_fraction) 335 | 336 | model = classifier[0] 337 | clf_name = classifier[1] 338 | 339 | #Report Cross-Validation Accuracy 340 | #scores = cross_val_score(model, np.asarray(train_x), np.asarray(train_y), cv=10) 341 | print clf_name 342 | #print "Avg. 
Accuracy: " + str(sum(scores)/float(len(scores))) 343 | 344 | cv = KFold(n_splits=10) 345 | tprs = [] 346 | aucs = [] 347 | mean_fpr = np.linspace(0, 1, 100) 348 | train_times = [] 349 | test_times = [] 350 | importances = [] 351 | 352 | #Split the data in k-folds, perform classification, and report ROC 353 | 354 | for i in range(0,10): 355 | X_train, X_test, y_train, y_test = train_test_split(train_x, train_y, test_size=0.1) 356 | start_train = time.time() 357 | model = model.fit(np.asarray(X_train), np.asarray(y_train)) 358 | end_train = time.time() 359 | train_times.append(end_train - start_train) 360 | 361 | start_test = time.time() 362 | probas_ = model.predict_proba(np.asarray(X_test)) 363 | end_test = time.time() 364 | test_times.append(end_test - start_test) 365 | 366 | # Compute ROC curve and area under the curve 367 | fpr, tpr, thresholds = roc_curve(np.asarray(y_test), probas_[:, 1], pos_label=1) 368 | #print "Accuracy " + str(accuracy_score(np.asarray(train_y)[test], probas_[:, 1])) 369 | tprs.append(interp(mean_fpr, fpr, tpr)) 370 | tprs[-1][0] = 0.0 371 | roc_auc = auc(fpr, tpr) 372 | aucs.append(roc_auc) 373 | 374 | #Check feature importance in this fold 375 | f_imp = model.feature_importances_ 376 | importances.append(f_imp) 377 | 378 | 379 | plt.plot([0, 1], [0, 1], linestyle='--', lw=2, color='r', label='Random Guess', alpha=.8) 380 | 381 | 382 | mean_tpr = np.mean(tprs, axis=0) 383 | mean_tpr[-1] = 1.0 384 | mean_auc = auc(mean_fpr, mean_tpr) 385 | print "Model AUC: " + "{0:.3f}".format(mean_auc) 386 | print "Training time (Avg. fold): " + str(np.mean(train_times, axis=0)) 387 | print "Test time (Avg. fold): " + str(np.mean(test_times, axis=0)) 388 | 389 | 390 | unblock70 = True 391 | unblock80 = True 392 | unblock90 = True 393 | unblock95 = True 394 | for n, i in enumerate(mean_tpr): 395 | if(i >= 0.7 and unblock70): 396 | print '70% TPR = ' + "{0:.3f}".format(mean_fpr[n]) 397 | unblock70 = False 398 | if(i >= 0.8 and unblock80): 399 | print '80% TPR = ' + "{0:.3f}".format(mean_fpr[n]) 400 | unblock80 = False 401 | if(i >= 0.9 and unblock90): 402 | print '90% TPR = ' + "{0:.3f}".format(mean_fpr[n]) 403 | unblock90 = False 404 | if(i >= 0.95 and unblock95): 405 | print '95% TPR = ' + "{0:.3f}".format(mean_fpr[n]) 406 | unblock95 = False 407 | 408 | #Figure properties 409 | fig = plt.figure() 410 | ax1 = fig.add_subplot(111) 411 | 412 | std_auc = np.std(aucs) 413 | 414 | #np.save('xgBoost/' + feature_set + "/ROC_" + clf_name + "_" + cfg[1] + "_Sensitivity", np.array(mean_tpr)) 415 | #np.save('xgBoost/' + feature_set + "/ROC_" + clf_name + "_" + cfg[1] + "_Specificity", np.array(mean_fpr)) 416 | plt.plot(mean_fpr, mean_tpr, color='b', label=r'Mean ROC (AUC = %0.2f $\pm$ %0.3f)' % (mean_auc, std_auc), lw=2, alpha=.8) 417 | 418 | #Compute Standard Deviation between folds 419 | std_tpr = np.std(tprs, axis=0) 420 | tprs_upper = np.minimum(mean_tpr + std_tpr, 1) 421 | tprs_lower = np.maximum(mean_tpr - std_tpr, 0) 422 | plt.fill_between(mean_fpr, tprs_lower, tprs_upper, color='grey', alpha=.3, label=r'$\pm$ ROC Std. 
Dev.') 423 | 424 | 425 | 426 | ax1.plot([0, 1], [0, 1], 'k--', lw=2, color='orange', label = 'Random Guess') 427 | ax1.grid(color='black', linestyle='dotted') 428 | 429 | plt.title('Receiver Operating Characteristic (ROC)') 430 | plt.xlabel('False Positive Rate', fontsize='x-large') 431 | plt.ylabel('True Positive Rate', fontsize='x-large') 432 | plt.legend(loc='lower right', fontsize='large') 433 | 434 | plt.setp(ax1.get_xticklabels(), fontsize=14) 435 | plt.setp(ax1.get_yticklabels(), fontsize=14) 436 | 437 | #fig.savefig('xgBoost/' + feature_set + "/ROC_" + clf_name + "_" + cfg[1] + ".pdf") # save the figure to file 438 | plt.close(fig) 439 | 440 | #Compute mean importance of feature accross CV folds 441 | bin_number = list(range(len(train_x[0]))) 442 | mean_importances = [] 443 | for n in range(0,len(importances[0])): 444 | mean_imp = (importances[0][n] + importances[1][n] + importances[2][n] + importances[3][n] + importances[4][n] + importances[5][n] + importances[6][n] + importances[7][n] + importances[8][n] + importances[9][n])/10.0 445 | mean_importances.append(mean_imp) 446 | #print mean_importances 447 | f_imp = zip(bin_number,mean_importances,features_id) 448 | f_imp.sort(key = lambda t: t[1], reverse=True) 449 | 450 | #np.save('xgBoost/' + feature_set + "/FeatureImportance_" + clf_name + "_" + cfg[1], np.array(f_imp)) 451 | 452 | #for f in f_imp[:20]: 453 | # print "Importance: %f, Feature: %s" % (f[1], f[2]) 454 | 455 | if __name__ == "__main__": 456 | cfgs = [ 457 | ["RegularTraffic_Christmas", 458 | "FacetTraffic_12.5_Christmas"], 459 | ["RegularTraffic_Christmas", 460 | "FacetTraffic_25_Christmas"], 461 | ["RegularTraffic_Christmas", 462 | "FacetTraffic_50_Christmas"]] 463 | 464 | if not os.path.exists('xgBoost'): 465 | os.makedirs('xgBoost') 466 | 467 | 468 | classifiers = [ 469 | [DecisionTreeClassifier(), "DecisionTree"], 470 | [RandomForestClassifier(n_estimators=100, max_features='auto',n_jobs=1), "RandomForest"], 471 | [XGBClassifier(),"XGBoost"] 472 | ] 473 | 474 | 475 | feature_set = 'Stats_60' #'Stats_60' / 'PL_60' 476 | data_folder = 'FeatureSets/' + feature_set + '/' 477 | if not os.path.exists('xgBoost/' + feature_set): 478 | os.makedirs('xgBoost/' + feature_set) 479 | 480 | print "\n=================================================" 481 | print "One-class SVM - Summary Statistic Features - Set1" 482 | print "=================================================" 483 | for cfg in cfgs: 484 | for classifier in classifiers: 485 | print "Running classifiers for " + cfg[0] + " and " + cfg[1] 486 | runClassification_CV(data_folder, feature_set, cfg, classifier) 487 | print "#####################################\n" 488 | 489 | 490 | feature_set = 'PL_60' #'Stats_60' / 'PL_60' 491 | data_folder = 'FeatureSets/' + feature_set + '/' 492 | if not os.path.exists('xgBoost/' + feature_set): 493 | os.makedirs('xgBoost/' + feature_set) 494 | 495 | print "\n=================================================" 496 | print "One-class SVM - Packet Length Features - Set2" 497 | print "=================================================" 498 | for cfg in cfgs: 499 | for classifier in classifiers: 500 | print "Running classifiers for " + cfg[0] + " and " + cfg[1] 501 | runClassification_CV(data_folder, feature_set, cfg, classifier) 502 | -------------------------------------------------------------------------------- /FacetAnalysis/autoencoder.py: -------------------------------------------------------------------------------- 1 | import socket 2 | import dpkt 3 | import os 4 | import 
tensorflow as tf 5 | import csv 6 | import numpy as np 7 | import random 8 | import math 9 | from sklearn.metrics import roc_curve, auc 10 | from matplotlib import pyplot as plt 11 | from sklearn import preprocessing 12 | import time 13 | 14 | from copy import deepcopy 15 | from scipy import interp 16 | 17 | np.random.seed(1) 18 | graph_level_seed = 1 19 | operation_level_seed = 1 20 | tf.set_random_seed(graph_level_seed) 21 | random.seed(1) 22 | 23 | plt.rcParams['font.family'] = 'Helvetica' 24 | 25 | def gatherDataset_january(data_folder, cfg, SPLIT_FACTOR): 26 | random.seed(1) 27 | #Load Datasets 28 | f = open(data_folder + cfg[0] + "_dataset.csv", 'r') 29 | reader = csv.reader(f, delimiter=',') 30 | reg = list(reader) 31 | 32 | f = open(data_folder + cfg[1] + "_dataset.csv", 'r') 33 | reader = csv.reader(f, delimiter=',') 34 | fac = list(reader) 35 | print "###########################################" 36 | print "Configuration " + cfg[1] 37 | print "###########################################" 38 | 39 | #Convert data to floats (and labels to integers) 40 | reg_data = [] 41 | for i in reg[1:]: 42 | int_array = [] 43 | for pl in i[:-1]: 44 | int_array.append(float(pl)) 45 | int_array.append(1) 46 | reg_data.append(int_array) 47 | 48 | fac_data = [] 49 | for i in fac[1:]: 50 | int_array = [] 51 | for pl in i[:-1]: 52 | int_array.append(float(pl)) 53 | int_array.append(0) 54 | fac_data.append(int_array) 55 | 56 | 57 | #Shuffle both datasets 58 | shuffled_reg_data = random.sample(reg_data, len(reg_data)) 59 | shuffled_fac_data = random.sample(fac_data, len(fac_data)) 60 | 61 | #Build label tensors 62 | reg_labels = [] 63 | for i in shuffled_reg_data: 64 | reg_labels.append(int(i[len(reg_data[0])-1])) 65 | 66 | fac_labels = [] 67 | for i in shuffled_fac_data: 68 | fac_labels.append(int(i[len(reg_data[0])-1])) 69 | 70 | #Take label out of data tensors 71 | for i in range(0, len(shuffled_reg_data)): 72 | shuffled_reg_data[i].pop() 73 | 74 | for i in range(0, len(shuffled_fac_data)): 75 | shuffled_fac_data[i].pop() 76 | 77 | 78 | #Build training and testing datasets 79 | #Split each class data in the appropriate proportion for training 80 | reg_proportion_index = int(len(reg_labels)* SPLIT_FACTOR) 81 | reg_train_x = shuffled_reg_data[:reg_proportion_index] 82 | reg_train_y = reg_labels[:reg_proportion_index] 83 | 84 | fac_proportion_index = int(len(fac_labels)*SPLIT_FACTOR) 85 | fac_train_x = shuffled_fac_data[:fac_proportion_index] 86 | fac_train_y = fac_labels[:fac_proportion_index] 87 | 88 | #Create training sets by simply using normal samples 89 | train_x = reg_train_x #+ fac_train_x 90 | train_y = reg_train_y #+ fac_train_y 91 | 92 | #Make the split for the testing data 93 | reg_test_x = shuffled_reg_data[reg_proportion_index:] 94 | reg_test_y = reg_labels[reg_proportion_index:] 95 | 96 | fac_test_x = shuffled_fac_data[fac_proportion_index:] 97 | fac_test_y = fac_labels[fac_proportion_index:] 98 | 99 | #Create testing set by combining the holdout samples 100 | test_x = reg_test_x + fac_test_x 101 | test_y = reg_test_y + fac_test_y 102 | 103 | return train_x, train_y, test_x, test_y, len(reg_data[0]) 104 | 105 | def gatherDataset_10times(data_folder, cfg, split_factor): 106 | random.seed(1) 107 | SPLIT_FACTOR = split_factor 108 | #Load Datasets 109 | f = open(data_folder + cfg[0] + "_dataset.csv", 'r') 110 | reader = csv.reader(f, delimiter=',') 111 | reg = list(reader) 112 | 113 | f = open(data_folder + cfg[1] + "_dataset.csv", 'r') 114 | reader = csv.reader(f, delimiter=',') 115 | fac = 
list(reader) 116 | print "###########################################" 117 | print "Configuration " + cfg[1] 118 | print "###########################################" 119 | 120 | 121 | #Convert data to floats (and labels to integers) 122 | reg_data = [] 123 | for i in reg[1:]: 124 | int_array = [] 125 | for pl in i[:-1]: 126 | int_array.append(float(pl)) 127 | int_array.append(0) #0, inliers 128 | reg_data.append(int_array) 129 | 130 | fac_data = [] 131 | for i in fac[1:]: 132 | int_array = [] 133 | for pl in i[:-1]: 134 | int_array.append(float(pl)) 135 | int_array.append(1) #1, outliers 136 | fac_data.append(int_array) 137 | 138 | train_x_t = [] 139 | train_y_t = [] 140 | test_x_t = [] 141 | test_y_t = [] 142 | 143 | for k in range(0,10): 144 | reg_data2 = deepcopy(reg_data) 145 | fac_data2 = deepcopy(fac_data) 146 | 147 | 148 | #Shuffle both datasets 149 | shuffled_reg_data = random.sample(reg_data2, len(reg_data2)) 150 | shuffled_fac_data = random.sample(fac_data2, len(fac_data2)) 151 | 152 | #Build label tensors 153 | reg_labels = [] 154 | for i in shuffled_reg_data: 155 | reg_labels.append(int(i[len(reg_data2[0])-1])) 156 | 157 | fac_labels = [] 158 | for i in shuffled_fac_data: 159 | fac_labels.append(int(i[len(reg_data2[0])-1])) 160 | 161 | #Take label out of data tensors 162 | for i in range(0, len(shuffled_reg_data)): 163 | shuffled_reg_data[i].pop() 164 | 165 | for i in range(0, len(shuffled_fac_data)): 166 | shuffled_fac_data[i].pop() 167 | 168 | 169 | #Build training and testing datasets 170 | #Split each class data in the appropriate proportion for training 171 | reg_proportion_index = int(len(reg_labels)* SPLIT_FACTOR) 172 | reg_train_x = shuffled_reg_data[:reg_proportion_index] 173 | reg_train_y = reg_labels[:reg_proportion_index] 174 | 175 | fac_proportion_index = int(len(fac_labels)*SPLIT_FACTOR) 176 | fac_train_x = shuffled_fac_data[:fac_proportion_index] 177 | fac_train_y = fac_labels[:fac_proportion_index] 178 | 179 | #Create training sets by combining the randomly selected samples from each class 180 | train_x = reg_train_x 181 | train_y = reg_train_y 182 | 183 | #Make the split for the testing data 184 | reg_test_x = shuffled_reg_data[reg_proportion_index:] 185 | reg_test_y = reg_labels[reg_proportion_index:] 186 | fac_test_x = shuffled_fac_data[fac_proportion_index:] 187 | fac_test_y = fac_labels[fac_proportion_index:] 188 | 189 | #Create testing set by combining the holdout samples 190 | test_x = reg_test_x + fac_test_x 191 | test_y = reg_test_y + fac_test_y 192 | 193 | train_x_t.append(train_x) 194 | train_y_t.append(train_y) 195 | test_x_t.append(test_x) 196 | test_y_t.append(test_y) 197 | 198 | return train_x_t, train_y_t, test_x_t, test_y_t, len(reg_data2[0]) 199 | 200 | class Encoder(object): 201 | def __init__(self, inp, n_features, n_hidden, drop_input, drop_hidden, repr_size): 202 | # inp is the placeholder for the input, n_features is the number of features our data has (21 in this example) 203 | # n_hidden is the size of the first hidden layer and repr_size is the dimensionality of the representation 204 | self.inp = inp 205 | self.n_features = n_features 206 | self.n_hidden = n_hidden 207 | self.W1 = tf.Variable(tf.random_normal([n_features, self.n_hidden], stddev=0.35)) 208 | self.W2 = tf.Variable(tf.random_normal([self.n_hidden, repr_size], stddev=0.35)) 209 | 210 | 211 | self.b1 = tf.Variable(tf.random_normal([self.n_hidden], stddev=0.35)) 212 | self.b2 = tf.Variable(tf.random_normal([repr_size], stddev=0.35)) 213 | 214 | self.layer_0 = 
tf.nn.dropout(self.inp, drop_input) 215 | self.layer_1 = tf.nn.relu(tf.matmul(self.layer_0, self.W1) + self.b1) 216 | self.layer_1 = tf.nn.dropout(self.layer_1, drop_hidden) 217 | self.encoder_out = tf.matmul(self.layer_1, self.W2) + self.b2 218 | 219 | 220 | class Decoder(object): 221 | def __init__(self, inp, n_features, n_hidden, drop_input, drop_hidden, repr_size): 222 | self.inp = inp 223 | self.n_hidden = n_hidden 224 | self.W1 = tf.Variable(tf.random_normal([repr_size, self.n_hidden], stddev=0.35)) 225 | self.W2 = tf.Variable(tf.random_normal([self.n_hidden, n_features], stddev=0.35)) 226 | self.b1 = tf.Variable(tf.random_normal([self.n_hidden], stddev=0.35)) 227 | self.b2 = tf.Variable(tf.random_normal([n_features], stddev=0.35)) 228 | 229 | self.layer_0 = tf.nn.dropout(self.inp, drop_input) 230 | self.layer_1 = tf.nn.relu(tf.matmul(self.layer_0, self.W1) + self.b1) 231 | self.layer_1 = tf.nn.dropout(self.layer_1, drop_hidden) 232 | self.decoder_out = tf.matmul(self.layer_1, self.W2) + self.b2 233 | 234 | class Autoencoder(object): 235 | def __init__(self, n_features, batch_size, n_hidden, drop_input, drop_hidden, repr_size, learning_rate): 236 | # n_features is the number of features our data has (21 in this example) 237 | # repr_size the dimensionality of our representation 238 | # n_hidden_1 is the size of the layers closest to the in and output 239 | # n_hidden_2 is the size of the layers closest to the embedding layer 240 | # batch_size number of samples to run per batch 241 | 242 | self.n_features = n_features 243 | self.batch_size = batch_size 244 | self.n_hidden = n_hidden 245 | self.drop_input = drop_input 246 | self.hidden = drop_hidden 247 | self.repr_size = repr_size 248 | 249 | # Start session, placeholder has None in shape for batches 250 | self.sess = tf.Session() 251 | self.inp = tf.placeholder(tf.float32, [None, n_features]) 252 | 253 | # Make the encoder and the decoder 254 | self.encoder = Encoder(self.inp, n_features, n_hidden, drop_input, drop_hidden, repr_size) 255 | self.decoder = Decoder(self.encoder.encoder_out, n_features, n_hidden, drop_input, drop_hidden, repr_size) 256 | 257 | # Loss function mean squared error and AdamOptimizer 258 | self.loss = tf.reduce_mean(tf.square(self.decoder.decoder_out - self.inp), -1) 259 | self.mean_loss = tf.reduce_mean(self.loss) 260 | self.optimizer = tf.train.AdamOptimizer(learning_rate) 261 | self.train_op = self.optimizer.minimize(self.mean_loss) 262 | 263 | # Initialize all variables 264 | self.sess.run(tf.global_variables_initializer()) 265 | 266 | def run_epoch(self, data_list): 267 | # Train once over the passed data_list and return the mean reconstruction loss after the epoch 268 | for index in range(len(data_list) // self.batch_size): 269 | self.sess.run(self.train_op, feed_dict={self.inp: data_list[index * self.batch_size : (index+1) * self.batch_size]}) 270 | return self.sess.run(self.mean_loss, feed_dict={self.inp: data_list}) 271 | 272 | def representations(self, data_list): 273 | # Return a list of representations for the given list of samples 274 | return self.sess.run(self.encoder.encoder_out, feed_dict={self.inp: data_list}) 275 | 276 | def reconstruction_errors(self, data_list): 277 | # Get mean squared reconstruction errors of passed data_list 278 | return self.sess.run(self.loss, feed_dict={self.inp: data_list}) 279 | 280 | 281 | def runANN(data_folder,cfg): 282 | epochs = 1000 283 | #Gather the dataset 284 | #train_x, train_y are just regular samples 285 | train_x, train_y, test_x, test_y, num_input = 
gatherDataset_january(data_folder, cfg, 0.7) 286 | 287 | #std_scale = preprocessing.StandardScaler().fit(train_x) 288 | #train_x = std_scale.transform(train_x) 289 | #test_x = std_scale.transform(test_x) 290 | 291 | #n_features, batch_size, n_hidden, drop_input, drop_hidden, repr_size 292 | ae = Autoencoder(num_input, 128, 128, 0.8, 0.5, 32) 293 | for i in range(epochs): 294 | if(i%50==0): 295 | print "Epoch: " + str(i) 296 | ae.run_epoch(train_x) 297 | 298 | """ 299 | #Show compressed representation of samples (valid for repr_size=2,3) 300 | anomaly_repr = ae.representations(test_x[len(test_x)/2:]) 301 | normal_repr = ae.representations(test_x[:len(test_x)/2]) 302 | anom_x, anom_y = zip(*anomaly_repr) 303 | norm_x, norm_y = zip(*normal_repr) 304 | plt.scatter(anom_x, anom_y, color='red', alpha=0.7) 305 | plt.scatter(norm_x, norm_y, alpha=0.7) 306 | plt.show() 307 | """ 308 | 309 | #Reconstruct samples 310 | anomaly_errors = ae.reconstruction_errors(test_x[len(test_x)/2:]) 311 | normal_val_errors = ae.reconstruction_errors(test_x[:len(test_x)/2]) 312 | 313 | roc_y = [1 for _ in range(len(anomaly_errors))] + [0 for _ in range(len(normal_val_errors))] 314 | roc_score = np.concatenate([anomaly_errors, normal_val_errors]) 315 | 316 | 317 | # Compute ROC curve and ROC area for each class 318 | #number of thresholds = number of data samples - default drop_intermediate 319 | # does not show some low performing configs for creating smoother ROCs 320 | 321 | fpr, tpr, thresholds = roc_curve(roc_y, roc_score, drop_intermediate=True) 322 | roc_auc = auc(fpr, tpr) 323 | 324 | 325 | plt.figure() 326 | lw = 2 327 | plt.plot(fpr, tpr, color='darkorange',lw=lw, label='ROC curve (area = %0.2f)' % roc_auc) 328 | plt.plot([0, 1], [0, 1], color='navy', lw=lw, linestyle='--') 329 | plt.xlim([0.0, 1.0]) 330 | plt.ylim([0.0, 1.05]) 331 | plt.xlabel('False Positive Rate') 332 | plt.ylabel('True Positive Rate') 333 | plt.title('Receiver operating characteristic example') 334 | plt.legend(loc="lower right") 335 | plt.show() 336 | 337 | 338 | def runANNSearch(data_folder,cfg): 339 | epochs = 100 340 | #Gather the dataset 341 | #train_x, train_y are just regular samples 342 | train_x_t, train_y_t, test_x_t, test_y_t, num_input = gatherDataset_10times(data_folder, cfg, 0.9) 343 | 344 | #std_scale = preprocessing.StandardScaler().fit(train_x) 345 | #train_x = std_scale.transform(train_x) 346 | #test_x = std_scale.transform(test_x) 347 | 348 | max_auc = 0 349 | max_batch_size = 0 350 | max_hidden = 0 351 | max_repr_size = 0 352 | 353 | auc_report = [] 354 | n_hidden_report = [] 355 | repr_size_report = [] 356 | batch_sizes_report = [] 357 | 358 | best_config = [] 359 | max_auc = 0 360 | 361 | learning_rates = [0.001] # [0.01, 0.001] # default is 0.001 362 | batch_sizes = [32]#[8, 16, 32, 64, 128, 256] 363 | n_hiddens = [8, 16, 32, 64, 128, 256]#np.logspace(2, 10, base=2, num=12) 364 | #drop_inputs = [0.7, 0.75, 0.8, 0.85, 0.9, 0.95, 1] 365 | #drop_hiddens = [0.5, 0.6, 0.7, 0.8, 0.9, 1] 366 | repr_sizes = [4, 8, 16, 32, 64, 128, 256] #np.logspace(2, 10, base=2, num=12) #num 20 367 | 368 | for learning_rate in learning_rates: 369 | for batch_size in batch_sizes: 370 | for n_hidden in n_hiddens: 371 | for repr_size in repr_sizes: 372 | if(repr_size <= n_hidden): 373 | #start = time.time() 374 | np.random.seed(1) 375 | graph_level_seed = 1 376 | operation_level_seed = 1 377 | tf.set_random_seed(graph_level_seed) 378 | random.seed(1) 379 | 380 | step_auc = [] 381 | mean_fpr = np.linspace(0, 1, 100) 382 | tprs = [] 383 | for 
n in range(0,10): 384 | #n_features, batch_size, n_hidden, drop_input, drop_hidden, repr_size 385 | ae = Autoencoder(num_input, batch_size, int(n_hidden), 1, 1, int(repr_size), learning_rate) 386 | 387 | train_x = train_x_t[n] 388 | train_y = train_y_t[n] 389 | test_x = test_x_t[n] 390 | test_y = test_y_t[n] 391 | 392 | for i in range(epochs): 393 | ae.run_epoch(train_x) 394 | 395 | #Reconstruct samples 396 | anomaly_errors = ae.reconstruction_errors(test_x[len(test_x)/2:]) 397 | normal_val_errors = ae.reconstruction_errors(test_x[:len(test_x)/2]) 398 | 399 | roc_y = [1 for _ in range(len(anomaly_errors))] + [0 for _ in range(len(normal_val_errors))] 400 | roc_score = np.concatenate([anomaly_errors, normal_val_errors]) 401 | 402 | 403 | # Compute ROC curve and ROC area for each class 404 | fpr, tpr, thresholds = roc_curve(roc_y, roc_score, drop_intermediate=True) 405 | tprs.append(interp(mean_fpr, fpr, tpr)) 406 | tprs[-1][0] = 0.0 407 | roc_auc = auc(fpr, tpr) 408 | #print "Fold %i auc: %f" % (n, roc_auc) 409 | step_auc.append(roc_auc) 410 | 411 | avg_auc = sum(step_auc)/float(len(step_auc)) 412 | 413 | auc_report.append(avg_auc) 414 | """ 415 | n_hidden_report.append(int(n_hidden)) 416 | repr_size_report.append(int(repr_size)) 417 | batch_sizes_report.append(batch_size) 418 | """ 419 | mean_tpr = np.mean(tprs, axis=0) 420 | mean_tpr[-1] = 1.0 421 | mean_auc = auc(mean_fpr, mean_tpr) 422 | 423 | if(mean_auc > max_auc): 424 | max_auc = mean_auc 425 | best_config = [mean_fpr, mean_tpr, n_hidden, repr_size] 426 | 427 | #end = time.time() 428 | #print(end - start) 429 | print ("%f - Batch Size:%i, Learning Rate:%f, n_hidden:%i, repr_size:%i" % (avg_auc, batch_size, learning_rate, int(n_hidden), int(repr_size))) 430 | 431 | 432 | fig = plt.figure() 433 | ax1 = fig.add_subplot(111) 434 | plt.xlim([0, 1]) 435 | plt.ylim([0, 1]) 436 | plt.xlabel('False Positive Rate', fontsize=26) 437 | plt.ylabel('True Positive Rate', fontsize=26) 438 | 439 | ax1.plot([0, 1], [0, 1], 'k--', lw=2, color='orange', label = 'Random Guess') 440 | ax1.grid(color='black', linestyle='dotted') 441 | plt.setp(ax1.get_xticklabels(), fontsize=16) 442 | plt.setp(ax1.get_yticklabels(), fontsize=16) 443 | plt.plot(best_config[0], best_config[1], color='b', label=r'ROC (AUC = %0.2f)' % (max_auc), lw=2, alpha=.8) 444 | plt.legend(loc='lower right', fontsize='x-large') 445 | 446 | fig.savefig('Autoencoder/' + "Facet_Autoencoder_" + cfg[1] + ".pdf") # save the figure to file 447 | plt.close(fig) 448 | 449 | print "################\n# Summary" 450 | print "Max. AUC: %f, N_hidden: %i, Repr_Size: %i" % (max_auc, best_config[2],best_config[3]) 451 | print "Avg. 
AUC %f: " % (np.mean(auc_report,axis=0)) 452 | """ 453 | full_report = zip(auc_report, batch_sizes_report, n_hidden_report, repr_size_report) 454 | full_report.sort(key = lambda t: t[0]) 455 | 456 | f = open(cfg[1] + '_report.txt', 'w') 457 | 458 | for item in full_report: 459 | f.write("%f - Batch Size:%i, n_hidden:%i, repr_size:%i\n" % (item[0], item[1], item[2], item[3])) 460 | np.save(cfg[1] + '_report', np.array(full_report)) 461 | """ 462 | 463 | 464 | if __name__ == "__main__": 465 | 466 | cfgs = [ 467 | ["RegularTraffic_Christmas", 468 | "FacetTraffic_12.5_Christmas"], 469 | ["RegularTraffic_Christmas", 470 | "FacetTraffic_25_Christmas"], 471 | ["RegularTraffic_Christmas", 472 | "FacetTraffic_50_Christmas"]] 473 | 474 | 475 | print "Autoencoder - Packet Length Features - Set2" 476 | feature_set = 'PL_60' 477 | data_folder = 'FeatureSets/' + feature_set + '/' 478 | 479 | for cfg in cfgs: 480 | runANNSearch(data_folder,cfg) 481 | -------------------------------------------------------------------------------- /CovertCastAnalysis/X2_classifier.py: -------------------------------------------------------------------------------- 1 | import dpkt 2 | import os 3 | from matplotlib import pyplot as plt 4 | from matplotlib.pyplot import cm 5 | import numpy as np 6 | import socket 7 | import collections 8 | from itertools import product 9 | from scipy.stats import entropy, chisquare, norm, rv_continuous 10 | import random 11 | 12 | random.seed(a=1) 13 | 14 | auxFolder = 'auxFolder/' 15 | 16 | cfgs = [ 17 | ["YouTube_home_world_live", 18 | "CovertCast_home_world"] 19 | ] 20 | 21 | 22 | BIN_WIDTH = [20] 23 | #BIN_WIDTH = [50] 24 | 25 | def ComputeBiGramDistributions(sampleFolder, cfg, binWidth): 26 | freq_dists = [] 27 | 28 | for mode in cfg: 29 | #Compute frequency distribution for A and B 30 | freq_dist = [] 31 | for sample in os.listdir(sampleFolder + mode): 32 | 33 | f = open(auxFolder + os.path.dirname(sampleFolder) + "/" + mode + "/" + sample + '/bigrams_' + str(binWidth), 'r') 34 | 35 | bin_dict = {} 36 | bigrams=[] 37 | #Generate the set of all possible bigrams 38 | for i in product(range(0,1500, binWidth), repeat=2): 39 | bin_dict[str(i).replace(" ", "")] = 1 40 | 41 | 42 | lines = f.readlines() 43 | for line in lines: 44 | try: 45 | bigrams.append(line.rstrip('\n')) 46 | except IndexError: 47 | break #Reached last index, stop processing 48 | f.close() 49 | 50 | #Account for each bin elem 51 | for i in bigrams: 52 | bin_dict['('+str(i)+')']+=1 53 | 54 | #Order bin_key : num_packets 55 | od_dict = collections.OrderedDict(sorted(bin_dict.items())) 56 | bin_list = [] 57 | for i in od_dict: 58 | bin_list.append(float(od_dict[i])) 59 | 60 | #Build up the list of a distribution samples freq dist 61 | freq_dist.append(bin_list) 62 | #Build up the list of all freq dists for different sample folders 63 | freq_dists.append(freq_dist) 64 | 65 | return freq_dists 66 | 67 | 68 | def computeIntraVariance(freq_dists): 69 | varIntra = np.zeros(len(freq_dists[0][0])) 70 | 71 | for i in range(0, len(freq_dists[0][0])): 72 | somatory = 0 73 | 74 | for m in freq_dists: 75 | term = 0 76 | #Compute total n_grams in model 77 | total_ngrams_model = 0 78 | for v in m: 79 | total_ngrams_model += sum(v) 80 | 81 | #Compute probability of a given n_gram in model 82 | prob_ngram_model = 0 83 | for v in m: 84 | prob_ngram_model += v[i] 85 | prob_ngram_model = prob_ngram_model / float(total_ngrams_model) 86 | 87 | for v in m: 88 | n_gram_prob_v = v[i]/sum(v) 89 | term += (float(n_gram_prob_v) - prob_ngram_model)**2 
90 | 91 | somatory += 1/float(len(m)) * term 92 | 93 | varIntra[i] = 1/2.0 * somatory 94 | 95 | return varIntra 96 | 97 | 98 | def computeInterVariance(freq_dists): 99 | varInter = np.zeros(len(freq_dists[0][0])) 100 | 101 | total_videos = len(freq_dists[0]) + len(freq_dists[1]) 102 | 103 | for i in range(0, len(freq_dists[0][0])): 104 | somatory = 0 105 | 106 | ###For each model 107 | for n, m in enumerate(freq_dists): 108 | #Compute total n_grams in model 109 | total_ngrams_model = 0 110 | for v in m: 111 | total_ngrams_model += sum(v) 112 | 113 | #Compute total n_grams in other model 114 | total_ngrams_other_model = 0 115 | for v in freq_dists[(n+1)%2]: 116 | total_ngrams_other_model += sum(v) 117 | 118 | #Compute probability of a given n_gram in model 119 | prob_ngram_model = 0 120 | for v in m: 121 | prob_ngram_model += v[i] 122 | prob_ngram_model = prob_ngram_model / float(total_ngrams_model) 123 | 124 | #Compute probability of a given n_gram in the other model 125 | prob_ngram_other_model = 0 126 | for v in freq_dists[(n+1)%2]: 127 | prob_ngram_other_model += v[i] 128 | prob_ngram_other_model = prob_ngram_other_model / float(total_ngrams_other_model) 129 | 130 | ###For each video in model 131 | for v in m: 132 | n_gram_prob_v = v[i]/sum(v) 133 | somatory += (float(n_gram_prob_v) - prob_ngram_model)**2 134 | 135 | varInter[i] = 1.0/total_videos * somatory 136 | 137 | return varInter 138 | 139 | 140 | def optimizeBigrams(freq_dists): 141 | 142 | varIntra = computeIntraVariance(freq_dists) 143 | varInter = computeInterVariance(freq_dists) 144 | 145 | DIS = np.zeros(len(varIntra)) 146 | DIS = varInter/varIntra 147 | 148 | indexes_to_remove = [] 149 | 150 | for n, i in enumerate(DIS): 151 | if(i < 1): 152 | indexes_to_remove.append(n) 153 | 154 | return indexes_to_remove 155 | 156 | 157 | def buildModels(freq_dists): 158 | ##################################### 159 | # Build models 160 | ##################################### 161 | model_chat = np.zeros(len(freq_dists[0][0])) 162 | model_censored = np.zeros(len(freq_dists[0][0])) 163 | 164 | total_ngrams_chat_set = 0 165 | for dist in freq_dists[0]: 166 | total_ngrams_chat_set += sum(dist) 167 | 168 | total_ngrams_censored_set = 0 169 | for dist in freq_dists[1]: 170 | total_ngrams_censored_set += sum(dist) 171 | 172 | 173 | for i in range(0, len(model_chat)): 174 | somatory = 0 175 | for v in freq_dists[0]: 176 | n_gram_prob = v[i]/sum(v) 177 | v_total_grams = sum(v) 178 | somatory += (v_total_grams * n_gram_prob) 179 | model_chat[i] = (1/total_ngrams_chat_set) * somatory 180 | 181 | 182 | for i in range(0, len(model_censored)): 183 | somatory = 0 184 | for v in freq_dists[1]: 185 | n_gram_prob = v[i]/float(sum(v)) 186 | v_total_grams = sum(v) 187 | somatory += (v_total_grams * n_gram_prob) 188 | model_censored[i] = (1/float(total_ngrams_censored_set)) * somatory 189 | 190 | return model_chat, model_censored 191 | 192 | #Reproduces Facet Fixed threshold evalution 193 | def Prepare_X_Fixed(fig_folder, cfg,binWidth,freq_dists): 194 | optimization = True 195 | 196 | #Transform original freq_dists to include only the better bi-grams 197 | chat_samples = freq_dists[0] 198 | censored_samples = freq_dists[1] 199 | 200 | filtered_freq_dists = [] 201 | filtered_chat_samples = [] 202 | filtered_censored_samples = [] 203 | 204 | if(optimization): 205 | #Optimize bigram choice, build updated frequency distributions 206 | indexes_to_remove = optimizeBigrams(freq_dists) 207 | 208 | for sample in chat_samples: 209 | 
filtered_chat_samples.append(np.delete(sample, indexes_to_remove)) 210 | 211 | for sample in censored_samples: 212 | filtered_censored_samples.append(np.delete(sample, indexes_to_remove)) 213 | else: 214 | #Ignore optimization procedure, carry on with original frequency distributions 215 | filtered_chat_samples = chat_samples 216 | filtered_censored_samples = censored_samples 217 | 218 | #2x Cross validation 219 | filtered_freq_dists1 = [] 220 | filtered_freq_dists2 = [] 221 | 222 | #To Remove 223 | #x = random.sample(filtered_chat_samples, len(filtered_chat_samples)) 224 | #x2 = random.sample(filtered_censored_samples, len(filtered_censored_samples)) 225 | 226 | filtered_freq_dists1.append(filtered_chat_samples[:len(filtered_chat_samples)/2]) 227 | filtered_freq_dists1.append(filtered_censored_samples[:len(filtered_censored_samples)/2]) 228 | 229 | filtered_freq_dists2.append(filtered_chat_samples[len(filtered_chat_samples)/2:]) 230 | filtered_freq_dists2.append(filtered_censored_samples[len(filtered_censored_samples)/2:]) 231 | 232 | model_chat1, model_censored1 = buildModels(filtered_freq_dists1) 233 | acc1, tnr1, fnr1, tpr1, fpr1, ppv1, npv1 = X_Classify_Fixed(cfg,binWidth,filtered_freq_dists2, model_chat1, model_censored1) 234 | print "1st Fold" 235 | print "Acc = " + str(acc1) 236 | print "TPR = " + str(tpr1) 237 | print "TNR = " + str(tnr1) 238 | print "FPR = " + str(fpr1) 239 | print "FNR = " + str(fnr1) 240 | print "PPV = " + str(ppv1) 241 | print "NPV = " + str(npv1) 242 | 243 | model_chat2, model_censored2 = buildModels(filtered_freq_dists2) 244 | acc2, tnr2, fnr2, tpr2, fpr2, ppv2, npv2 = X_Classify_Fixed(cfg,binWidth,filtered_freq_dists1, model_chat2, model_censored2) 245 | print "\n2nd Fold" 246 | print "Acc = " + str(acc2) 247 | print "TPR = " + str(tpr2) 248 | print "TNR = " + str(tnr2) 249 | print "FPR = " + str(fpr2) 250 | print "FNR = " + str(fnr2) 251 | print "PPV = " + str(ppv2) 252 | print "NPV = " + str(npv2) 253 | 254 | print "\n###################" 255 | print "Average" 256 | print "Acc = " + str((acc1 + acc2)/2.0) 257 | print "TPR = " + str((tpr1 + tpr2)/2.0) 258 | print "TNR = " + str((tnr1 + tnr2)/2.0) 259 | print "FPR = " + str((fpr1 + fpr2)/2.0) 260 | print "FNR = " + str((fnr1 + fnr2)/2.0) 261 | print "PPV = " + str((ppv1 + ppv2)/2.0) 262 | print "NPV = " + str((npv1 + npv2)/2.0) 263 | 264 | 265 | ###################################################################################### 266 | def X_Classify_Fixed(cfg, binWidth, freq_dists, model_chat, model_censored): 267 | ########################## 268 | #Classify samples 269 | ########################## 270 | FPositives = 0 271 | FNegatives = 0 272 | TPositives = 0 273 | TNegatives = 0 274 | 275 | #True negative is being classified as facet when it is facet 276 | for v in freq_dists[0]: 277 | chat_score = chisquare(v, model_chat) 278 | censored_score = chisquare(v, model_censored) 279 | 280 | if(chat_score < censored_score): 281 | TPositives += 1 282 | elif(censored_score < chat_score): 283 | FNegatives += 1 284 | 285 | for v in freq_dists[1]: 286 | chat_score = chisquare(v, model_chat) 287 | censored_score = chisquare(v, model_censored) 288 | 289 | if(censored_score < chat_score): 290 | TNegatives += 1 291 | elif(chat_score < censored_score): 292 | FPositives += 1 293 | 294 | 295 | accuracy = (TPositives + TNegatives)/float(len(freq_dists[0]) + len(freq_dists[1])) 296 | TNR = TNegatives/(TNegatives+float(FPositives)) 297 | FNR = FNegatives/(TPositives+float(FNegatives)) 298 | TPR = 
TPositives/(TPositives+float(FNegatives)) 299 | FPR = FPositives/(FPositives+float(TNegatives)) 300 | PPV = TPositives/(TPositives+float(FPositives)) 301 | NPV = TNegatives/(TNegatives+float(FNegatives)) 302 | 303 | return accuracy, TNR, FNR, TPR, FPR, PPV, NPV 304 | 305 | 306 | #Reproduces Facet Changing deltas evaluation 307 | def Prepare_X_RatioReproduction(fig_folder, cfg,binWidth,freq_dists): 308 | optimization = True 309 | 310 | 311 | #Transform original freq_dists to include only the better bi-grams 312 | chat_samples = freq_dists[0] 313 | censored_samples = freq_dists[1] 314 | 315 | filtered_freq_dists = [] 316 | filtered_chat_samples = [] 317 | filtered_censored_samples = [] 318 | 319 | if(optimization): 320 | #Optimize bigram choice, build updated frequency distributions 321 | indexes_to_remove = optimizeBigrams(freq_dists) 322 | 323 | for sample in chat_samples: 324 | filtered_chat_samples.append(np.delete(sample, indexes_to_remove)) 325 | 326 | for sample in censored_samples: 327 | filtered_censored_samples.append(np.delete(sample, indexes_to_remove)) 328 | else: 329 | #Ignore optimization procedure, carry on with original frequency distributions 330 | filtered_chat_samples = chat_samples 331 | filtered_censored_samples = censored_samples 332 | 333 | #2x Cross validation 334 | filtered_freq_dists1 = [] 335 | filtered_freq_dists2 = [] 336 | 337 | #To remove 338 | #x = random.sample(filtered_chat_samples, len(filtered_chat_samples)) 339 | #x2 = random.sample(filtered_censored_samples, len(filtered_censored_samples)) 340 | 341 | filtered_freq_dists1.append(filtered_chat_samples[:len(filtered_chat_samples)/2]) 342 | filtered_freq_dists1.append(filtered_censored_samples[:len(filtered_censored_samples)/2]) 343 | 344 | filtered_freq_dists2.append(filtered_chat_samples[len(filtered_chat_samples)/2:]) 345 | filtered_freq_dists2.append(filtered_censored_samples[len(filtered_censored_samples)/2:]) 346 | 347 | model_chat1, model_censored1 = buildModels(filtered_freq_dists1) 348 | max_acc, max_delta, max_tpr, max_fpr, val90, val80, val70, specificity, sensitivity = X_Classify_RatioReproduction(cfg,binWidth,filtered_freq_dists2, model_chat1, model_censored1) 349 | print "1st Fold" 350 | print "TPR90 = " + str(val90) 351 | print "TPR80 = " + str(val80) 352 | print "TPR70 = " + str(val70) 353 | print "Max acc: " + str(max_acc) + " Max TPR:" + str(max_tpr) + " Max FPR:" + str(max_fpr) + " delta:" + str(max_delta) 354 | 355 | model_chat2, model_censored2 = buildModels(filtered_freq_dists2) 356 | max_acc2, max_delta2, max_tpr2, max_fpr2, val902, val802, val702, specificity2, sensitivity2 = X_Classify_RatioReproduction(cfg,binWidth,filtered_freq_dists1, model_chat2, model_censored2) 357 | print "2nd Fold" 358 | print "TPR90 = " + str(val902) 359 | print "TPR80 = " + str(val802) 360 | print "TPR70 = " + str(val702) 361 | print "Max acc: " + str(max_acc2) + " Max TPR:" + str(max_tpr2) + " Max FPR:" + str(max_fpr2) + " delta:" + str(max_delta2) 362 | 363 | print "###################" 364 | print "Average FPR" 365 | print "TPR90 = " + str((val902+val90)/2.0) 366 | print "TPR80 = " + str((val802+val80)/2.0) 367 | print "TPR70 = " + str((val702+val70)/2.0) 368 | print "Max acc: " + str((max_acc+max_acc2)/2.0) + " Max TPR:" + str((max_tpr+max_tpr2)/2.0) + " Max FPR:" + str((max_fpr+max_fpr2)/2.0) + " delta:" + str((max_delta + max_delta2)/2.0) 369 | 370 | fig = plt.figure() 371 | ax1 = fig.add_subplot(111) 372 | 373 | 374 | Specificity = (specificity + specificity2)/2.0 375 | Sensitivity = (sensitivity + 
sensitivity2)/2.0 376 | 377 | """ 378 | np.set_printoptions(threshold=np.inf) 379 | print specificity 380 | print specificity2 381 | """ 382 | 383 | #ROC Curve 384 | ax1.plot(1 - specificity, sensitivity, color='red', lw=2, alpha=0.7, label = 'k-Fold ROC') 385 | ax1.plot(1 - specificity2, sensitivity2, color='red', lw=2, alpha=0.7) 386 | ax1.plot(1 - Specificity, Sensitivity, 'k.-', color='black', label = 'Mean ROC') 387 | ax1.plot([0, 1], [0, 1], 'k--', lw=2, color='orange', label = 'Random Guess') 388 | ax1.grid(color='black', linestyle='dotted') 389 | 390 | plt.title('Receiver Operating Characteristic (ROC)') 391 | plt.xlabel('False Positive Rate', fontsize='x-large') 392 | plt.ylabel('True Positive Rate', fontsize='x-large') 393 | plt.legend(loc='lower right', fontsize='large') 394 | 395 | plt.setp(ax1.get_xticklabels(), fontsize=14) 396 | plt.setp(ax1.get_yticklabels(), fontsize=14) 397 | 398 | fig.savefig(fig_folder + "ROC_" + str(optimization) + "_" + cfg[1] + "_" + str(binWidth)+".pdf") # save the figure to file 399 | plt.close(fig) 400 | 401 | def X_Classify_RatioReproduction(cfg, binWidth,freq_dists, model_chat, model_censored): 402 | ########################## 403 | #Classify samples 404 | ########################## 405 | deltas = np.arange(0.001, 5, 0.001) 406 | FalsePositives = [] 407 | FalseNegatives = [] 408 | TruePositives = [] 409 | TrueNegatives = [] 410 | 411 | Sensitivity = [] 412 | Specificity = [] 413 | FalsePositiveRate = [] 414 | FalseNegativeRate =[] 415 | 416 | holding90 = True 417 | holding80 = True 418 | holding70 = True 419 | 420 | thresh90 = 0 421 | thresh80 = 0 422 | thresh70 = 0 423 | 424 | val90 = 0 425 | val80 = 0 426 | val70 = 0 427 | 428 | max_acc = 0 429 | max_delta = 0 430 | max_tpr = 0 431 | max_fpr = 0 432 | 433 | for delta in deltas: 434 | FPositives = 0 435 | FNegatives = 0 436 | TPositives = 0 437 | TNegatives = 0 438 | 439 | chat_ratios = [] 440 | censored_ratios = [] 441 | 442 | #Positive example is chat 443 | #True positive is being classified as facet when it is facet 444 | for v in freq_dists[0]: 445 | chat_score, p_value = chisquare(v, model_chat) 446 | censored_score, p_value2 = chisquare(v, model_censored) 447 | 448 | 449 | ratio = chat_score / float(censored_score) 450 | chat_ratios.append(ratio) 451 | if(ratio < delta): 452 | TNegatives += 1 453 | elif(ratio > delta): 454 | FPositives += 1 455 | 456 | for v in freq_dists[1]: 457 | chat_score, p_value = chisquare(v, model_chat) 458 | censored_score, p_value2 = chisquare(v, model_censored) 459 | 460 | ratio = chat_score / float(censored_score) 461 | censored_ratios.append(ratio) 462 | if(ratio > delta): 463 | TPositives += 1 464 | elif(ratio < delta): 465 | FNegatives += 1 466 | 467 | 468 | accuracy = (TPositives + TNegatives)/float(len(freq_dists[0]) + len(freq_dists[1])) 469 | TNR = TNegatives/(TNegatives+float(FPositives)) 470 | FNR = FNegatives/(TPositives+float(FNegatives)) 471 | TPR = TPositives/(TPositives+float(FNegatives)) 472 | FPR = FPositives/(FPositives+float(TNegatives)) 473 | 474 | if(accuracy > max_acc): 475 | max_acc = accuracy 476 | max_tpr = TPR 477 | max_fpr = FPR 478 | max_delta = delta 479 | 480 | FalsePositives.append(FPositives) 481 | FalseNegatives.append(FNegatives) 482 | TruePositives.append(TPositives) 483 | TrueNegatives.append(TNegatives) 484 | Sensitivity.append(TPositives/(TPositives+float(FNegatives))) 485 | Specificity.append(TNegatives/(TNegatives+float(FPositives))) 486 | FalsePositiveRate.append(FPR) 487 | FalseNegativeRate.append(FNR) 488 | 489 | 
if(holding90): 490 | if(FNR >= 0.1): 491 | holding90 = False 492 | thresh90 = delta 493 | val90 = FPR 494 | 495 | if(holding80): 496 | if(FNR >= 0.2): 497 | holding80 = False 498 | thresh80 = delta 499 | val80 = FPR 500 | 501 | if(holding70): 502 | if(FNR >= 0.3): 503 | holding70 = False 504 | thresh70 = delta 505 | val70 = FPR 506 | 507 | return max_acc, max_delta, max_tpr, max_fpr, val90, val80, val70, np.array(Specificity), np.array(Sensitivity) 508 | 509 | 510 | 511 | 512 | 513 | if __name__ == "__main__": 514 | 515 | sampleFolder = "TrafficCaptures/" 516 | 517 | if not os.path.exists('X2'): 518 | os.makedirs('X2') 519 | if not os.path.exists('X2/' + os.path.dirname(sampleFolder)): 520 | os.makedirs('X2/' + os.path.dirname(sampleFolder)) 521 | 522 | fig_folder = 'X2/' + os.path.dirname(sampleFolder) + '/' 523 | 524 | 525 | print "###########################" 526 | print os.path.dirname(sampleFolder) 527 | print "###########################" 528 | for cfg in cfgs: 529 | random.seed(a=1) # re-seed 530 | print "=====================================" 531 | print "X classifier - " + cfg[0] + " vs " + cfg[1] 532 | for binWidth in BIN_WIDTH: 533 | print "---------------------" 534 | print "Bin Width: " + str(binWidth) 535 | print "---------------------" 536 | #Compute bigram distributions and shuffle the samples 537 | freq_dists = ComputeBiGramDistributions(sampleFolder, cfg, binWidth) 538 | x = random.sample(freq_dists[0], len(freq_dists[0])) 539 | x2 = random.sample(freq_dists[1], len(freq_dists[1])) 540 | freqs = [] 541 | freqs.append(x) 542 | freqs.append(x2) 543 | 544 | #For reproducing results of Facet paper (70%,80%,90% blockage) 545 | #Prepare_X_RatioReproduction(fig_folder, cfg,binWidth, freqs) 546 | 547 | #For getting fixed classification rates to compare with classifiers without a notion of internal thereshold 548 | Prepare_X_Fixed(fig_folder, cfg,binWidth, freqs) 549 | -------------------------------------------------------------------------------- /DeltaShaperAnalysis/X2_classifier.py: -------------------------------------------------------------------------------- 1 | import dpkt 2 | import os 3 | from matplotlib import pyplot as plt 4 | from matplotlib.pyplot import cm 5 | import numpy as np 6 | import socket 7 | import collections 8 | from itertools import product 9 | from scipy.stats import entropy, chisquare, norm, rv_continuous 10 | import random 11 | 12 | 13 | 14 | auxFolder = 'auxFolder/' 15 | 16 | cfgs = [ 17 | ["RegularTraffic", 18 | "DeltaShaperTraffic_320"], 19 | ["RegularTraffic", 20 | "DeltaShaperTraffic_160"]] 21 | 22 | 23 | BIN_WIDTH = [20] 24 | 25 | def ComputeBiGramDistributions(sampleFolder, cfg, binWidth): 26 | freq_dists = [] 27 | 28 | for mode in cfg: 29 | #Compute frequency distribution for A and B 30 | freq_dist = [] 31 | for sample in os.listdir(sampleFolder + mode): 32 | 33 | f = open(auxFolder + os.path.dirname(sampleFolder) + "/" + mode + "/" + sample + '/bigrams_' + str(binWidth), 'r') 34 | 35 | bin_dict = {} 36 | bigrams=[] 37 | #Generate the set of all possible bigrams 38 | for i in product(range(0,1500, binWidth), repeat=2): 39 | bin_dict[str(i).replace(" ", "")] = 1 40 | 41 | 42 | lines = f.readlines() 43 | for line in lines: 44 | try: 45 | bigrams.append(line.rstrip('\n')) 46 | except IndexError: 47 | break #Reached last index, stop processing 48 | f.close() 49 | 50 | #Account for each bin elem 51 | for i in bigrams: 52 | bin_dict['('+str(i)+')']+=1 53 | 54 | #Order bin_key : num_packets 55 | od_dict = 
collections.OrderedDict(sorted(bin_dict.items())) 56 | bin_list = [] 57 | for i in od_dict: 58 | bin_list.append(float(od_dict[i])) 59 | 60 | #Build up the list of a distribution samples freq dist 61 | freq_dist.append(bin_list) 62 | #Build up the list of all freq dists for different sample folders 63 | freq_dists.append(freq_dist) 64 | 65 | return freq_dists 66 | 67 | 68 | def computeIntraVariance(freq_dists): 69 | varIntra = np.zeros(len(freq_dists[0][0])) 70 | 71 | for i in range(0, len(freq_dists[0][0])): 72 | somatory = 0 73 | 74 | for m in freq_dists: 75 | term = 0 76 | #Compute total n_grams in model 77 | total_ngrams_model = 0 78 | for v in m: 79 | total_ngrams_model += sum(v) 80 | 81 | #Compute probability of a given n_gram in model 82 | prob_ngram_model = 0 83 | for v in m: 84 | prob_ngram_model += v[i] 85 | prob_ngram_model = prob_ngram_model / float(total_ngrams_model) 86 | 87 | for v in m: 88 | n_gram_prob_v = v[i]/sum(v) 89 | term += (float(n_gram_prob_v) - prob_ngram_model)**2 90 | 91 | somatory += 1/float(len(m)) * term 92 | 93 | varIntra[i] = 1/2.0 * somatory 94 | 95 | return varIntra 96 | 97 | 98 | def computeInterVariance(freq_dists): 99 | varInter = np.zeros(len(freq_dists[0][0])) 100 | 101 | total_videos = len(freq_dists[0]) + len(freq_dists[1]) 102 | 103 | for i in range(0, len(freq_dists[0][0])): 104 | somatory = 0 105 | 106 | ###For each model 107 | for n, m in enumerate(freq_dists): 108 | #Compute total n_grams in model 109 | total_ngrams_model = 0 110 | for v in m: 111 | total_ngrams_model += sum(v) 112 | 113 | #Compute total n_grams in other model 114 | total_ngrams_other_model = 0 115 | for v in freq_dists[(n+1)%2]: 116 | total_ngrams_other_model += sum(v) 117 | 118 | #Compute probability of a given n_gram in model 119 | prob_ngram_model = 0 120 | for v in m: 121 | prob_ngram_model += v[i] 122 | prob_ngram_model = prob_ngram_model / float(total_ngrams_model) 123 | 124 | #Compute probability of a given n_gram in the other model 125 | prob_ngram_other_model = 0 126 | for v in freq_dists[(n+1)%2]: 127 | prob_ngram_other_model += v[i] 128 | prob_ngram_other_model = prob_ngram_other_model / float(total_ngrams_other_model) 129 | 130 | ###For each video in model 131 | for v in m: 132 | n_gram_prob_v = v[i]/sum(v) 133 | somatory += (float(n_gram_prob_v) - prob_ngram_model)**2 134 | 135 | varInter[i] = 1.0/total_videos * somatory 136 | 137 | return varInter 138 | 139 | 140 | def optimizeBigrams(freq_dists): 141 | 142 | varIntra = computeIntraVariance(freq_dists) 143 | varInter = computeInterVariance(freq_dists) 144 | 145 | DIS = np.zeros(len(varIntra)) 146 | DIS = varInter/varIntra 147 | 148 | indexes_to_remove = [] 149 | 150 | for n, i in enumerate(DIS): 151 | if(i < 1): 152 | indexes_to_remove.append(n) 153 | 154 | return indexes_to_remove 155 | 156 | 157 | def buildModels(freq_dists): 158 | ##################################### 159 | # Build models 160 | ##################################### 161 | model_chat = np.zeros(len(freq_dists[0][0])) 162 | model_censored = np.zeros(len(freq_dists[0][0])) 163 | 164 | total_ngrams_chat_set = 0 165 | for dist in freq_dists[0]: 166 | total_ngrams_chat_set += sum(dist) 167 | 168 | total_ngrams_censored_set = 0 169 | for dist in freq_dists[1]: 170 | total_ngrams_censored_set += sum(dist) 171 | 172 | 173 | for i in range(0, len(model_chat)): 174 | somatory = 0 175 | for v in freq_dists[0]: 176 | n_gram_prob = v[i]/sum(v) 177 | v_total_grams = sum(v) 178 | somatory += (v_total_grams * n_gram_prob) 179 | model_chat[i] = 
(1/total_ngrams_chat_set) * somatory 180 | 181 | 182 | for i in range(0, len(model_censored)): 183 | somatory = 0 184 | for v in freq_dists[1]: 185 | n_gram_prob = v[i]/float(sum(v)) 186 | v_total_grams = sum(v) 187 | somatory += (v_total_grams * n_gram_prob) 188 | model_censored[i] = (1/float(total_ngrams_censored_set)) * somatory 189 | 190 | return model_chat, model_censored 191 | 192 | #Reproduces Facet Fixed threshold evalution 193 | def Prepare_X_Fixed(fig_folder, cfg,binWidth,freq_dists): 194 | optimization = True 195 | 196 | #Transform original freq_dists to include only the better bi-grams 197 | chat_samples = freq_dists[0] 198 | censored_samples = freq_dists[1] 199 | 200 | filtered_freq_dists = [] 201 | filtered_chat_samples = [] 202 | filtered_censored_samples = [] 203 | 204 | if(optimization): 205 | #Optimize bigram choice, build updated frequency distributions 206 | indexes_to_remove = optimizeBigrams(freq_dists) 207 | 208 | for sample in chat_samples: 209 | filtered_chat_samples.append(np.delete(sample, indexes_to_remove)) 210 | 211 | for sample in censored_samples: 212 | filtered_censored_samples.append(np.delete(sample, indexes_to_remove)) 213 | else: 214 | #Ignore optimization procedure, carry on with original frequency distributions 215 | filtered_chat_samples = chat_samples 216 | filtered_censored_samples = censored_samples 217 | 218 | 219 | #2x Cross validation 220 | filtered_freq_dists1 = [] 221 | filtered_freq_dists2 = [] 222 | 223 | filtered_freq_dists1.append(filtered_chat_samples[:len(filtered_chat_samples)/2]) 224 | filtered_freq_dists1.append(filtered_censored_samples[:len(filtered_censored_samples)/2]) 225 | 226 | filtered_freq_dists2.append(filtered_chat_samples[len(filtered_chat_samples)/2:]) 227 | filtered_freq_dists2.append(filtered_censored_samples[len(filtered_censored_samples)/2:]) 228 | 229 | model_chat1, model_censored1 = buildModels(filtered_freq_dists1) 230 | acc1, tnr1, fnr1, tpr1, fpr1, ppv1, npv1 = X_Classify_Fixed(cfg,binWidth,filtered_freq_dists2, model_chat1, model_censored1) 231 | print "1st Fold" 232 | print "Acc = " + str(acc1) 233 | print "TPR = " + str(tpr1) 234 | print "TNR = " + str(tnr1) 235 | print "FPR = " + str(fpr1) 236 | print "FNR = " + str(fnr1) 237 | print "PPV = " + str(ppv1) 238 | print "NPV = " + str(npv1) 239 | 240 | model_chat2, model_censored2 = buildModels(filtered_freq_dists2) 241 | acc2, tnr2, fnr2, tpr2, fpr2, ppv2, npv2 = X_Classify_Fixed(cfg,binWidth,filtered_freq_dists1, model_chat2, model_censored2) 242 | print "\n2nd Fold" 243 | print "Acc = " + str(acc2) 244 | print "TPR = " + str(tpr2) 245 | print "TNR = " + str(tnr2) 246 | print "FPR = " + str(fpr2) 247 | print "FNR = " + str(fnr2) 248 | print "PPV = " + str(ppv2) 249 | print "NPV = " + str(npv2) 250 | 251 | print "\n###################" 252 | print "Average" 253 | print "Acc = " + str((acc1 + acc2)/2.0) 254 | print "TPR = " + str((tpr1 + tpr2)/2.0) 255 | print "TNR = " + str((tnr1 + tnr2)/2.0) 256 | print "FPR = " + str((fpr1 + fpr2)/2.0) 257 | print "FNR = " + str((fnr1 + fnr2)/2.0) 258 | print "PPV = " + str((ppv1 + ppv2)/2.0) 259 | print "NPV = " + str((npv1 + npv2)/2.0) 260 | 261 | 262 | ###################################################################################### 263 | def X_Classify_Fixed(cfg, binWidth, freq_dists, model_chat, model_censored): 264 | ########################## 265 | #Classify samples 266 | ########################## 267 | FPositives = 0 268 | FNegatives = 0 269 | TPositives = 0 270 | TNegatives = 0 271 | 272 | #True negative is 
being classified as facet when it is facet 273 | for v in freq_dists[0]: 274 | chat_score = chisquare(v, model_chat) 275 | censored_score = chisquare(v, model_censored) 276 | 277 | if(chat_score < censored_score): 278 | TPositives += 1 279 | elif(censored_score < chat_score): 280 | FNegatives += 1 281 | 282 | for v in freq_dists[1]: 283 | chat_score = chisquare(v, model_chat) 284 | censored_score = chisquare(v, model_censored) 285 | 286 | if(censored_score < chat_score): 287 | TNegatives += 1 288 | elif(chat_score < censored_score): 289 | FPositives += 1 290 | 291 | 292 | accuracy = (TPositives + TNegatives)/float(len(freq_dists[0]) + len(freq_dists[1])) 293 | TNR = TNegatives/(TNegatives+float(FPositives)) 294 | FNR = FNegatives/(TPositives+float(FNegatives)) 295 | TPR = TPositives/(TPositives+float(FNegatives)) 296 | FPR = FPositives/(FPositives+float(TNegatives)) 297 | PPV = TPositives/(TPositives+float(FPositives)) 298 | NPV = TNegatives/(TNegatives+float(FNegatives)) 299 | 300 | return accuracy, TNR, FNR, TPR, FPR, PPV, NPV 301 | 302 | 303 | #Reproduces Facet Changing deltas evaluation 304 | def Prepare_X_RatioReproduction(fig_folder, cfg,binWidth,freq_dists): 305 | optimization = True 306 | 307 | 308 | #Transform original freq_dists to include only the better bi-grams 309 | chat_samples = freq_dists[0] 310 | censored_samples = freq_dists[1] 311 | 312 | filtered_freq_dists = [] 313 | filtered_chat_samples = [] 314 | filtered_censored_samples = [] 315 | 316 | if(optimization): 317 | #Optimize bigram choice, build updated frequency distributions 318 | indexes_to_remove = optimizeBigrams(freq_dists) 319 | np.save(fig_folder + "RemovedIndexes_" + cfg[1], np.array(indexes_to_remove)) 320 | 321 | for sample in chat_samples: 322 | filtered_chat_samples.append(np.delete(sample, indexes_to_remove)) 323 | 324 | for sample in censored_samples: 325 | filtered_censored_samples.append(np.delete(sample, indexes_to_remove)) 326 | else: 327 | #Ignore optimization procedure, carry on with original frequency distributions 328 | filtered_chat_samples = chat_samples 329 | filtered_censored_samples = censored_samples 330 | print "Finished optimization" 331 | #2x Cross validation 332 | filtered_freq_dists1 = [] 333 | filtered_freq_dists2 = [] 334 | 335 | filtered_freq_dists1.append(filtered_chat_samples[:len(filtered_chat_samples)/2]) 336 | filtered_freq_dists1.append(filtered_censored_samples[:len(filtered_censored_samples)/2]) 337 | 338 | filtered_freq_dists2.append(filtered_chat_samples[len(filtered_chat_samples)/2:]) 339 | filtered_freq_dists2.append(filtered_censored_samples[len(filtered_censored_samples)/2:]) 340 | 341 | model_chat1, model_censored1 = buildModels(filtered_freq_dists1) 342 | max_acc, max_delta, max_tpr, max_fpr, val90, val80, val70, specificity, sensitivity = X_Classify_RatioReproduction(cfg,binWidth,filtered_freq_dists2, model_chat1, model_censored1) 343 | print "1st Fold" 344 | print "TPR90 = " + str(val90) 345 | print "TPR80 = " + str(val80) 346 | print "TPR70 = " + str(val70) 347 | print "Max acc: " + str(max_acc) + " Max TPR:" + str(max_tpr) + " Max FPR:" + str(max_fpr) + " delta:" + str(max_delta) 348 | 349 | model_chat2, model_censored2 = buildModels(filtered_freq_dists2) 350 | max_acc2, max_delta2, max_tpr2, max_fpr2, val902, val802, val702, specificity2, sensitivity2 = X_Classify_RatioReproduction(cfg,binWidth,filtered_freq_dists1, model_chat2, model_censored2) 351 | print "2nd Fold" 352 | print "TPR90 = " + str(val902) 353 | print "TPR80 = " + str(val802) 354 | print "TPR70 = 
" + str(val702) 355 | print "Max acc: " + str(max_acc2) + " Max TPR:" + str(max_tpr2) + " Max FPR:" + str(max_fpr2) + " delta:" + str(max_delta2) 356 | 357 | print "###################" 358 | print "Average FPR" 359 | print "TPR90 = " + str((val902+val90)/2.0) 360 | print "TPR80 = " + str((val802+val80)/2.0) 361 | print "TPR70 = " + str((val702+val70)/2.0) 362 | print "Max acc: " + str((max_acc+max_acc2)/2.0) + " Max TPR:" + str((max_tpr+max_tpr2)/2.0) + " Max FPR:" + str((max_fpr+max_fpr2)/2.0) + " delta:" + str((max_delta + max_delta2)/2.0) 363 | 364 | fig = plt.figure() 365 | ax1 = fig.add_subplot(111) 366 | 367 | 368 | Specificity = (specificity + specificity2)/2.0 369 | Sensitivity = (sensitivity + sensitivity2)/2.0 370 | 371 | np.save(fig_folder + "ROC_" + str(optimization) + "_" + cfg[1] + "_" + str(binWidth)+"_Sensitivity", np.array(Sensitivity)) 372 | np.save(fig_folder + "ROC_" + str(optimization) + "_" + cfg[1] + "_" + str(binWidth)+"_Specificity", np.array(Specificity)) 373 | """ 374 | np.set_printoptions(threshold=np.inf) 375 | print specificity 376 | print specificity2 377 | """ 378 | 379 | print "AUC" 380 | auc = np.trapz(Sensitivity, 1 - Specificity) 381 | print auc 382 | #ROC Curve 383 | ax1.plot(1 - specificity, sensitivity, color='red', lw=2, alpha=0.7, label = 'K-Fold ROC') 384 | ax1.plot(1 - specificity2, sensitivity2, color='red', lw=2, alpha=0.7) 385 | ax1.plot(1 - Specificity, Sensitivity, 'k.-', color='black', label = 'Mean ROC (AUC = %0.2f)' % (auc)) 386 | ax1.plot([0, 1], [0, 1], 'k--', lw=2, color='orange', label = 'Random Guess') 387 | ax1.grid(color='black', linestyle='dotted') 388 | 389 | #plt.title('Receiver Operating Characteristic (ROC)') 390 | plt.xlabel('False Positive Rate', fontsize='xx-large') #one size down 391 | plt.ylabel('True Positive Rate', fontsize='xx-large') 392 | plt.legend(loc='lower right', fontsize='x-large') 393 | 394 | plt.setp(ax1.get_xticklabels(), fontsize=16) #14 395 | plt.setp(ax1.get_yticklabels(), fontsize=16) 396 | 397 | fig.savefig(fig_folder + "ROC_" + str(optimization) + "_" + cfg[1] + "_" + str(binWidth)+".pdf") # save the figure to file 398 | plt.close(fig) 399 | 400 | def X_Classify_RatioReproduction(cfg, binWidth,freq_dists, model_chat, model_censored): 401 | ########################## 402 | #Classify samples 403 | ########################## 404 | deltas = np.arange(0.001, 5, 0.001) 405 | FalsePositives = [] 406 | FalseNegatives = [] 407 | TruePositives = [] 408 | TrueNegatives = [] 409 | 410 | Sensitivity = [] 411 | Specificity = [] 412 | FalsePositiveRate = [] 413 | FalseNegativeRate =[] 414 | 415 | holding90 = True 416 | holding80 = True 417 | holding70 = True 418 | 419 | thresh90 = 0 420 | thresh80 = 0 421 | thresh70 = 0 422 | 423 | val90 = 0 424 | val80 = 0 425 | val70 = 0 426 | 427 | max_acc = 0 428 | max_delta = 0 429 | max_tpr = 0 430 | max_fpr = 0 431 | 432 | for delta in deltas: 433 | FPositives = 0 434 | FNegatives = 0 435 | TPositives = 0 436 | TNegatives = 0 437 | 438 | chat_ratios = [] 439 | censored_ratios = [] 440 | 441 | #Positive example is chat 442 | #True positive is being classified as facet when it is facet 443 | for v in freq_dists[0]: 444 | chat_score, p_value = chisquare(v, model_chat) 445 | censored_score, p_value2 = chisquare(v, model_censored) 446 | 447 | 448 | ratio = chat_score / float(censored_score) 449 | chat_ratios.append(ratio) 450 | if(ratio < delta): 451 | TNegatives += 1 452 | elif(ratio > delta): 453 | FPositives += 1 454 | 455 | for v in freq_dists[1]: 456 | chat_score, p_value = 
chisquare(v, model_chat) 457 | censored_score, p_value2 = chisquare(v, model_censored) 458 | 459 | ratio = chat_score / float(censored_score) 460 | censored_ratios.append(ratio) 461 | if(ratio > delta): 462 | TPositives += 1 463 | elif(ratio < delta): 464 | FNegatives += 1 465 | 466 | 467 | accuracy = (TPositives + TNegatives)/float(len(freq_dists[0]) + len(freq_dists[1])) 468 | TNR = TNegatives/(TNegatives+float(FPositives)) 469 | FNR = FNegatives/(TPositives+float(FNegatives)) 470 | TPR = TPositives/(TPositives+float(FNegatives)) 471 | FPR = FPositives/(FPositives+float(TNegatives)) 472 | 473 | if(accuracy > max_acc): 474 | max_acc = accuracy 475 | max_tpr = TPR 476 | max_fpr = FPR 477 | max_delta = delta 478 | 479 | FalsePositives.append(FPositives) 480 | FalseNegatives.append(FNegatives) 481 | TruePositives.append(TPositives) 482 | TrueNegatives.append(TNegatives) 483 | Sensitivity.append(TPositives/(TPositives+float(FNegatives))) 484 | Specificity.append(TNegatives/(TNegatives+float(FPositives))) 485 | FalsePositiveRate.append(FPR) 486 | FalseNegativeRate.append(FNR) 487 | 488 | if(holding90): 489 | if(FNR >= 0.1): 490 | holding90 = False 491 | thresh90 = delta 492 | val90 = FPR 493 | 494 | if(holding80): 495 | if(FNR >= 0.2): 496 | holding80 = False 497 | thresh80 = delta 498 | val80 = FPR 499 | 500 | if(holding70): 501 | if(FNR >= 0.3): 502 | holding70 = False 503 | thresh70 = delta 504 | val70 = FPR 505 | 506 | return max_acc, max_delta, max_tpr, max_fpr, val90, val80, val70, np.array(Specificity), np.array(Sensitivity) 507 | 508 | 509 | 510 | if __name__ == "__main__": 511 | 512 | sampleFolder = "TrafficCaptures/480Resolution/" 513 | 514 | if not os.path.exists('X2'): 515 | os.makedirs('X2') 516 | if not os.path.exists('X2/' + os.path.dirname(sampleFolder)): 517 | os.makedirs('X2/' + os.path.dirname(sampleFolder)) 518 | 519 | fig_folder = 'X2/' + os.path.dirname(sampleFolder) + '/' 520 | 521 | 522 | print "###########################" 523 | print os.path.dirname(sampleFolder) 524 | print "###########################" 525 | for cfg in cfgs: 526 | random.seed(a=1) # re-seed 527 | print "=====================================" 528 | print "X classifier - " + cfg[0] + " vs " + cfg[1] 529 | for binWidth in BIN_WIDTH: 530 | print "---------------------" 531 | print "Bin Width: " + str(binWidth) 532 | print "---------------------" 533 | #Compute bigram distributions and shuffle the samples 534 | freq_dists = ComputeBiGramDistributions(sampleFolder, cfg, binWidth) 535 | x = random.sample(freq_dists[0], len(freq_dists[0])) 536 | x2 = random.sample(freq_dists[1], len(freq_dists[1])) 537 | freqs = [] 538 | freqs.append(x) 539 | freqs.append(x2) 540 | 541 | print "Finished sample processing" 542 | #For reproducing results of Facet paper (70%,80%,90% blockage) 543 | Prepare_X_RatioReproduction(fig_folder, cfg,binWidth, freqs) 544 | 545 | #For getting fixed classification rates to compare with classifiers without a notion of internal thereshold 546 | #Prepare_X_Fixed(fig_folder, cfg,binWidth, freqs) 547 | -------------------------------------------------------------------------------- /DeltaShaperAnalysis/IsolationForest.py: -------------------------------------------------------------------------------- 1 | import socket 2 | import dpkt 3 | import os 4 | import csv 5 | import numpy as np 6 | import random 7 | import math 8 | from sklearn.ensemble import IsolationForest 9 | from sklearn.svm import OneClassSVM 10 | from sklearn.neighbors import LocalOutlierFactor 11 | 12 | from copy import 
deepcopy 13 | from scipy import interp 14 | import matplotlib.pyplot as plt 15 | from sklearn import preprocessing 16 | from sklearn.decomposition import PCA 17 | 18 | from sklearn.model_selection import GridSearchCV 19 | from sklearn.metrics import classification_report 20 | from sklearn.metrics import accuracy_score 21 | from sklearn.metrics import precision_score 22 | from sklearn.metrics import recall_score 23 | from sklearn.metrics import roc_auc_score 24 | from sklearn.metrics import roc_curve 25 | from sklearn.metrics import auc 26 | 27 | plt.rcParams['font.family'] = 'Helvetica' 28 | 29 | random.seed(42) 30 | rng = np.random.RandomState(42) 31 | 32 | def gatherHoldoutData(data_folder, cfg): 33 | 34 | SPLIT_FACTOR = 0.7 35 | #Load Datasets 36 | f = open(data_folder + cfg[0] + "_dataset.csv", 'r') 37 | reader = csv.reader(f, delimiter=',') 38 | reg = list(reader) 39 | 40 | f = open(data_folder + cfg[1] + "_dataset.csv", 'r') 41 | reader = csv.reader(f, delimiter=',') 42 | fac = list(reader) 43 | 44 | 45 | #Convert data to floats (and labels to integers) 46 | reg_data = [] 47 | for i in reg[1:]: 48 | int_array = [] 49 | for pl in i[:-1]: 50 | int_array.append(float(pl)) 51 | int_array.append(1) #0, inliers 52 | reg_data.append(int_array) 53 | 54 | fac_data = [] 55 | for i in fac[1:]: 56 | int_array = [] 57 | for pl in i[:-1]: 58 | int_array.append(float(pl)) 59 | int_array.append(-1) #1, outliers 60 | fac_data.append(int_array) 61 | 62 | 63 | #Shuffle both datasets 64 | shuffled_reg_data = random.sample(reg_data, len(reg_data)) 65 | shuffled_fac_data = random.sample(fac_data, len(fac_data)) 66 | 67 | #Build label tensors 68 | reg_labels = [] 69 | for i in shuffled_reg_data: 70 | reg_labels.append(int(i[len(reg_data[0])-1])) 71 | 72 | fac_labels = [] 73 | for i in shuffled_fac_data: 74 | fac_labels.append(int(i[len(reg_data[0])-1])) 75 | 76 | #Take label out of data tensors 77 | for i in range(0, len(shuffled_reg_data)): 78 | shuffled_reg_data[i].pop() 79 | 80 | for i in range(0, len(shuffled_fac_data)): 81 | shuffled_fac_data[i].pop() 82 | 83 | 84 | #Build training and testing datasets 85 | #Split each class data in the appropriate proportion for training 86 | reg_proportion_index = int(len(reg_labels)* SPLIT_FACTOR) 87 | reg_train_x = shuffled_reg_data[:reg_proportion_index] 88 | reg_train_y = reg_labels[:reg_proportion_index] 89 | 90 | fac_proportion_index = int(len(fac_labels)*SPLIT_FACTOR) 91 | fac_train_x = shuffled_fac_data[:fac_proportion_index] 92 | fac_train_y = fac_labels[:fac_proportion_index] 93 | 94 | #Create training sets by combining the randomly selected samples from each class 95 | train_x = reg_train_x + fac_train_x 96 | train_y = reg_train_y + fac_train_y 97 | 98 | #Make the split for the testing data 99 | reg_test_x = shuffled_reg_data[reg_proportion_index:] 100 | reg_test_y = reg_labels[reg_proportion_index:] 101 | 102 | fac_test_x = shuffled_fac_data[fac_proportion_index:] 103 | fac_test_y = fac_labels[fac_proportion_index:] 104 | 105 | #Create testing set by combining the holdout samples 106 | test_x = reg_test_x + fac_test_x 107 | test_y = reg_test_y + fac_test_y 108 | 109 | return train_x, train_y, test_x, test_y 110 | 111 | def gatherHoldoutData_10times(data_folder, cfg, split_factor): 112 | random.seed(1) 113 | SPLIT_FACTOR = split_factor 114 | #Load Datasets 115 | f = open(data_folder + cfg[0] + "_dataset.csv", 'r') 116 | reader = csv.reader(f, delimiter=',') 117 | reg = list(reader) 118 | 119 | f = open(data_folder + cfg[1] + "_dataset.csv", 'r') 120 | 
reader = csv.reader(f, delimiter=',') 121 | fac = list(reader) 122 | print "###########################################" 123 | print "Configuration " + cfg[1] 124 | print "###########################################" 125 | 126 | 127 | #Convert data to floats (and labels to integers) 128 | reg_data = [] 129 | for i in reg[1:]: 130 | int_array = [] 131 | for pl in i[:-1]: 132 | int_array.append(float(pl)) 133 | int_array.append(-1) #0, inliers 134 | reg_data.append(int_array) 135 | 136 | fac_data = [] 137 | for i in fac[1:]: 138 | int_array = [] 139 | for pl in i[:-1]: 140 | int_array.append(float(pl)) 141 | int_array.append(1) #1, outliers 142 | fac_data.append(int_array) 143 | 144 | train_x_t = [] 145 | train_y_t = [] 146 | test_x_t = [] 147 | test_y_t = [] 148 | 149 | for k in range(0,10): 150 | reg_data2 = deepcopy(reg_data) 151 | fac_data2 = deepcopy(fac_data) 152 | 153 | 154 | #Shuffle both datasets 155 | shuffled_reg_data = random.sample(reg_data2, len(reg_data2)) 156 | shuffled_fac_data = random.sample(fac_data2, len(fac_data2)) 157 | 158 | #Build label tensors 159 | reg_labels = [] 160 | for i in shuffled_reg_data: 161 | reg_labels.append(int(i[len(reg_data2[0])-1])) 162 | 163 | fac_labels = [] 164 | for i in shuffled_fac_data: 165 | fac_labels.append(int(i[len(reg_data2[0])-1])) 166 | 167 | #Take label out of data tensors 168 | for i in range(0, len(shuffled_reg_data)): 169 | shuffled_reg_data[i].pop() 170 | 171 | for i in range(0, len(shuffled_fac_data)): 172 | shuffled_fac_data[i].pop() 173 | 174 | 175 | #Build training and testing datasets 176 | #Split each class data in the appropriate proportion for training 177 | reg_proportion_index = int(len(reg_labels)* SPLIT_FACTOR) 178 | reg_train_x = shuffled_reg_data[:reg_proportion_index] 179 | reg_train_y = reg_labels[:reg_proportion_index] 180 | 181 | fac_proportion_index = int(len(fac_labels)*SPLIT_FACTOR) 182 | fac_train_x = shuffled_fac_data[:fac_proportion_index] 183 | fac_train_y = fac_labels[:fac_proportion_index] 184 | 185 | #Create training sets by combining the randomly selected samples from each class 186 | train_x = reg_train_x + fac_train_x 187 | train_y = reg_train_y + fac_train_y 188 | 189 | #Make the split for the testing data 190 | reg_test_x = shuffled_reg_data[reg_proportion_index:] 191 | reg_test_y = reg_labels[reg_proportion_index:] 192 | fac_test_x = shuffled_fac_data[fac_proportion_index:] 193 | fac_test_y = fac_labels[fac_proportion_index:] 194 | 195 | #Create testing set by combining the holdout samples 196 | test_x = reg_test_x + fac_test_x 197 | test_y = reg_test_y + fac_test_y 198 | 199 | train_x_t.append(train_x) 200 | train_y_t.append(train_y) 201 | test_x_t.append(test_x) 202 | test_y_t.append(test_y) 203 | 204 | 205 | return train_x_t, train_y_t, test_x_t, test_y_t 206 | 207 | def gatherAllData(data_folder, cfg): 208 | #Load Datasets 209 | f = open(data_folder + cfg[0] + "_dataset.csv", 'r') 210 | reader = csv.reader(f, delimiter=',') 211 | reg = list(reader) 212 | 213 | f = open(data_folder + cfg[1] + "_dataset.csv", 'r') 214 | reader = csv.reader(f, delimiter=',') 215 | fac = list(reader) 216 | print "###########################################" 217 | print "Configuration " + cfg[1] 218 | print "###########################################" 219 | 220 | #Convert data to floats (and labels to integers) 221 | reg_data = [] 222 | for i in reg[1:]: 223 | int_array = [] 224 | for pl in i[:-1]: 225 | int_array.append(float(pl)) 226 | int_array.append(0) 227 | reg_data.append(int_array) 228 | 229 | fac_data = 
[] 230 | for i in fac[1:]: 231 | int_array = [] 232 | for pl in i[:-1]: 233 | int_array.append(float(pl)) 234 | int_array.append(1) 235 | fac_data.append(int_array) 236 | 237 | 238 | #Shuffle both datasets 239 | shuffled_reg_data = random.sample(reg_data, len(reg_data)) 240 | shuffled_fac_data = random.sample(fac_data, len(fac_data)) 241 | 242 | #Build label tensors 243 | reg_labels = [] 244 | for i in shuffled_reg_data: 245 | reg_labels.append(int(i[len(reg_data[0])-1])) 246 | 247 | fac_labels = [] 248 | for i in shuffled_fac_data: 249 | fac_labels.append(int(i[len(reg_data[0])-1])) 250 | 251 | #Take label out of data tensors 252 | for i in range(0, len(shuffled_reg_data)): 253 | shuffled_reg_data[i].pop() 254 | 255 | for i in range(0, len(shuffled_fac_data)): 256 | shuffled_fac_data[i].pop() 257 | 258 | #Create training sets by combining the randomly selected samples from each class 259 | train_x = shuffled_reg_data + shuffled_fac_data 260 | train_y = reg_labels + fac_labels 261 | 262 | #Shuffle positive/negative samples for CV purposes 263 | x_shuf = [] 264 | y_shuf = [] 265 | index_shuf = range(len(train_x)) 266 | random.shuffle(index_shuf) 267 | for i in index_shuf: 268 | x_shuf.append(train_x[i]) 269 | y_shuf.append(train_y[i]) 270 | 271 | return x_shuf, y_shuf 272 | 273 | def runIsolationSearch(data_folder, cfg, cnt_factor): #sweep the number of trees and keep the best average holdout accuracy 274 | 275 | 276 | max_acc = 0 277 | max_tree = 0 278 | 279 | for n, t in enumerate(range(10,500,10)): 280 | print t 281 | acc = 0 282 | tnr = 0 283 | fnr = 0 284 | tpr = 0 285 | fpr = 0 286 | ppv = 0 287 | npv = 0 288 | for i in range(0,3): 289 | #Gather the dataset 290 | train_x, train_y, test_x, test_y = gatherHoldoutData(data_folder, cfg) 291 | 292 | clf = IsolationForest(n_estimators=int(t), random_state=rng, max_features=1.0, contamination=cnt_factor) 293 | 294 | # fit the model 295 | cnt_train = int(math.ceil(cnt_factor * (len(train_x)/2))) 296 | clf.fit(train_x[:(len(train_x)/2) + cnt_train]) 297 | 298 | #make predictions on testing data 299 | cnt_test = int(math.ceil(cnt_factor * (len(test_x)/2))) 300 | #y_true, y_pred = test_y[:(len(test_x)/2) + cnt_test], clf.predict(test_x[:(len(test_x)/2) + cnt_test]) 301 | 302 | y_true, y_pred = test_y, clf.predict(test_x) 303 | 304 | #print(roc_auc_score(y_true, -clf.decision_function(test_x[:(len(test_x)/2) + cnt_test]))) 305 | 306 | eps = 0.0000000001 307 | FPositives = 0 308 | FNegatives = 0 309 | TPositives = 0 310 | TNegatives = 0 311 | 312 | for n, lbl in enumerate(y_pred): 313 | if(lbl == -1 and y_true[n] == -1): 314 | TNegatives += 1 315 | elif(lbl == 1 and y_true[n] == -1): 316 | FPositives += 1 317 | elif(lbl == -1 and y_true[n] == 1): 318 | FNegatives += 1 319 | elif(lbl == 1 and y_true[n] == 1): 320 | TPositives += 1 321 | 322 | accuracy = (TPositives + TNegatives)/float((len(test_x))) 323 | TNR = TNegatives/(TNegatives+float(FPositives)+eps) 324 | FNR = FNegatives/(TPositives+float(FNegatives)) 325 | TPR = TPositives/(TPositives+float(FNegatives)) 326 | FPR = FPositives/(FPositives+float(TNegatives)+eps) 327 | PPV = TPositives/(TPositives+float(FPositives)) 328 | NPV = TNegatives/(TNegatives+float(FNegatives)+eps) 329 | 330 | acc+=accuracy 331 | tnr+=TNR 332 | fnr+=FNR 333 | tpr+=TPR 334 | fpr+=FPR 335 | ppv+=PPV 336 | npv+=NPV 337 | 338 | ac = acc/3 339 | if(int(t)%100 == 0): 340 | print str(t) + " trees = " + str(ac) 341 | if(ac > max_acc): 342 | max_acc = ac 343 | max_tree = int(t) 344 | print max_acc 345 | print max_tree 346 | 347 | def runIsolationRounds(data_folder, cfg, cnt_factor): 348 | 349 | acc = 0 350 | 
tnr = 0 351 | fnr = 0 352 | tpr = 0 353 | fpr = 0 354 | ppv = 0 355 | npv = 0 356 | 357 | for i in range(0,10): 358 | #Gather the dataset 359 | train_x, train_y, test_x, test_y = gatherHoldoutData(data_folder, cfg) 360 | 361 | clf = IsolationForest(n_estimators=100, random_state=rng, max_features=1.0, contamination=cnt_factor) 362 | 363 | # fit the model 364 | cnt_train = int(math.ceil(cnt_factor * (len(train_x)/2))) 365 | clf.fit(train_x[:(len(train_x)/2) + cnt_train]) 366 | 367 | #make predictions on testing data 368 | cnt_test = int(math.ceil(cnt_factor * (len(test_x)/2))) 369 | #y_true, y_pred = test_y[:(len(test_x)/2) + cnt_test], clf.predict(test_x[:(len(test_x)/2) + cnt_test]) 370 | 371 | y_true, y_pred = test_y, clf.predict(test_x) 372 | 373 | #print(roc_auc_score(y_true, -clf.decision_function(test_x[:(len(test_x)/2) + cnt_test]))) 374 | 375 | eps = 0.0000000001 376 | FPositives = 0 377 | FNegatives = 0 378 | TPositives = 0 379 | TNegatives = 0 380 | 381 | for n, lbl in enumerate(y_pred): 382 | if(lbl == -1 and y_true[n] == -1): 383 | TNegatives += 1 384 | elif(lbl == 1 and y_true[n] == -1): 385 | FPositives += 1 386 | elif(lbl == -1 and y_true[n] == 1): 387 | FNegatives += 1 388 | elif(lbl == 1 and y_true[n] == 1): 389 | TPositives += 1 390 | 391 | accuracy = (TPositives + TNegatives)/float((len(test_x))) 392 | TNR = TNegatives/(TNegatives+float(FPositives)+eps) 393 | FNR = FNegatives/(TPositives+float(FNegatives)) 394 | TPR = TPositives/(TPositives+float(FNegatives)) 395 | FPR = FPositives/(FPositives+float(TNegatives)+eps) 396 | PPV = TPositives/(TPositives+float(FPositives)) 397 | NPV = TNegatives/(TNegatives+float(FNegatives)+eps) 398 | 399 | acc+=accuracy 400 | tnr+=TNR 401 | fnr+=FNR 402 | tpr+=TPR 403 | fpr+=FPR 404 | ppv+=PPV 405 | npv+=NPV 406 | 407 | 408 | print "Acc = " + str(acc/10) 409 | print "TPR = " + str(tpr/10) 410 | print "TNR = " + str(tnr/10) 411 | print "FPR = " + str(fpr/10) 412 | print "FNR = " + str(fnr/10) 413 | print "PPV = " + str(ppv/10) 414 | print "NPV = " + str(npv/10) 415 | 416 | def runIsolation(data_folder, cfg, cnt_factor): 417 | rng = np.random.RandomState(42) 418 | #Gather the dataset 419 | train_x, train_y, test_x, test_y = gatherHoldoutData(data_folder, cfg) 420 | 421 | clf = IsolationForest(n_estimators=100,random_state=rng, bootstrap=True, max_features=1.0, contamination=cnt_factor) 422 | 423 | # fit the model 424 | cnt_train = int(math.ceil(cnt_factor * (len(train_x)/2))) 425 | clf.fit(train_x[:(len(train_x)/2) + cnt_train]) 426 | 427 | #make predictions on testing data 428 | cnt_test = int(math.ceil(cnt_factor * (len(test_x)/2))) 429 | y_true, y_pred = test_y[:(len(test_x)/2) + cnt_test], clf.predict(test_x[:(len(test_x)/2) + cnt_test]) 430 | 431 | #y_true, y_pred = test_y, clf.predict(test_x) 432 | 433 | #print(roc_auc_score(y_true, -clf.decision_function(test_x[:(len(test_x)/2) + cnt_test]))) 434 | 435 | eps = 0.0000000001 436 | FPositives = 0 437 | FNegatives = 0 438 | TPositives = 0 439 | TNegatives = 0 440 | 441 | for n, lbl in enumerate(y_pred): 442 | if(lbl == -1 and y_true[n] == -1): 443 | TNegatives += 1 444 | elif(lbl == 1 and y_true[n] == -1): 445 | FPositives += 1 446 | elif(lbl == -1 and y_true[n] == 1): 447 | FNegatives += 1 448 | elif(lbl == 1 and y_true[n] == 1): 449 | TPositives += 1 450 | 451 | accuracy = (TPositives + TNegatives)/float((len(test_x)/2) + cnt_test) 452 | TNR = TNegatives/(TNegatives+float(FPositives)+eps) 453 | FNR = FNegatives/(TPositives+float(FNegatives)) 454 | TPR = 
TPositives/(TPositives+float(FNegatives)) 455 | FPR = FPositives/(FPositives+float(TNegatives)+eps) 456 | PPV = TPositives/(TPositives+float(FPositives)) 457 | NPV = TNegatives/(TNegatives+float(FNegatives)+eps) 458 | print "Acc = " + str(accuracy) 459 | print "TPR = " + str(TPR) 460 | print "TNR = " + str(TNR) 461 | print "FPR = " + str(FPR) 462 | print "FNR = " + str(FNR) 463 | print "PPV = " + str(PPV) 464 | print "NPV = " + str(NPV) 465 | 466 | def runOptimizedIso_CV(data_folder, cfg, feature_set): 467 | train_X, train_Y, test_X, test_Y = gatherHoldoutData_10times(data_folder, cfg, 0.9) 468 | 469 | estimators = [50, 100, 200] #np.linspace(0.1, 1, 10) 470 | samples=[64, 128, 256, 512] 471 | cnt_factors = [0] 472 | 473 | auc_report = [] 474 | best_config = [] 475 | max_auc = 0 476 | for estimator in estimators: 477 | for s in samples: 478 | mean_fpr = np.linspace(0, 1, 100) 479 | tprs = [] 480 | for n in range(0,10): 481 | train_x = train_X[n] 482 | train_y = train_Y[n] 483 | test_x = test_X[n] 484 | test_y = test_Y[n] 485 | 486 | rng = np.random.RandomState(2) 487 | clf = IsolationForest(n_estimators=estimator, max_samples=s, random_state=rng, bootstrap=True, max_features=1.0, contamination=0.5) 488 | clf.fit(train_x) 489 | 490 | #make predictions on testing data 491 | y_true, y_pred = test_y, clf.predict(test_x) 492 | #IsolationForest predicts +1 for inliers and -1 for outliers; flip the labels so that covert traffic (+1) is the positive class 493 | for n, l in enumerate(y_pred): 494 | if(l==1): 495 | y_pred[n] = -1 496 | elif(l==-1): 497 | y_pred[n] = 1 498 | 499 | fpr, tpr, thresholds = roc_curve(y_true, y_pred, drop_intermediate=True,pos_label=1) 500 | #print y_true 501 | #print y_pred 502 | tprs.append(interp(mean_fpr, fpr, tpr)) 503 | tprs[-1][0] = 0.0 504 | 505 | roc_auc = auc(fpr, tpr) 506 | #print "Fold %i auc: %f" % (n, roc_auc) 507 | 508 | mean_tpr = np.mean(tprs, axis=0) 509 | mean_tpr[-1] = 1.0 510 | mean_auc = auc(mean_fpr, mean_tpr) 511 | auc_report.append(mean_auc) 512 | 513 | if(mean_auc > max_auc): 514 | max_auc = mean_auc 515 | best_config = [mean_fpr, mean_tpr, estimator,s] 516 | print ("%f - estimator:%i, max-samples: %i" % (mean_auc, estimator, s)) 517 | 518 | print "################\n# Summary" 519 | print "Max. AUC: %f, Estimator: %i, Samples: %i" % (max_auc, best_config[2],best_config[3]) 520 | print "Avg. 
AUC: %f" % (np.mean(auc_report,axis=0)) 521 | #Figure properties 522 | 523 | fig = plt.figure() 524 | ax1 = fig.add_subplot(111) 525 | plt.xlim([0, 1]) 526 | plt.ylim([0, 1]) 527 | plt.xlabel('False Positive Rate', fontsize=26) 528 | plt.ylabel('True Positive Rate', fontsize=26) 529 | 530 | ax1.plot([0, 1], [0, 1], 'k--', lw=2, color='orange', label = 'Random Guess') 531 | ax1.grid(color='black', linestyle='dotted') 532 | plt.setp(ax1.get_xticklabels(), fontsize=16) 533 | plt.setp(ax1.get_yticklabels(), fontsize=16) 534 | plt.plot(best_config[0], best_config[1], color='b', label=r'ROC (AUC = %0.2f)' % (max_auc), lw=2, alpha=.8) 535 | plt.legend(loc='lower right', fontsize='x-large') 536 | 537 | fig.savefig('Isolation/' + feature_set + '/' + "DeltaShaper_Isolation_" + cfg[1] + ".pdf") # save the figure under the per-feature-set folder 538 | plt.close(fig) 539 | 540 | 541 | if __name__ == "__main__": 542 | 543 | cfgs = [ 544 | ["RegularTraffic", 545 | "DeltaShaperTraffic_320"], 546 | ["RegularTraffic", 547 | "DeltaShaperTraffic_160"]] 548 | 549 | if not os.path.exists('Isolation'): 550 | os.makedirs('Isolation') 551 | 552 | print "Isolation Forest - Summary Statistic Features - Set1" 553 | feature_set = 'Stats_60' #'Stats_60' / 'PL_60' 554 | data_folder = 'FeatureSets/' + feature_set + '/' 555 | if not os.path.exists('Isolation/' + feature_set): 556 | os.makedirs('Isolation/' + feature_set) 557 | 558 | 559 | for cfg in cfgs: 560 | runOptimizedIso_CV(data_folder, cfg, feature_set) 561 | print "#####################################\n" 562 | 563 | print "Isolation Forest - Packet Length Features - Set2" 564 | feature_set = 'PL_60' #'Stats_60' / 'PL_60' 565 | data_folder = 'FeatureSets/' + feature_set + '/' 566 | if not os.path.exists('Isolation/' + feature_set): 567 | os.makedirs('Isolation/' + feature_set) 568 | 569 | for cfg in cfgs: 570 | runOptimizedIso_CV(data_folder, cfg, feature_set) 571 | 572 | --------------------------------------------------------------------------------
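Note: the X2 classifier above decides per sample by comparing chi-square fit statistics against a "chat" model and a "censored" (covert) model and thresholding their ratio. The following is a minimal, self-contained sketch of that decision rule, assuming the scipy.stats.chisquare version pinned in requirements.txt; the decide() helper and the toy bin counts are illustrative only and are not part of the repository.

# Illustrative sketch of the chi-square ratio test (not a repository file).
import numpy as np
from scipy.stats import chisquare

def decide(sample, model_chat, model_censored, delta=1.0):
    # chisquare returns (statistic, p-value); only the statistic is used here.
    chat_stat, _ = chisquare(sample, model_chat)
    censored_stat, _ = chisquare(sample, model_censored)
    ratio = chat_stat / float(censored_stat)
    # A ratio above delta means the sample fits the censored model better,
    # so it is flagged as covert traffic.
    return "covert" if ratio > delta else "regular"

# Toy expected bin counts over four packet-length bins (totals match on purpose).
model_chat = np.array([10.0, 20.0, 30.0, 40.0])
model_censored = np.array([40.0, 30.0, 20.0, 10.0])
sample = np.array([38.0, 31.0, 19.0, 12.0])
print decide(sample, model_chat, model_censored)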