├── models ├── dl │ ├── graphDapp │ │ ├── __init__.py │ │ ├── readme.txt │ │ ├── graphDapp_config.py │ │ ├── logger_wrappers.py │ │ ├── model_seriealization.py │ │ ├── graphDapp_main_model.py │ │ ├── test.py │ │ ├── train.py │ │ └── DApp_Classifier.py │ ├── __pycache__ │ │ └── .gitignore │ ├── fsnet │ │ ├── requirement.txt │ │ ├── service_online.py │ │ ├── README.md │ │ ├── dataset.py │ │ ├── eval.py │ │ ├── preprocess.py │ │ └── main.py │ ├── mimetic │ │ ├── readme │ │ ├── logger_wrappers.py │ │ ├── model_seriealization.py │ │ └── build_model.py │ ├── df │ │ ├── Deep Fingerprinting.pdf │ │ ├── df_model_config.py │ │ ├── df_services.py │ │ ├── df_services_test.py │ │ ├── generate_dataset.py │ │ ├── df_main_model.py │ │ └── df_model.py │ ├── df_only_D │ │ ├── Deep Fingerprinting.pdf │ │ ├── df_model_config.py │ │ ├── df_services.py │ │ ├── df_services_test.py │ │ ├── generate_dataset.py │ │ ├── df_model.py │ │ └── df_main_model.py │ ├── appnet │ │ ├── readme │ │ ├── logger_wrappers.py │ │ ├── model_seriealization.py │ │ └── build_model.py │ ├── cnn │ │ ├── Automated Website Fingerprinting through deep learning.pdf │ │ ├── cnn_model_config.py │ │ └── cnn_model.py │ ├── lstm │ │ ├── Automated Website Fingerprinting through deep learning.pdf │ │ ├── lstm_model_config.py │ │ └── lstm_model.py │ ├── sdae │ │ ├── Automated Website Fingerprinting through deep learning.pdf │ │ ├── sdae_model_config.py │ │ └── sdae_model.py │ ├── select_gpu.py │ ├── accuracy_per_class.py │ ├── beauty │ │ ├── cnn_model_config.py │ │ ├── cnn_model.py │ │ └── beauty_main_model.py │ ├── df_eval.py │ ├── sdae_eval.py │ ├── cnn_eval.py │ ├── examples.py │ ├── cnn_example.py │ ├── df_example.py │ ├── sdae_example.py │ ├── README.md │ ├── lstm_eval.py │ ├── lstm_example.py │ └── awf_dataset_util │ │ └── data.py ├── ml │ ├── __init__.py │ ├── cumul │ │ ├── model.py │ │ ├── saved_model │ │ │ └── readme.txt │ │ ├── __init__.py │ │ ├── hyper_params.py │ │ ├── model_predict.py │ │ ├── model_train.py │ │ ├── attack_cumul.py │ │ ├── feature_extractor.py │ │ └── util.py │ ├── bind │ │ ├── __init__.py │ │ ├── hyper_params.py │ │ ├── run.py │ │ ├── eval.py │ │ ├── train.py │ │ └── README.md │ ├── appscanner │ │ ├── __init__.py │ │ ├── 【1】AppScanner.pdf │ │ ├── README │ │ ├── hyper_params.py │ │ ├── min_max.py │ │ ├── eval.py │ │ ├── feature_extractor.py │ │ ├── train.py │ │ └── model.py │ └── rdp │ │ ├── rdp_config.py │ │ ├── readme.cn │ │ ├── feature_extractor.py │ │ ├── convert_to_csv.py │ │ ├── util.py │ │ └── statistic_tractor.py ├── __init__.py └── model_base.py ├── .idea └── .name ├── .gitignore ├── run.sh ├── config.py └── get_dataset_statistics.py /models/dl/graphDapp/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /.idea/.name: -------------------------------------------------------------------------------- 1 | traffic_classification_utils -------------------------------------------------------------------------------- /models/ml/__init__.py: -------------------------------------------------------------------------------- 1 | __author__ = 'dk' 2 | -------------------------------------------------------------------------------- /models/ml/cumul/model.py: -------------------------------------------------------------------------------- 1 | __author__ = 'dk' 2 | -------------------------------------------------------------------------------- /models/__init__.py: 
-------------------------------------------------------------------------------- 1 | __author__ = 'dk' 2 | 3 | 4 | -------------------------------------------------------------------------------- /models/dl/__pycache__/.gitignore: -------------------------------------------------------------------------------- 1 | *.cpython-36.pyc 2 | -------------------------------------------------------------------------------- /models/ml/bind/__init__.py: -------------------------------------------------------------------------------- 1 | __author__ = 'dk' 2 | -------------------------------------------------------------------------------- /models/ml/appscanner/__init__.py: -------------------------------------------------------------------------------- 1 | __author__ = 'dk' 2 | 3 | -------------------------------------------------------------------------------- /models/dl/fsnet/requirement.txt: -------------------------------------------------------------------------------- 1 | tqdm 2 | numpy == 1.14.5 3 | tensorflow == 1.8.0 4 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | * 2 | !README 3 | !*/ 4 | !*.txt 5 | !*.name 6 | !*.py 7 | !*.md 8 | !*.pdf 9 | -------------------------------------------------------------------------------- /models/dl/mimetic/readme: -------------------------------------------------------------------------------- 1 | 1. Payload information: 2 | processed with a CNN 3 | 2. Sequence information: 4 | packet length, packet direction, packet inter-arrival time, and tcp window size sequences, processed with a GRU -------------------------------------------------------------------------------- /models/dl/graphDapp/readme.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jmhIcoding/traffic_classification_utils/HEAD/models/dl/graphDapp/readme.txt -------------------------------------------------------------------------------- /models/dl/df/Deep Fingerprinting.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jmhIcoding/traffic_classification_utils/HEAD/models/dl/df/Deep Fingerprinting.pdf -------------------------------------------------------------------------------- /models/ml/appscanner/【1】AppScanner.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jmhIcoding/traffic_classification_utils/HEAD/models/ml/appscanner/【1】AppScanner.pdf -------------------------------------------------------------------------------- /models/ml/cumul/saved_model/readme.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jmhIcoding/traffic_classification_utils/HEAD/models/ml/cumul/saved_model/readme.txt -------------------------------------------------------------------------------- /models/dl/df_only_D/Deep Fingerprinting.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jmhIcoding/traffic_classification_utils/HEAD/models/dl/df_only_D/Deep Fingerprinting.pdf -------------------------------------------------------------------------------- /models/ml/appscanner/README: -------------------------------------------------------------------------------- 1 | This is the AppScanner model, mainly used to identify mobile app traffic. 
2 | Paper: Appscanner: Automatic fingerprinting of smartphone apps from encrypted network traffic (EuroS&P 2016) -------------------------------------------------------------------------------- /run.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | source ~/venv/bin/activate && 4 | export PYTHONPATH=/home3/jmh/traffic_classification_utils/:$PYTHONPATH && 5 | python3 $1 #foolbox_example.py 6 | -------------------------------------------------------------------------------- /models/dl/appnet/readme: -------------------------------------------------------------------------------- 1 | Packet length sequence: embedding, with a Bi-LSTM for feature extraction 2 | The packet length sequence is truncated to at most 20, with an embedding size of 128 3 | 4 | Byte payload of the first TLS handshake packet: embedding, with a CNN for feature extraction 5 | The first handshake packet is truncated to 1014 bytes, with an embedding size of 256 6 | 7 | -------------------------------------------------------------------------------- /config.py: -------------------------------------------------------------------------------- 1 | __author__ = 'dk' 2 | import os 3 | project_base = os.path.dirname(os.path.realpath(__file__)) 4 | raw_dataset_base = project_base + '/dataset/' 5 | min_flow_len = 10 6 | whitelist = [] 7 | -------------------------------------------------------------------------------- /models/ml/cumul/__init__.py: -------------------------------------------------------------------------------- 1 | ################## 2 | ################## 3 | ##Implements the CUMUL Tor classification method based on cells + SVM 4 | ##`https://nymity.ch/tor-dns/pdf/Panchenko2016a.pdf` 5 | ################## 6 | ################## 7 | -------------------------------------------------------------------------------- /models/dl/cnn/Automated Website Fingerprinting through deep learning.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jmhIcoding/traffic_classification_utils/HEAD/models/dl/cnn/Automated Website Fingerprinting through deep learning.pdf -------------------------------------------------------------------------------- /models/dl/lstm/Automated Website Fingerprinting through deep learning.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jmhIcoding/traffic_classification_utils/HEAD/models/dl/lstm/Automated Website Fingerprinting through deep learning.pdf -------------------------------------------------------------------------------- /models/dl/sdae/Automated Website Fingerprinting through deep learning.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jmhIcoding/traffic_classification_utils/HEAD/models/dl/sdae/Automated Website Fingerprinting through deep learning.pdf -------------------------------------------------------------------------------- /models/dl/graphDapp/graphDapp_config.py: -------------------------------------------------------------------------------- 1 | __author__ = 'dk' 2 | config = { 3 | 'device_id':'cuda:2', 4 | 'max_epoch':200, 5 | 'gin_layer_num':3, 6 | 'gin_hidden_units':64, 7 | 'iteration_nums':3, 8 | 'learning_rate':5e-4, 9 | 'batch_size':128, 10 | 11 | } 12 | -------------------------------------------------------------------------------- /models/ml/cumul/hyper_params.py: -------------------------------------------------------------------------------- 1 | __author__ = 'dk' 2 | hyper_params = {'boosting_type': 'rf', 3 | 'objective': 'multiclass', 4 | 'num_leaves': 512, 5 | 'learning_rate': 0.05, 6 | 'feature_fraction': 0.9, 7 | 'bagging_fraction': 0.8, 8 | 
'bagging_freq': 5, 9 | 'verbose': -1, 10 | 'num_class':100, 11 | 'lambda_l1':0.05, 12 | 'lambda_l2':0.15 13 | } 14 | -------------------------------------------------------------------------------- /models/ml/bind/hyper_params.py: -------------------------------------------------------------------------------- 1 | __author__ = 'dk' 2 | hyper_params = { 3 | 'boosting_type': 'gbdt', 4 | 'objective': 'multiclass', 5 | 'num_leaves': 512, 6 | 'learning_rate': 0.05, 7 | 'feature_fraction': 0.9, 8 | 'bagging_fraction': 0.8, 9 | 'bagging_freq': 5, 10 | 'verbose': -1, 11 | 'lambda_l1':0.05, 12 | 'lambda_l2':0.15 13 | } 14 | -------------------------------------------------------------------------------- /models/ml/appscanner/hyper_params.py: -------------------------------------------------------------------------------- 1 | __author__ = 'dk' 2 | hyper_params = { 3 | 'boosting_type': 'gbdt', 4 | 'objective': 'multiclass', 5 | 'num_leaves': 32, 6 | 'learning_rate': 0.05, 7 | 'feature_fraction': 0.9, 8 | 'bagging_fraction': 0.8, 9 | 'bagging_freq': 5, 10 | 'verbose': -1, 11 | 'lambda_l1':0.05, 12 | 'lambda_l2':0.15 13 | } 14 | -------------------------------------------------------------------------------- /models/ml/rdp/rdp_config.py: -------------------------------------------------------------------------------- 1 | __author__ = 'dk' 2 | #超参数 3 | hyper_params={ 4 | 'boosting_type': 'rf', 5 | 'objective': 'multiclass', 6 | 'num_leaves': 512, 7 | 'learning_rate': 0.05, 8 | 'feature_fraction': 0.9, 9 | 'bagging_fraction': 0.8, 10 | 'bagging_freq': 5, 11 | 'verbose': 0, 12 | 'num_class':6, 13 | 'lambda_l1':0.05, 14 | 'lambda_l2':0.15, 15 | 'time_threshold':0.3 16 | } 17 | -------------------------------------------------------------------------------- /models/dl/df/df_model_config.py: -------------------------------------------------------------------------------- 1 | __author__ = 'dk' 2 | nb_classes_template = 55 #标签的个数 number of classes【此处需要修改】 3 | learning_params_template ={ 4 | "epoch":200, 5 | "batch_size":128, 6 | "in_dim":200, #输入向量的长度【此处需要修改】 7 | "input_length":200, #输入向量的长度【此处需要修改】 8 | "lr":0.002, #学习速率 9 | "beta_1":0.9, 10 | "beta_2":0.999, 11 | "epsilon":1e-08, 12 | "decay":0.0 13 | } 14 | 15 | assert learning_params_template['in_dim']==learning_params_template['input_length'] 16 | -------------------------------------------------------------------------------- /models/dl/df_only_D/df_model_config.py: -------------------------------------------------------------------------------- 1 | __author__ = 'dk' 2 | nb_classes_template = 55 #标签的个数 number of classes【此处需要修改】 3 | learning_params_template ={ 4 | "epoch":200, 5 | "batch_size":128, 6 | "in_dim":200, #输入向量的长度【此处需要修改】 7 | "input_length":200, #输入向量的长度【此处需要修改】 8 | "lr":0.002, #学习速率 9 | "beta_1":0.9, 10 | "beta_2":0.999, 11 | "epsilon":1e-08, 12 | "decay":0.0 13 | } 14 | 15 | assert learning_params_template['in_dim']==learning_params_template['input_length'] 16 | -------------------------------------------------------------------------------- /models/ml/rdp/readme.cn: -------------------------------------------------------------------------------- 1 | 方法来源:IPCCC 2019 2 | @inproceedings{jiang2019know, 3 | title={I Know What You Are Doing With Remote Desktop}, 4 | author={Jiang, Minghao and Gou, Gaopeng and Shi, Junzheng and Xiong, Gang}, 5 | booktitle={2019 IEEE 38th International Performance Computing and Communications Conference (IPCCC)}, 6 | pages={1--7}, 7 | year={2019}, 8 | organization={IEEE} 9 | } 10 | 11 | 方法逻辑: 12 | 1. 
按照时间idle切分burst 13 | 2. 每个burst提取66维统计特征,前51维是包长的统计特征,后15维是包间隔的统计特征 14 | 3. 训练的时候,以burst为单元进行分类 15 | 4. 测试的时候,给定一条流,对里面的所有burst,按照投票选出它的标签 16 | 17 | -------------------------------------------------------------------------------- /models/dl/fsnet/service_online.py: -------------------------------------------------------------------------------- 1 | __author__ = 'dk' 2 | from flask import request, Flask, jsonify 3 | app = Flask(__name__) 4 | from fsnet_main_model import model 5 | 6 | fsnet_model = model('fgnet53', randseed= 128, splitrate=0.1) 7 | @app.route('/fsnet/logit',methods=['POST']) 8 | def get_logit(): 9 | if not request.json or not 'flow' in request.json: 10 | return jsonify({'error':'not flow in request'}) 11 | flow = request.json['flow'] 12 | logit =fsnet_model.logit_online(flow).tolist() 13 | return jsonify({'logit':logit}) 14 | 15 | if __name__ == '__main__': 16 | app.run(host='192.168.255.82',port=10086,debug=True) -------------------------------------------------------------------------------- /models/dl/graphDapp/logger_wrappers.py: -------------------------------------------------------------------------------- 1 | __author__ = 'dk' 2 | import logging 3 | import datetime 4 | logger=logging.Logger('graph neural network') 5 | logger.setLevel(logging.NOTSET) 6 | _WARNING = 10 7 | _INFO = 100 8 | _ERROR= 0 9 | level = _INFO 10 | def warning(msg): 11 | if level < _WARNING : 12 | return 13 | logger.warning(msg="Time:{0}, [WARN]: {1}".format(datetime.datetime.now(),msg)) 14 | 15 | def info(msg): 16 | if level < _INFO : 17 | return 18 | logger.warning(msg="Time:{0}, [INFO]: {1}".format(datetime.datetime.now(),msg)) 19 | 20 | def error(msg): 21 | logger.warning(msg="Time:{0}, [ERROR]: {1}".format(datetime.datetime.now(),msg)) -------------------------------------------------------------------------------- /models/ml/cumul/model_predict.py: -------------------------------------------------------------------------------- 1 | __author__ = 'jmh081701' 2 | import lightgbm as lgb 3 | from src.cumul.util import CUMUL_datagenerator 4 | from sklearn.metrics import accuracy_score 5 | import numpy as np 6 | saved_model = "./saved_model/cumul.model" 7 | model = lgb.Booster(model_file=saved_model) 8 | 9 | dator = CUMUL_datagenerator(is_train=True) 10 | def prediction(X): 11 | #X = dator.feature_extract(X) 12 | logit = model.predict(data=X) 13 | y = list(map(lambda x : np.argmax(x),logit)) 14 | #assert len(y.shape) == X.shape[0] 15 | 16 | return y 17 | 18 | if __name__ == '__main__': 19 | dator = CUMUL_datagenerator(is_train=True) 20 | test_X,test_y = dator.testSet() 21 | predict_y = prediction(test_X) 22 | accuracy = accuracy_score(test_y,predict_y) 23 | print('test accuracy:{0}'.format(accuracy)) 24 | 25 | -------------------------------------------------------------------------------- /models/dl/select_gpu.py: -------------------------------------------------------------------------------- 1 | __author__ = 'jmh081701' 2 | import os 3 | import sys 4 | import numpy as np 5 | cmd = "nvidia-smi --query-gpu=index,memory.free --format=csv" 6 | def get_free_gpu_id(): 7 | pipe =os.popen(cmd) 8 | freeMbs =[] 9 | for eachline in pipe: 10 | if 'index' in eachline: 11 | continue 12 | id,freeMb=eachline.split(',') 13 | freeMb = int(freeMb.replace("MiB","")) 14 | freeMbs.append(freeMb) 15 | 16 | return str(np.argmax(freeMbs)) 17 | def set_visible_gpu(): 18 | ''' 19 | 选择当前GPU列表里面,空余内存最大的显卡。 20 | windows平台不做任何选择 21 | ''' 22 | if sys.platform=='linux': 23 | os.environ["CUDA_VISIBLE_DEVICES"] = 
get_free_gpu_id() 24 | else: 25 | os.environ['CUDA_VISBALE_DEIVCES'] ='0' 26 | if __name__ == '__main__': 27 | 28 | print(get_free_gpu_id()) -------------------------------------------------------------------------------- /models/ml/rdp/feature_extractor.py: -------------------------------------------------------------------------------- 1 | __author__ = 'dk' 2 | from models.ml.rdp.rdp_config import hyper_params 3 | from models.ml.rdp.statistic_tractor import peak_feature 4 | 5 | def feature_extract(pkt_size, timestamps, time_threshold= hyper_params['time_threshold'] ): 6 | assert len(pkt_size) == len(timestamps) 7 | timestamps = [0.0] + [timestamps[i]- timestamps[i-1] for i in range(1, len(timestamps)) ] 8 | total_peak = [(each[0], each[1]) for each in zip(timestamps, pkt_size)] 9 | peaks = [[]] 10 | for i in range(len(total_peak)): 11 | if total_peak[i][0] <= time_threshold : 12 | peaks[-1].append(total_peak[i]) 13 | else: 14 | peaks.append([total_peak[i]]) 15 | 16 | features = [] 17 | for peak in peaks: 18 | feature = peak_feature(peak) 19 | features.append(feature) 20 | 21 | return features[1:] -------------------------------------------------------------------------------- /models/dl/accuracy_per_class.py: -------------------------------------------------------------------------------- 1 | __author__ = 'dk' 2 | #计算每个类别的准确率 3 | import json 4 | from sklearn import metrics 5 | from sklearn.metrics import classification_report 6 | def accuracy_per_class(y_real,y_pred): 7 | right={} 8 | error={} 9 | for i in range(len(y_real)): 10 | if y_real[i] not in right: 11 | right.setdefault(y_real[i],0) 12 | if y_real[i] not in error: 13 | error.setdefault(y_real[i],0) 14 | if y_real[i]==y_pred[i]: 15 | right[y_real[i]] += 1 16 | else: 17 | error[y_real[i]] += 1 18 | acc={} 19 | for each in right: 20 | acc.setdefault(each,right[each]/(right[each]+error[each])) 21 | print('Accuracy of each class:') 22 | print(acc) 23 | #for i in range(len(right)): 24 | # print("%0.2d\t%0.4f"%(i,acc[i]*100 if i in acc else 100)) 25 | 26 | #计算各种率 27 | print(classification_report(y_true=y_real,y_pred=y_pred,digits=5)) -------------------------------------------------------------------------------- /models/dl/df/df_services.py: -------------------------------------------------------------------------------- 1 | import flask 2 | from df_main_model import model 3 | from flask import Flask, request, jsonify 4 | import requests 5 | 6 | app = Flask(__name__) 7 | df_model = model('datacon',128,0.1) 8 | 9 | _labels = ['0.json','1.json','2.json','3.json','4.json','5.json','6.json','7.json','8.json','9.json','10.json'] 10 | _labels.sort() 11 | 12 | @app.route(rule= '/datacon', methods=['POST']) 13 | def tunnel(): 14 | try: 15 | request_data = request.json 16 | label = df_model.predict(request_data['packet_length']) 17 | label = [_labels[_id].replace('.json','') for _id in label] 18 | return jsonify({'status':'success', 'label': label}) 19 | 20 | except BaseException as exp: 21 | #raise exp 22 | return jsonify({'status':'error', 'data': str(exp)}) 23 | 24 | if __name__ == '__main__': 25 | app.run(host="0.0.0.0", 26 | port=8899, 27 | threaded=True) 28 | -------------------------------------------------------------------------------- /models/dl/df_only_D/df_services.py: -------------------------------------------------------------------------------- 1 | import flask 2 | from df_main_model import model 3 | from flask import Flask, request, jsonify 4 | import requests 5 | 6 | app = Flask(__name__) 7 | df_model = 
model('datacon',128,0.1) 8 | 9 | _labels = ['0.json','1.json','2.json','3.json','4.json','5.json','6.json','7.json','8.json','9.json','10.json'] 10 | _labels.sort() 11 | 12 | @app.route(rule= '/datacon', methods=['POST']) 13 | def tunnel(): 14 | try: 15 | request_data = request.json 16 | label = df_model.predict(request_data['packet_length']) 17 | label = [_labels[_id].replace('.json','') for _id in label] 18 | return jsonify({'status':'success', 'label': label}) 19 | 20 | except BaseException as exp: 21 | #raise exp 22 | return jsonify({'status':'error', 'data': str(exp)}) 23 | 24 | if __name__ == '__main__': 25 | app.run(host="0.0.0.0", 26 | port=8899, 27 | threaded=True) 28 | -------------------------------------------------------------------------------- /models/dl/df/df_services_test.py: -------------------------------------------------------------------------------- 1 | import flask 2 | from df_main_model import model 3 | from flask import Flask, request, jsonify 4 | import requests 5 | 6 | app = Flask(__name__) 7 | df_model = model('datacon_training',128,0.1) 8 | 9 | _labels = ['0.json','1.json','2.json','3.json','4.json','5.json','6.json','7.json','8.json','9.json','10.json'] 10 | _labels.sort() 11 | 12 | @app.route(rule= '/datacon', methods=['POST']) 13 | def tunnel(): 14 | try: 15 | request_data = request.json 16 | label = df_model.predict(request_data['packet_length']) 17 | label = [_labels[_id].replace('.json','') for _id in label] 18 | return jsonify({'status':'success', 'label': label}) 19 | 20 | except BaseException as exp: 21 | #raise exp 22 | return jsonify({'status':'error', 'data': str(exp)}) 23 | 24 | if __name__ == '__main__': 25 | app.run(host="0.0.0.0", 26 | port=8898, 27 | threaded=True) 28 | -------------------------------------------------------------------------------- /models/dl/df_only_D/df_services_test.py: -------------------------------------------------------------------------------- 1 | import flask 2 | from df_main_model import model 3 | from flask import Flask, request, jsonify 4 | import requests 5 | 6 | app = Flask(__name__) 7 | df_model = model('datacon_training',128,0.1) 8 | 9 | _labels = ['0.json','1.json','2.json','3.json','4.json','5.json','6.json','7.json','8.json','9.json','10.json'] 10 | _labels.sort() 11 | 12 | @app.route(rule= '/datacon', methods=['POST']) 13 | def tunnel(): 14 | try: 15 | request_data = request.json 16 | label = df_model.predict(request_data['packet_length']) 17 | label = [_labels[_id].replace('.json','') for _id in label] 18 | return jsonify({'status':'success', 'label': label}) 19 | 20 | except BaseException as exp: 21 | #raise exp 22 | return jsonify({'status':'error', 'data': str(exp)}) 23 | 24 | if __name__ == '__main__': 25 | app.run(host="0.0.0.0", 26 | port=8898, 27 | threaded=True) 28 | -------------------------------------------------------------------------------- /models/dl/lstm/lstm_model_config.py: -------------------------------------------------------------------------------- 1 | 2 | __author__ = 'dk' 3 | ###这个模型来自于:Automated website fingerprinting through deep learning (Vera Rimmer et.al ) 4 | #学习的参数 5 | nb_classes_template = 100 #分类的目标类别数目,指网站的数目【此处需要修改】 6 | learn_params_template={ 7 | "nb_epochs": 50, 8 | "maxlen": 40, #向量最大长度,最大包长序列长度【此处需要修改】 9 | "nb_features": 1, #这个是每个向量的每个分量的维度,类似于embed后的长度,默认就是+1,-1的序列,所以长度为1。 10 | "batch_size": 256, 11 | "val_split": 0.15, 12 | "test_split": 0.15, 13 | "optimizer": "rmsprop", 14 | "nb_layers": 2, 15 | "layers": [ 16 | { 17 | "units": 128, 18 | "dropout": 
0.22244615886559121, 19 | "activation": "tanh", 20 | "rec_activation": "hard_sigmoid" 21 | }, 22 | { 23 | "units": 128, 24 | "dropout": 0.20857652372682717, 25 | "activation": "tanh", 26 | "rec_activation": "hard_sigmoid" 27 | } 28 | ], 29 | "lr": 0.0010053829131721616, 30 | "decay": 0, 31 | "momentum": 0.9, 32 | "nesterov": True 33 | } 34 | -------------------------------------------------------------------------------- /models/dl/mimetic/logger_wrappers.py: -------------------------------------------------------------------------------- 1 | __author__ = 'dk' 2 | import logging 3 | import datetime 4 | logger_name = 'appnet' 5 | scirpt_name = '' 6 | logger=logging.Logger(logger_name) 7 | logger.setLevel(logging.NOTSET) 8 | _WARNING = 10 9 | _INFO = 100 10 | _ERROR= 0 11 | level = _INFO 12 | def warning(msg): 13 | if level < _WARNING : 14 | return 15 | msg = "Time:{0}, [{2}-WARN]: {1}".format(datetime.datetime.now(),msg, scirpt_name) 16 | logger.warning(msg) 17 | with open(logger_name+'.log','a') as fp: 18 | fp.writelines(msg+'\n') 19 | 20 | def info(msg): 21 | if level < _INFO : 22 | return 23 | msg="Time:{0}, [{2}-INFO]: {1}".format(datetime.datetime.now(),msg, scirpt_name) 24 | logger.warning(msg) 25 | with open(logger_name+'.log','a') as fp: 26 | fp.writelines(msg+'\n') 27 | def error(msg): 28 | msg ="Time:{0}, [{2}-ERROR]: {1}".format(datetime.datetime.now(),msg, scirpt_name) 29 | logger.warning(msg) 30 | with open(logger_name+'.log','a') as fp: 31 | fp.writelines(msg+'\n') 32 | -------------------------------------------------------------------------------- /models/dl/appnet/logger_wrappers.py: -------------------------------------------------------------------------------- 1 | __author__ = 'dk' 2 | import logging 3 | import datetime 4 | logger_name = 'unsupervised_adaption_for_traffic_classification' 5 | scirpt_name = '' 6 | logger=logging.Logger(logger_name) 7 | logger.setLevel(logging.NOTSET) 8 | _WARNING = 10 9 | _INFO = 100 10 | _ERROR= 0 11 | level = _INFO 12 | def warning(msg): 13 | if level < _WARNING : 14 | return 15 | msg = "Time:{0}, [{2}-WARN]: {1}".format(datetime.datetime.now(),msg, scirpt_name) 16 | logger.warning(msg) 17 | with open(logger_name+'.log','a') as fp: 18 | fp.writelines(msg+'\n') 19 | 20 | def info(msg): 21 | if level < _INFO : 22 | return 23 | msg="Time:{0}, [{2}-INFO]: {1}".format(datetime.datetime.now(),msg, scirpt_name) 24 | logger.warning(msg) 25 | with open(logger_name+'.log','a') as fp: 26 | fp.writelines(msg+'\n') 27 | def error(msg): 28 | msg ="Time:{0}, [{2}-ERROR]: {1}".format(datetime.datetime.now(),msg, scirpt_name) 29 | logger.warning(msg) 30 | with open(logger_name+'.log','a') as fp: 31 | fp.writelines(msg+'\n') 32 | -------------------------------------------------------------------------------- /models/dl/appnet/model_seriealization.py: -------------------------------------------------------------------------------- 1 | __author__ = 'dk' 2 | #模型的保存和加载 3 | import torch 4 | import json 5 | import logger_wrappers 6 | import os 7 | 8 | def save(model,model_path): 9 | checkpoint_path = model_path 10 | path = model_path 11 | if os.path.exists(os.path.dirname(checkpoint_path)) == False: 12 | logger_wrappers.warning('create checkpoint path: {0}'.format(os.path.dirname(checkpoint_path))) 13 | os.makedirs(os.path.dirname(checkpoint_path), exist_ok= True) 14 | torch.save(model.state_dict(),path) 15 | info = "Dump model to {0} well.".format(checkpoint_path) 16 | logger_wrappers.warning(info) 17 | 18 | def load(model,model_path, use_gpu=True, 
device=None): 19 | #print(device) 20 | path = model_path 21 | if os.path.exists(path): 22 | if use_gpu == False: 23 | map_location= torch.device('cpu') 24 | else: 25 | map_location = lambda storage, loc: storage.cuda(int(device.split(":")[-1])) 26 | model_CKPT = torch.load(path, map_location=map_location) 27 | model.load_state_dict(model_CKPT) 28 | info ="Load model from {0} well.".format(path) 29 | logger_wrappers.warning(info) 30 | else: 31 | logger_wrappers.warning('Load empty model from {0}.'.format(path)) 32 | return model#,optimizer 33 | -------------------------------------------------------------------------------- /models/dl/mimetic/model_seriealization.py: -------------------------------------------------------------------------------- 1 | __author__ = 'dk' 2 | #模型的保存和加载 3 | import torch 4 | import json 5 | import logger_wrappers 6 | import os 7 | 8 | def save(model,model_path): 9 | checkpoint_path = model_path 10 | path = model_path 11 | if os.path.exists(os.path.dirname(checkpoint_path)) == False: 12 | logger_wrappers.warning('create checkpoint path: {0}'.format(os.path.dirname(checkpoint_path))) 13 | os.makedirs(os.path.dirname(checkpoint_path), exist_ok= True) 14 | torch.save(model.state_dict(),path) 15 | info = "Dump model to {0} well.".format(checkpoint_path) 16 | logger_wrappers.warning(info) 17 | 18 | def load(model,model_path, use_gpu=True, device=None): 19 | #print(device) 20 | path = model_path 21 | if os.path.exists(path): 22 | if use_gpu == False: 23 | map_location= torch.device('cpu') 24 | else: 25 | map_location = lambda storage, loc: storage.cuda(int(device.split(":")[-1])) 26 | model_CKPT = torch.load(path, map_location=map_location) 27 | model.load_state_dict(model_CKPT) 28 | info ="Load model from {0} well.".format(path) 29 | logger_wrappers.warning(info) 30 | else: 31 | logger_wrappers.warning('Load empty model from {0}.'.format(path)) 32 | return model#,optimizer 33 | -------------------------------------------------------------------------------- /models/ml/bind/run.py: -------------------------------------------------------------------------------- 1 | __author__ = 'dk' 2 | from BIND.build_vector_dataset import builder 3 | import lightgbm as lgb 4 | from sklearn.metrics import accuracy_score 5 | import numpy as np 6 | ##原始的包长序列 7 | bd = builder(raw_feature_dictory='./raw_feature/',global_feature_dict_filename="./raw_feature/global_feature_dict.vocb") 8 | X_train,y_train,X_test,y_test,X_valid,y_valid=bd.vectorize() 9 | saved_model = "bind.model" 10 | ##开始训练 11 | 12 | lgb_train = lgb.Dataset(data=X_train,label=y_train) 13 | lgb_eval = lgb.Dataset(data=X_valid,label=y_valid) 14 | hyper_params = { 15 | 'boosting_type': 'rf', 16 | 'objective': 'multiclass', 17 | 'num_leaves': 512, 18 | 'learning_rate': 0.05, 19 | 'feature_fraction': 0.9, 20 | 'bagging_fraction': 0.8, 21 | 'bagging_freq': 5, 22 | 'verbose': 0, 23 | 'num_class':55, 24 | 'lambda_l1':0.05, 25 | 'lambda_l2':0.15 26 | } 27 | 28 | gbm = lgb.train(params=hyper_params, 29 | train_set=lgb_train, 30 | valid_sets=lgb_eval, 31 | num_boost_round=100, 32 | early_stopping_rounds=5) 33 | #save model 34 | try: 35 | gbm.save_model(saved_model) 36 | except BaseException as exp: 37 | pass 38 | logit = gbm.predict(data=X_test) 39 | label_predict = list(map(lambda x : np.argmax(x),logit)) 40 | 41 | accuracy = accuracy_score(y_test,label_predict) 42 | 43 | print(accuracy) -------------------------------------------------------------------------------- /models/ml/bind/eval.py: 
-------------------------------------------------------------------------------- 1 | from BIND.build_vector_dataset import builder 2 | import lightgbm as lgb 3 | from sklearn.metrics import accuracy_score 4 | import numpy as np 5 | from accuracy_per_class import accuracy_per_class 6 | def main(raw_feature_dictory,modelpath,global_feature_dict_filename="./global_feature_dict.vocb"): 7 | ##原始的包长序列 8 | bd = builder(raw_feature_dictory=raw_feature_dictory,global_feature_dict_filename=global_feature_dict_filename) 9 | X_train,y_train,X_test,y_test,X_valid,y_valid=bd.vectorize(test_split_ratio=0.5) 10 | saved_model = modelpath 11 | ##开始训练 12 | 13 | lgb_train = lgb.Dataset(data=X_train,label=y_train) 14 | lgb_eval = lgb.Dataset(data=X_valid,label=y_valid) 15 | hyper_params = { 16 | 'boosting_type': 'rf', 17 | 'objective': 'multiclass', 18 | 'num_leaves': 512, 19 | 'learning_rate': 0.05, 20 | 'feature_fraction': 0.9, 21 | 'bagging_fraction': 0.8, 22 | 'bagging_freq': 5, 23 | 'verbose': 0, 24 | 'num_class':53, 25 | 'lambda_l1':0.05, 26 | 'lambda_l2':0.15 27 | } 28 | gbm = lgb.Booster(model_file=saved_model) 29 | logit = gbm.predict(data=X_test) 30 | label_predict = list(map(lambda x : np.argmax(x),logit)) 31 | 32 | accuracy = accuracy_score(y_test,label_predict) 33 | accuracy_per_class(y_real=y_test,y_pred=label_predict) 34 | 35 | print(accuracy) -------------------------------------------------------------------------------- /models/ml/cumul/model_train.py: -------------------------------------------------------------------------------- 1 | __author__ = 'jmh081701' 2 | import sklearn 3 | from sklearn.externals import joblib 4 | from sklearn.metrics import accuracy_score 5 | import lightgbm as lgb 6 | import numpy as np 7 | from src.cumul.util import CUMUL_datagenerator 8 | dator = CUMUL_datagenerator(is_train=True) 9 | 10 | saved_model = "saved_model/cumul.model" 11 | train_X,train_y = dator.trainSet() 12 | valid_X,valid_y = dator.validSet() 13 | test_X,test_y = dator.testSet() 14 | 15 | lgb_train = lgb.Dataset(data=train_X,label=train_y) 16 | lgb_eval = lgb.Dataset(data=valid_X,label=valid_y) 17 | 18 | hyper_params = { 19 | 'boosting_type': 'rf', 20 | 'objective': 'multiclass', 21 | 'num_leaves': 512, 22 | 'learning_rate': 0.05, 23 | 'feature_fraction': 0.9, 24 | 'bagging_fraction': 0.8, 25 | 'bagging_freq': 5, 26 | 'verbose': 0, 27 | 'num_class':100, 28 | 'lambda_l1':0.05, 29 | 'lambda_l2':0.15 30 | } 31 | 32 | gbm = lgb.train(params=hyper_params, 33 | train_set=lgb_train, 34 | valid_sets=lgb_eval, 35 | num_boost_round=3000, 36 | early_stopping_rounds=10) 37 | 38 | logit = gbm.predict(data=test_X) 39 | label_predict = list(map(lambda x : np.argmax(x),logit)) 40 | 41 | accuracy = accuracy_score(test_y,label_predict) 42 | print(accuracy) 43 | 44 | #save model 45 | gbm.save_model(saved_model) 46 | 47 | 48 | 49 | 50 | 51 | -------------------------------------------------------------------------------- /get_dataset_statistics.py: -------------------------------------------------------------------------------- 1 | __author__ = 'jmh081701' 2 | import numpy as np 3 | import json 4 | import os 5 | 6 | def statistic(lengths): 7 | mean = np.mean(lengths) 8 | min = np.min(lengths) 9 | max = np.max(lengths) 10 | std = np.std(lengths) 11 | median = np.median(lengths) 12 | print('\tmean:{0}, std:{1}, min:{2}, max:{3}, median:{4}'.format(mean, std, min, max, median)) 13 | percent = [10,20,30,40,50,60,70,80,90,95,99] 14 | for each_percent in percent: 15 | print("\t\tP( v<={1}) = {0}".format(each_percent, 
np.percentile(lengths,each_percent))) 16 | def parser_dataset(dataset_dir): 17 | total_length = [] 18 | print(dataset_dir) 19 | for _root, _dirs, _files in os.walk(dataset_dir): 20 | if len(_files) == 0 : 21 | raise ValueError('{0} empty!'.format(dataset_dir)) 22 | 23 | for file in _files: 24 | length = [] 25 | path = _root + '/' + file 26 | with open(path) as fp: 27 | data = json.load(fp) 28 | 29 | for flow in data: 30 | length.append(len(flow['packet_length'])) 31 | total_length.append(length[-1]) 32 | 33 | #print(file) 34 | #print(statistic(length)) 35 | 36 | print('total:') 37 | statistic(total_length) 38 | 39 | if __name__ == '__main__': 40 | dataset_dir = 'dataset/tifs2015' 41 | parser_dataset(dataset_dir) -------------------------------------------------------------------------------- /models/dl/graphDapp/model_seriealization.py: -------------------------------------------------------------------------------- 1 | __author__ = 'dk' 2 | #模型的保存和加载 3 | import torch 4 | import json 5 | import logger_wrappers 6 | import os 7 | model_name = "/gnn_model.pkl" 8 | def save(model,optimizer,checkpoint_path): 9 | path = checkpoint_path + model_name 10 | if os.path.exists(checkpoint_path) == False: 11 | os.makedirs(checkpoint_path) 12 | torch.save(model.state_dict(),path) 13 | #torch.save( 14 | # {'state_dict':model.state_dict(), 15 | # 'optimizer':optimizer.state_dict()}, 16 | # (checkpoint_path+model_name).replace("//","/") 17 | #) 18 | info = "Dump model to {0} well.".format(checkpoint_path) 19 | logger_wrappers.warning(info) 20 | 21 | def load(model,optimizer,checkpoint_path, use_gpu=True): 22 | path = (checkpoint_path+model_name).replace("//","/") 23 | if os.path.exists(path): 24 | if use_gpu == False: 25 | map_location= torch.device('cpu') 26 | else: 27 | map_location = None 28 | model_CKPT = torch.load(path, map_location=map_location) 29 | model.load_state_dict(model_CKPT) 30 | #model.load_state_dict(model_CKPT['state_dict']) 31 | #optimizer.load_state_dict(model_CKPT['optimizer']) 32 | info ="Load model from {0} well.".format(path) 33 | logger_wrappers.warning(info) 34 | else: 35 | logger_wrappers.warning('Load empty model from {0}.'.format(path)) 36 | return model#,optimizer 37 | -------------------------------------------------------------------------------- /models/ml/cumul/attack_cumul.py: -------------------------------------------------------------------------------- 1 | __author__ = 'jmh081701' 2 | import lightgbm as lgb 3 | from src.cumul.util import CUMUL_datagenerator 4 | from sklearn.metrics import accuracy_score 5 | from src.df.src import utility 6 | import numpy as np 7 | saved_model = "./saved_model/cumul.model" 8 | model = lgb.Booster(model_file=saved_model) 9 | dator = CUMUL_datagenerator(is_train=False) 10 | def prediction(X): 11 | print(X.shape) 12 | X = dator.feature_extract(X) 13 | logit = model.predict(data=X) 14 | y = list(map(lambda x : np.argmax(x),logit)) 15 | #assert len(y.shape) == X.shape[0] 16 | 17 | return y 18 | def flatten(X_compressed): 19 | X =[] 20 | for i in range(X_compressed.shape[0]): 21 | x =[] 22 | for j in range(X_compressed.shape[1]): 23 | if (X_compressed[i,j])<0: 24 | x += [-1] * abs(int(X_compressed[i,j])) 25 | elif X_compressed[i,j] >0 : 26 | x +=[1] * abs(int(X_compressed[i,j])) 27 | x+=[0] * 5000 28 | X.append(x[:5000]) 29 | return np.array(X) 30 | if __name__ == '__main__': 31 | preprocess = CUMUL_datagenerator(is_train=False) 32 | X_train, y_train, X_valid, y_valid, X_test, y_test = 
utility.LoadDataRetrain(is_cluster=False,dataset_dir=None) 33 | predict_y = prediction(flatten(X_train)) 34 | accuracy = accuracy_score(y_train,predict_y) 35 | print('test accuracy:{0}'.format(accuracy)) 36 | -------------------------------------------------------------------------------- /models/ml/bind/train.py: -------------------------------------------------------------------------------- 1 | __author__ = 'dk' 2 | from BIND.build_vector_dataset import builder 3 | import lightgbm as lgb 4 | from sklearn.metrics import accuracy_score 5 | import numpy as np 6 | def main(raw_feature_dictory,modelpath,global_feature_dict_filename="./global_feature_dict.vocb"): 7 | ##原始的包长序列 8 | bd = builder(raw_feature_dictory=raw_feature_dictory,global_feature_dict_filename=global_feature_dict_filename) 9 | X_train,y_train,X_test,y_test,X_valid,y_valid=bd.vectorize() 10 | saved_model = modelpath 11 | ##开始训练 12 | 13 | lgb_train = lgb.Dataset(data=X_train,label=y_train) 14 | lgb_eval = lgb.Dataset(data=X_valid,label=y_valid) 15 | hyper_params = { 16 | 'boosting_type': 'rf', 17 | 'objective': 'multiclass', 18 | 'num_leaves': 512, 19 | 'learning_rate': 0.05, 20 | 'feature_fraction': 0.9, 21 | 'bagging_fraction': 0.8, 22 | 'bagging_freq': 5, 23 | 'verbose': 0, 24 | 'num_class':53, 25 | 'lambda_l1':0.05, 26 | 'lambda_l2':0.15 27 | } 28 | 29 | gbm = lgb.train(params=hyper_params, 30 | train_set=lgb_train, 31 | valid_sets=lgb_eval, 32 | num_boost_round=100, 33 | early_stopping_rounds=5) 34 | #save model 35 | try: 36 | gbm.save_model(saved_model) 37 | except BaseException as exp: 38 | pass 39 | logit = gbm.predict(data=X_test) 40 | label_predict = list(map(lambda x : np.argmax(x),logit)) 41 | 42 | accuracy = accuracy_score(y_test,label_predict) 43 | print(accuracy) 44 | if __name__ == '__main__': 45 | main("./raw_feature/",global_feature_dict_filename="./global_feature_dict.vocb") 46 | -------------------------------------------------------------------------------- /models/ml/appscanner/min_max.py: -------------------------------------------------------------------------------- 1 | __author__ = 'dk' 2 | ##_min和_max必须得是train.py输出的那样 3 | _min=[-1.44800000e+03,-1.44800000e+03,-1.44800000e+03, 0.00000000e+00, 4 | 0.00000000e+00, 0.00000000e+00,-9.74732319e+00,-3.00000000e+00, 5 | -1.44800000e+03,-1.44800000e+03,-1.44800000e+03,-1.44800000e+03, 6 | -1.44800000e+03,-1.44800000e+03,-1.44800000e+03,-1.44800000e+03, 7 | -1.44800000e+03, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 8 | 0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 9 | -2.23682312e+01,-3.00000000e+00, 0.00000000e+00, 0.00000000e+00, 10 | 0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 11 | 0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 12 | -1.44800000e+03,-1.44800000e+03,-1.44800000e+03, 0.00000000e+00, 13 | 0.00000000e+00, 0.00000000e+00,-2.23682312e+01,-3.00000000e+00, 14 | -1.44800000e+03,-1.44800000e+03,-1.44800000e+03,-1.44800000e+03, 15 | -1.44800000e+03,-1.44800000e+03,-1.44800000e+03,-1.44800000e+03, 16 | -1.44800000e+03, 1.00000000e+00] 17 | 18 | _max=[0.00000000e+00,0.00000000e+00,0.00000000e+00,7.23500000e+02, 19 | 7.23500000e+02,5.23452250e+05,3.15449531e+01,9.93387334e+02, 20 | 0.00000000e+00,0.00000000e+00,0.00000000e+00,0.00000000e+00, 21 | 0.00000000e+00,0.00000000e+00,0.00000000e+00,0.00000000e+00, 22 | 0.00000000e+00,1.00000000e+03,1.44800000e+03,1.44800000e+03, 23 | 1.44800000e+03,7.23500000e+02,7.23500000e+02,5.23452250e+05, 24 | 
9.40585157e+00,4.99444233e+02,1.44800000e+03,1.44800000e+03, 25 | 1.44800000e+03,1.44800000e+03,1.44800000e+03,1.44800000e+03, 26 | 1.44800000e+03,1.44800000e+03,1.44800000e+03,1.00000000e+03, 27 | 1.44800000e+03,1.44800000e+03,1.44800000e+03,1.44493827e+03, 28 | 1.32699466e+03,1.76091482e+06,2.61611211e+01,7.35659009e+02, 29 | 1.44800000e+03,1.44800000e+03,1.44800000e+03,1.44800000e+03, 30 | 1.44800000e+03,1.44800000e+03,1.44800000e+03,1.44800000e+03, 31 | 1.44800000e+03,1.00000000e+03] -------------------------------------------------------------------------------- /models/dl/cnn/cnn_model_config.py: -------------------------------------------------------------------------------- 1 | __author__ = 'dk' 2 | ###这个模型来自于:Automated website fingerprinting through deep learning (Vera Rimmer et.al ) 3 | nb_classes_template = 100 #分类的目标类别数目,指网站的数目【此处需要修改】 4 | learning_params_template={ 5 | "nb_epochs": 15, 6 | "input_length":40, 7 | "maxlen": 40, #向量长度【此处需要修改】 8 | "nb_features": 1, 9 | "batch_size": 256, 10 | "val_split": 0.05, 11 | "test_split": 0.05, 12 | "optimizer": "rmsprop", 13 | "lr": 0.0008, 14 | "decay": 0, 15 | 'momentum':0.9, 16 | "nb_layers": 7, 17 | "layers": [ 18 | { 19 | "name": "conv", 20 | "rate": 0.25, 21 | "filters": 32, 22 | "kernel_size": 5, 23 | "activation": "relu", 24 | "stride": 1 25 | }, 26 | { 27 | "name": "conv", 28 | "pool_size": 4, 29 | "filters": 32, 30 | "kernel_size": 5, 31 | "activation": "relu", 32 | "stride": 1 33 | }, 34 | { 35 | "name": "maxpooling", 36 | "pool_size": 4 37 | }, 38 | { 39 | "name": "lstm", 40 | "units": 128 41 | }, 42 | { 43 | "last": True, 44 | "units": nb_classes_template, #这个就是最后一层的输出神经元个数,必须等于nb_classes 45 | "name": "dense", 46 | "activation": "softmax", 47 | "regularization": 0 48 | } 49 | ] 50 | } 51 | try: 52 | assert nb_classes_template == learning_params_template['layers'][-1]['units'] 53 | except AssertionError as exp: 54 | print("cnn model: The last layer units should be equals to the number of classes.") 55 | print("{0}:{1}".format(__file__,str(exp))) 56 | raise AssertionError(exp) 57 | try: 58 | assert learning_params_template['maxlen']==learning_params_template['input_length'] 59 | except AssertionError as exp: 60 | print("cnn model: The max_len should be equal to input_length, because they are alias name for each other.") 61 | print("{0}:{1}".format(__file__,str(exp))) 62 | raise AssertionError(exp) 63 | -------------------------------------------------------------------------------- /models/dl/sdae/sdae_model_config.py: -------------------------------------------------------------------------------- 1 | __author__ = 'dk' 2 | ###这个模型来自于:Automated website fingerprinting through deep learning (Vera Rimmer et.al ) 3 | #学习的参数 4 | nb_classes_template = 100 #分类的目标类别数目,指网站的数目【此处需要修改】 5 | learning_params_template={ 6 | "nb_epochs" : 30, 7 | "maxlen" : 40, #原始向量的长度【此处需要修改】 8 | "features" : 2, 9 | "batch_size" : 32, 10 | "val_split" : 0.05, 11 | "test_split" : 0.05, 12 | "optimizer" : "sgd", 13 | "nb_layers" : 3, 14 | "lr" : 0.001, 15 | "momentum" : 0.9, 16 | "decay" : 0.0, 17 | "nesterov" : True, 18 | "layers":#各个自编码器层的参数设置 19 | [ 20 | { #第一层的超参数 21 | "in_dim" : 40, #encoder输入向量长度【此处需要修改】 22 | "out_dim" : 700,#decoder输出向量长度 23 | "epochs": 20, 24 | "batch_size": 128, 25 | "dropout" :0.2, #dropout的概率 26 | "optimizer" : "sgd", #本层的优化器,可选性:sgd(随机梯度下降),adam,rmsprop 27 | "enc_activation" : "tanh",#编码器的激活函数 28 | "dec_activation" : "linear",#解码器的激活函数 29 | "lr":0.001, #sgd的优化器参数 30 | "momentum" : 0.9, 31 | "decay" : 0.0 32 | }, 33 | { #第二层超参数 34 | 
"in_dim": 700, 35 | "out_dim": 500, 36 | "epochs": 10, 37 | "batch_size": 128, 38 | "dropout":0.2, 39 | "optimizer":"sgd", 40 | "enc_activation": "tanh", 41 | "dec_activation":"linear", 42 | "lr": 0.001, 43 | "momentum":0.9, 44 | "decay": 0.0 45 | }, 46 | { #第三层超参数 47 | "in_dim" : 500, 48 | "out_dim": 300, 49 | "epochs": 10, 50 | "batch_size": 128, 51 | "dropout": 0.2, 52 | "optimizer": "sgd", 53 | "enc_activation": "tanh", 54 | "dec_activation": "linear", 55 | "lr" : 0.001, 56 | "momentum": 0.9, 57 | "decay" : 0.0 58 | } 59 | ] 60 | } 61 | -------------------------------------------------------------------------------- /models/ml/appscanner/eval.py: -------------------------------------------------------------------------------- 1 | __author__ = 'dk' 2 | from wf_attacks.data_utils import LoadDataApp_crossversion 3 | from wf_attacks.appscanner.feature_extractor import feature_extract 4 | import lightgbm as lgb 5 | from sklearn.metrics import accuracy_score 6 | import numpy as np 7 | from wf_attacks.appscanner.min_max import _min ,_max 8 | from accuracy_per_class import accuracy_per_class 9 | def main(test_set,modelpath): 10 | ##原始的包长序列 11 | global _min,_max 12 | X_train_r, y_train_r, X_valid_r, y_valid_r, X_test_r, y_test_r = LoadDataApp_crossversion(test_set) 13 | saved_model = modelpath #"appscanner.model" 14 | print('before extract feature') 15 | ##提取统计特征 16 | X_train =[] 17 | X_valid =[] 18 | X_test =[] 19 | #print(X_train_r[0]) 20 | #print(X_valid_r[1]) 21 | #print(X_test_r[2]) 22 | for i in range(X_train_r.shape[0]): 23 | X_train.append(feature_extract(X_train_r[i])) 24 | for i in range(X_test_r.shape[0]): 25 | X_test.append(feature_extract(X_test_r[i])) 26 | for i in range(X_valid_r.shape[0]): 27 | X_valid.append(feature_extract(X_valid_r[i])) 28 | print('feature extract well!') 29 | ##归一化操作 30 | 31 | 32 | _min = np.array(_min) 33 | _max =np.array(_max) 34 | 35 | X_train = np.array(X_train) 36 | X_valid = np.array(X_valid) 37 | X_test = np.array(X_test) 38 | _min = np.array(_min) 39 | _max =np.array(_max) 40 | X_train = (X_train-_min)/(_max-_min) 41 | X_valid = (X_valid-_min)/(_max-_min) 42 | X_test = (X_test-_min)/(_max-_min) 43 | print('normalize well!') 44 | print(X_train[0]) 45 | print(X_valid[1]) 46 | print(X_test[2]) 47 | ## 48 | y_test = np.argmax(y_test_r,1) 49 | y_train =np.argmax(y_train_r,1) 50 | y_valid =np.argmax(y_valid_r,1) 51 | print(y_test[0:10]) 52 | #load model 53 | try: 54 | gbm = lgb.Booster(model_file=saved_model) 55 | except BaseException as exp: 56 | raise exp 57 | logit = gbm.predict(data=X_test) 58 | label_predict = list(map(lambda x : np.argmax(x),logit)) 59 | 60 | accuracy = accuracy_score(y_test,label_predict) 61 | accuracy_per_class(y_real=y_test,y_pred=label_predict) 62 | 63 | print("[Appscanner] Test on {0}, accuracy is {1}. 
".format(test_set,accuracy)) 64 | 65 | -------------------------------------------------------------------------------- /models/dl/graphDapp/graphDapp_main_model.py: -------------------------------------------------------------------------------- 1 | __author__ = 'dk' 2 | 3 | from models.model_base import abs_model 4 | from config import raw_dataset_base 5 | from models.dl.graphDapp.data_builder import Dataset 6 | from models.dl.graphDapp.train import main as train_main 7 | from models.dl.graphDapp.test import main as test_main 8 | 9 | import os 10 | class model(abs_model): 11 | def __init__(self, dataset, randseed, splitrate ,max_len=200): 12 | super(model,self).__init__('graphDapp',randseed= randseed) 13 | if os.path.exists(self.database) == False: 14 | os.makedirs(self.database,exist_ok=True) 15 | 16 | self.dataset = dataset 17 | self.model = self.database + '/'+ self.name + '_' + dataset + '_model' 18 | self.data = self.database + '/'+ self.name + '_' + dataset + '/' 19 | self.splitrate = splitrate 20 | #原始数据集目录 21 | full_rdata = raw_dataset_base + self.dataset 22 | self.full_rdata = full_rdata 23 | self.max_len = max_len 24 | if self.data_exists() == False: 25 | self.parser_raw_data() 26 | def parser_raw_data(self): 27 | def pad_sequence(x, max_len= self.max_len, pad_value=0): 28 | r = x + [pad_value] * (max_len - len(x)) 29 | return r[:max_len] 30 | full_rdata = self.full_rdata 31 | if os.path.exists(full_rdata) == False: 32 | raise OSError('Dataset {0} (full path: {1}) does not exist!'.format(self.dataset,full_rdata)) 33 | os.makedirs(self.data, exist_ok=True) 34 | ##从原始数据集构建graphDApp所需数据 35 | dator = Dataset(raw_dir=full_rdata, 36 | dumpfile=self.data + '{0}.gzip'.format(self.dataset), 37 | split_rate=self.splitrate, 38 | renew= True) 39 | dator.save_dumpfile() 40 | 41 | def train(self): 42 | train_main(dataset_name=self.data + '{0}.gzip'.format(self.dataset), modelpath= self.model) 43 | 44 | def test(self): 45 | test_main(dataset_name=self.data + '{0}.gzip'.format(self.dataset), modelpath= self.model) 46 | 47 | if __name__ == '__main__': 48 | graphdapp_model = model('awf200_burst', randseed= 128, splitrate=0.1) 49 | graphdapp_model.parser_raw_data() 50 | graphdapp_model.train() 51 | graphdapp_model.test() 52 | -------------------------------------------------------------------------------- /models/dl/beauty/cnn_model_config.py: -------------------------------------------------------------------------------- 1 | __author__ = 'dk' 2 | ''' 3 | ###这个模型来自于:@inproceedings{schuster2017beauty, 4 | title={Beauty and the burst: Remote identification of encrypted video streams}, 5 | author={Schuster, Roei and Shmatikov, Vitaly and Tromer, Eran}, 6 | booktitle={26th $\{$USENIX$\}$ Security Symposium ($\{$USENIX$\}$ Security 17)}, 7 | pages={1357--1374}, 8 | year={2017} 9 | } 10 | ''' 11 | learning_params_template={ 12 | "epoch": 200, 13 | "input_length":200, 14 | "maxlen": 200, #向量长度【此处需要修改】 15 | "nb_features": 1, 16 | "batch_size": 256, 17 | "val_split": 0.05, 18 | "test_split": 0.05, 19 | "optimizer": "adam", 20 | "lr": 0.0008, 21 | "decay": 0, 22 | 'momentum':0.9, 23 | "nb_layers": 7, 24 | "layers": [ 25 | { 26 | "name": "conv", 27 | "filters": 32, 28 | "kernel_size": 5, 29 | "activation": "relu", 30 | "stride": 1 31 | }, 32 | { 33 | "name": "conv", 34 | "filters": 32, 35 | "kernel_size": 5, 36 | "activation": "relu", 37 | "stride": 1 38 | }, 39 | { 40 | "name": "conv", 41 | "filters": 32, 42 | "kernel_size": 5, 43 | "activation": "relu", 44 | "stride": 1, 45 | }, 46 | { 47 | 
'name':'dropout', 48 | 'rate': 0.5 49 | }, 50 | { 51 | "name": "maxpooling", 52 | "pool_size": 4 53 | }, 54 | { 55 | 'name':'dropout', 56 | 'rate': 0.3 57 | }, 58 | { 59 | "units": 64, #这个就是最后一层的输出神经元个数,必须等于nb_classes 60 | "name": "dense", 61 | "activation": "relu", 62 | "regularization": 0.0 63 | }, 64 | { 65 | 'name':'dropout', 66 | 'rate': 0.5 67 | }, 68 | { 69 | 'name':'flatten' 70 | }, 71 | { 72 | "last": True, 73 | "units": None, #这个就是最后一层的输出神经元个数,必须等于nb_classes 74 | "name": "dense", 75 | "activation": "softmax", 76 | "regularization": 0.0 77 | } 78 | ] 79 | } 80 | try: 81 | assert learning_params_template['maxlen']==learning_params_template['input_length'] 82 | except AssertionError as exp: 83 | print("cnn model: The max_len should be equal to input_length, because they are alias name for each other.") 84 | print("{0}:{1}".format(__file__,str(exp))) 85 | raise AssertionError(exp) 86 | -------------------------------------------------------------------------------- /models/ml/appscanner/feature_extractor.py: -------------------------------------------------------------------------------- 1 | __author__ = 'dk' 2 | #appscanner使用的特征提取方法,提取得到54维统计特征 3 | import numpy as np 4 | from scipy.stats import skew,kurtosis 5 | _min=[1e9+1] * 54 6 | _max=[0.0] * 54 7 | def feature_trace(trace): 8 | feature = [0.0] * 18 9 | if len(trace)==0: 10 | return feature 11 | feature[0] = np.min(trace) 12 | feature[1] = np.max(trace) 13 | feature[2] = np.mean(trace) 14 | feature[3] = np.median(np.absolute(trace-np.mean(trace))) 15 | feature[4] = np.std(trace) 16 | feature[5] = np.var(trace) 17 | feature[6] = skew(trace) 18 | feature[7] = kurtosis(trace) 19 | ##百分位数 20 | p=[10,20,30,40,50,60,70,80,90] 21 | percentile =np.percentile(trace,p) 22 | for i in range(9): 23 | feature[8+i] = percentile[i] 24 | feature[17]= len(trace) 25 | return feature 26 | 27 | def feature_extract(pkt_length_sequence): 28 | ingoing_trace =[] 29 | outgoing_trace =[] 30 | trace =[] 31 | pkt_length_sequence = np.array(pkt_length_sequence) 32 | pkt_length_sequence = pkt_length_sequence.reshape((-1)) 33 | for i in range(pkt_length_sequence.shape[0]): 34 | if pkt_length_sequence[i] < 0 : 35 | ingoing_trace.append(pkt_length_sequence[i]) 36 | if pkt_length_sequence[i] > 0 : 37 | outgoing_trace.append(pkt_length_sequence[i]) 38 | if pkt_length_sequence[i]!=0: 39 | trace.append(pkt_length_sequence[i]) 40 | if pkt_length_sequence[i]==0: 41 | break 42 | 43 | in_feature = feature_trace(ingoing_trace) 44 | out_feature = feature_trace(outgoing_trace) 45 | bi_feature = feature_trace(trace) 46 | 47 | feature = in_feature+out_feature+bi_feature 48 | for i in range(54): 49 | if feature[i] > _max[i] : 50 | _max[i] = feature[i] 51 | if feature[i] < _min[i]: 52 | _min[i] = feature[i] 53 | return feature 54 | def normalize(feature,min=None,max=None): 55 | if type(min) == type(None): 56 | min = _min 57 | if type(max) == type(None): 58 | max = _max 59 | return (feature-min)/(max-min) 60 | 61 | if __name__ == '__main__': 62 | pkt_length_seq =[383, -290, 90, -165, 1448, 463, 929, 389, 1448, 976, 1448, 1448, 1448, 1448, 1448, 1448, 1448, 1448, 1448, 1448, 1448, 1448, 1448, 717, 105, 1448, 1448, 1448, 1448, 1051, 144, 219, 196, 603, 113] 63 | x=feature_extract(pkt_length_seq) 64 | print(x) 65 | print(len(x)) 66 | -------------------------------------------------------------------------------- /models/dl/df_eval.py: -------------------------------------------------------------------------------- 1 | __author__ = 'dk' 2 | import os 3 | #设置Tensorflow的日志等级 4 | 
#os.environ["TF_CPP_MIN_LOG_LEVEL"]='1' # 这是默认的显示等级,显示所有信息 5 | #os.environ["TF_CPP_MIN_LOG_LEVEL"]='2' # 只显示 warning 和 Error 6 | os.environ["TF_CPP_MIN_LOG_LEVEL"]='3' # 只显示 Error 7 | 8 | from attacks import CNN_model,DF_model,SDAE_model,LSTM_model 9 | from df import df_model_config 10 | 11 | from cnn import cnn_model_config 12 | from sdae import sdae_model_config 13 | from lstm import lstm_model_config 14 | 15 | from data_utils import LoadDataNoDefCW100,LoadDataNoDefCW,LoadDataWalkieTalkieCW,LoadDataApp_crossversion 16 | 17 | #使用步骤 18 | #1. 修改各个模型的超参数,xx_model_config.py,把里面的输入向量和标签数改成自己所需要的 19 | #2. 读取数据,构造好训练集,验证集,测试集 20 | #3. build_model() 21 | #4. 调用fit() 22 | #5. 测试一下 23 | #6. 保存模型 24 | def test_cnn(X_train,y_train,X_valid,y_valid,X_test,y_test): 25 | model = CNN_model() 26 | model.build_model() 27 | model.fit(X_train=X_train,y_train=y_train,X_valid=X_valid,y_valid=y_valid, 28 | batch_size=cnn_model_config.learning_params_template['batch_size'], 29 | epochs=cnn_model_config.learning_params_template['nb_epochs']) 30 | model.save_model() 31 | score = model.evaluate(X_test=X_test,y_test=y_test) 32 | print('simple CNN accuracy :{0}'.format(score)) 33 | 34 | def test_df(X_train,y_train,X_valid,y_valid,X_test,y_test): 35 | 36 | model =DF_model() 37 | model.load_model() 38 | score = model.evaluate(X_test=X_test,y_test=y_test) 39 | print('Deep Fingerprinting Test on test dataset accuracy :{0}'.format(score)) 40 | 41 | def test_lstm(X_train,y_train,X_valid,y_valid,X_test,y_test): 42 | model = LSTM_model() 43 | def test_sdae(X_train,y_train,X_valid,y_valid,X_test,y_test): 44 | model = SDAE_model() 45 | model.build_model() 46 | model.pre_train(x_train=X_train,x_test=X_test) 47 | 48 | model.fit(X_train=X_train,y_train=y_train,X_valid=X_valid,y_valid=y_valid, 49 | batch_size=sdae_model_config.learning_params_template['batch_size'], 50 | epochs=sdae_model_config.learning_params_template['nb_epochs']) 51 | model.save_model() 52 | score = model.evaluate(X_test=X_test,y_test=y_test) 53 | print('sdae accuracy :{0}'.format(score)) 54 | if __name__ == '__main__': 55 | X_train, y_train, X_valid, y_valid, X_test, y_test = LoadDataApp_crossversion("/home3/jmh/app_dataset_noise/") 56 | test_df(X_train=X_train,y_train=y_train,X_valid=X_valid,y_valid=y_valid,X_test=X_test,y_test=y_test) 57 | -------------------------------------------------------------------------------- /models/dl/sdae_eval.py: -------------------------------------------------------------------------------- 1 | __author__ = 'dk' 2 | import os 3 | #设置Tensorflow的日志等级 4 | #os.environ["TF_CPP_MIN_LOG_LEVEL"]='1' # 这是默认的显示等级,显示所有信息 5 | #os.environ["TF_CPP_MIN_LOG_LEVEL"]='2' # 只显示 warning 和 Error 6 | os.environ["TF_CPP_MIN_LOG_LEVEL"]='3' # 只显示 Error 7 | 8 | from attacks import CNN_model,DF_model,SDAE_model,LSTM_model 9 | from df import df_model_config 10 | 11 | from cnn import cnn_model_config 12 | from sdae import sdae_model_config 13 | from lstm import lstm_model_config 14 | 15 | from data_utils import LoadDataNoDefCW100,LoadDataNoDefCW,LoadDataWalkieTalkieCW,LoadDataApp,LoadDataApp_crossversion 16 | 17 | #使用步骤 18 | #1. 修改各个模型的超参数,xx_model_config.py,把里面的输入向量和标签数改成自己所需要的 19 | #2. 读取数据,构造好训练集,验证集,测试集 20 | #3. build_model() 21 | #4. 调用fit() 22 | #5. 测试一下 23 | #6. 
保存模型 24 | def test_cnn(X_train,y_train,X_valid,y_valid,X_test,y_test): 25 | model = CNN_model() 26 | model.build_model() 27 | model.fit(X_train=X_train,y_train=y_train,X_valid=X_valid,y_valid=y_valid, 28 | batch_size=cnn_model_config.learning_params_template['batch_size'], 29 | epochs=cnn_model_config.learning_params_template['nb_epochs']) 30 | model.save_model() 31 | score = model.evaluate(X_test=X_test,y_test=y_test) 32 | print('simple CNN accuracy :{0}'.format(score)) 33 | 34 | def test_df(X_train,y_train,X_valid,y_valid,X_test,y_test): 35 | 36 | model =DF_model() 37 | model.build_model() 38 | 39 | model.fit(X_train=X_train,y_train=y_train,X_valid=X_valid,y_valid=y_valid, 40 | batch_size=df_model_config.learning_params_template['batch_size'], 41 | epochs=df_model_config.learning_params_template['epoch']) 42 | model.save_model() 43 | score = model.evaluate(X_test=X_test,y_test=y_test) 44 | print('Deep Fingerprinting accuracy :{0}'.format(score)) 45 | 46 | 47 | 48 | def test_lstm(X_train,y_train,X_valid,y_valid,X_test,y_test): 49 | model = LSTM_model() 50 | def test_sdae(X_train,y_train,X_valid,y_valid,X_test,y_test): 51 | model = SDAE_model() 52 | model.load_model() 53 | score = model.evaluate(X_test=X_test,y_test=y_test) 54 | print('sdae Test on test dataset accuracy :{0}'.format(score)) 55 | if __name__ == '__main__': 56 | X_train, y_train, X_valid, y_valid, X_test, y_test = LoadDataApp_crossversion()#LoadDataWalkieTalkieCW() #LoadDataNoDefCW() 57 | #test_df(X_train=X_train,y_train=y_train,X_valid=X_valid,y_valid=y_valid,X_test=X_test,y_test=y_test) 58 | #test_cnn(X_train=X_train,y_train=y_train,X_valid=X_valid,y_valid=y_valid,X_test=X_test,y_test=y_test) 59 | test_sdae(X_train,y_train,X_valid,y_valid,X_test,y_test) -------------------------------------------------------------------------------- /models/dl/cnn_eval.py: -------------------------------------------------------------------------------- 1 | __author__ = 'dk' 2 | import os 3 | #设置Tensorflow的日志等级 4 | #os.environ["TF_CPP_MIN_LOG_LEVEL"]='1' # 这是默认的显示等级,显示所有信息 5 | #os.environ["TF_CPP_MIN_LOG_LEVEL"]='2' # 只显示 warning 和 Error 6 | os.environ["TF_CPP_MIN_LOG_LEVEL"]='3' # 只显示 Error 7 | 8 | from attacks import CNN_model,DF_model,SDAE_model,LSTM_model 9 | from df import df_model_config 10 | 11 | from cnn import cnn_model_config 12 | from sdae import sdae_model_config 13 | from lstm import lstm_model_config 14 | 15 | from data_utils import LoadDataNoDefCW100,LoadDataNoDefCW,LoadDataWalkieTalkieCW,LoadDataApp,LoadDataApp,LoadDataApp_crossversion 16 | 17 | #使用步骤 18 | #1. 修改各个模型的超参数,xx_model_config.py,把里面的输入向量和标签数改成自己所需要的 19 | #2. 读取数据,构造好训练集,验证集,测试集 20 | #3. build_model() 21 | #4. 调用fit() 22 | #5. 测试一下 23 | #6. 
保存模型 24 | def test_cnn(X_train,y_train,X_valid,y_valid,X_test,y_test): 25 | model = CNN_model() 26 | model.load_model() 27 | score = model.evaluate(X_test=X_test,y_test=y_test) 28 | print('simple CNN Test on test dataset accuracy :{0}'.format(score)) 29 | 30 | def test_df(X_train,y_train,X_valid,y_valid,X_test,y_test): 31 | 32 | model =DF_model() 33 | model.build_model() 34 | 35 | model.fit(X_train=X_train,y_train=y_train,X_valid=X_valid,y_valid=y_valid, 36 | batch_size=df_model_config.learning_params_template['batch_size'], 37 | epochs=df_model_config.learning_params_template['epoch']) 38 | model.save_model() 39 | score = model.evaluate(X_test=X_test,y_test=y_test) 40 | print('Deep Fingerprinting accuracy :{0}'.format(score)) 41 | 42 | 43 | 44 | def test_lstm(X_train,y_train,X_valid,y_valid,X_test,y_test): 45 | model = LSTM_model() 46 | def test_sdae(X_train,y_train,X_valid,y_valid,X_test,y_test): 47 | model = SDAE_model() 48 | model.build_model() 49 | model.pre_train(x_train=X_train,x_test=X_test) 50 | 51 | model.fit(X_train=X_train,y_train=y_train,X_valid=X_valid,y_valid=y_valid, 52 | batch_size=sdae_model_config.learning_params_template['batch_size'], 53 | epochs=sdae_model_config.learning_params_template['nb_epochs']) 54 | model.save_model() 55 | score = model.evaluate(X_test=X_test,y_test=y_test) 56 | print('sdae accuracy :{0}'.format(score)) 57 | if __name__ == '__main__': 58 | X_train, y_train, X_valid, y_valid, X_test, y_test = LoadDataApp_crossversion() 59 | #test_df(X_train=X_train,y_train=y_train,X_valid=X_valid,y_valid=y_valid,X_test=X_test,y_test=y_test) 60 | test_cnn(X_train=X_train,y_train=y_train,X_valid=X_valid,y_valid=y_valid,X_test=X_test,y_test=y_test) 61 | #test_sdae(X_train,y_train,X_valid,y_valid,X_test,y_test) -------------------------------------------------------------------------------- /models/dl/graphDapp/test.py: -------------------------------------------------------------------------------- 1 | __author__ = 'dk' 2 | import numpy as np 3 | from models.dl.graphDapp import logger_wrappers 4 | import torch as th 5 | from torch import nn 6 | from torch import optim 7 | from torch.nn import functional as F 8 | 9 | from models.dl.graphDapp.model_seriealization import save,load 10 | from models.dl.graphDapp.data_builder import Dataset_fgnet 11 | from models.dl.graphDapp.DApp_Classifier import DApp_classifier 12 | from models.dl.graphDapp.graphDapp_config import config 13 | from sklearn.metrics import classification_report 14 | use_gpu = th.cuda.is_available() 15 | if use_gpu : 16 | device_id = config['device_id'] 17 | device= device_id 18 | else: 19 | device= "cpu" 20 | 21 | def main(dataset_name, modelpath,max_epoch=0): 22 | data_loader = Dataset_fgnet(raw_dir=r'',dumpfile=dataset_name,renew=False) 23 | print(data_loader) 24 | model = DApp_classifier(nb_classes=len(data_loader.labelname), 25 | gin_layer_num=config['gin_layer_num'], 26 | gin_hidden_units=config['gin_hidden_units'], 27 | iteration_nums=config['iteration_nums'], 28 | device= device,use_gpu= use_gpu) 29 | loss_func = nn.CrossEntropyLoss() 30 | optimizer = optim.Adam(params=model.parameters(),lr=5e-5) 31 | 32 | model = load(model,optimizer=optimizer,checkpoint_path=modelpath) 33 | if use_gpu: 34 | model = model.cuda(device) 35 | batch_size = config['batch_size'] 36 | 37 | model.eval() 38 | acc_list =[] 39 | ground_truth = [] 40 | predict_truth = [] 41 | 42 | for subset in range(len(data_loader.test_set)//batch_size): 43 | graphs,labels = data_loader.next_test_batch(batch_size=batch_size) 44 | if 
use_gpu : 45 | graphs = graphs.to(th.device(device)) 46 | labels = labels.to(th.device(device)) 47 | predict_labels = model(graphs) 48 | predict_labels = F.softmax(predict_labels,1) 49 | argmax_labels = th.argmax(predict_labels,1) 50 | ground_truth = ground_truth + labels.tolist() 51 | predict_truth = predict_truth + argmax_labels.tolist() 52 | acc = (labels == argmax_labels).float().sum().item() / len(labels) * 100 53 | acc_list.append(acc) 54 | info='Accuracy of argmax predictions on the test subset{1}: {0:4f}%'.format(acc,subset) 55 | info = 'Average Accuracy on entire test set:{:0.4f}%'.format(np.mean(acc_list)) 56 | logger_wrappers.info(info) 57 | print(classification_report(y_true=ground_truth,y_pred=predict_truth,digits=5)) 58 | -------------------------------------------------------------------------------- /models/dl/fsnet/README.md: -------------------------------------------------------------------------------- 1 | 2 | # FS-Net 3 | 4 | Implementation of "FS-Net: A Flow Sequence Network For Encrypted Traffic Classification". 5 | 6 | If you find this method helpful for your research, please cite this paper: 7 | 8 | ```latex 9 | @inproceedings{LiuHXCL19, 10 | author = {Chang Liu and 11 | Longtao He and 12 | Gang Xiong and 13 | Zigang Cao and 14 | Zhen Li}, 15 | title = {FS-Net: {A} Flow Sequence Network For Encrypted Traffic Classification}, 16 | booktitle = {{IEEE} Conference on Computer Communications (INFOCOM), 2019}, 17 | pages = {1171--1179}, 18 | year = {2019} 19 | } 20 | ``` 21 | 22 | ------ 23 | 24 | ### Requirement 25 | 26 | - python >= 3.4 27 | - numpy == 1.14.5 28 | - tqdm 29 | - tensorflow == 1.8.0 30 | 31 | ------ 32 | 33 | ### Dataset Format 34 | 35 | The dataset consists of multiple files, and each file contains all the flow records of a specific application. And the files are ended with `.num`. For example 36 | 37 | ``` 38 | origin_data 39 | |---- alicdn.num 40 | |---- baidu.num 41 | ``` 42 | 43 | For a specific application, each flow record is consists with two parts, for example 44 | 45 | ``` 46 | 50 3 7 5 5 5 ;2920 167 51 78 968 38 47 | ``` 48 | 49 | There are two sequences in a record: the first one is encoded status sequence and the second on is the packet length sequence. The two sequences are separated with `;`, and the elements in the sequences are separated with `\t`. 50 | 51 | ### How to use 52 | 53 | #### Step 1. Pre-Process The Dataset 54 | 55 | The dataset is first formalized into `.json` files, and the train set and development set are split as follows: 56 | 57 | ```bash 58 | python main.py --mode=prepro 59 | ``` 60 | 61 | The dataset will saved in the `record` folder, and the files are start with `train` and `test`. The setting can be changed with `--train_json`, `--test_json`, `--train_meta` and `--test_meta`. 62 | 63 | #### Step 2: Train The Model 64 | 65 | We can train our model by: 66 | 67 | ```bash 68 | python main.py --mode=train 69 | ``` 70 | 71 | **Note**: hyper-parameters (such as batch size, hidden size, layer number) of the model and the training process can be explored in the `main.py`. 72 | 73 | #### Step 3: Evaluation. 74 | 75 | Given the evaluation dataset, we can conduct the evaluation with: 76 | 77 | ```bash 78 | python main.py --mode=test --test_json=xxxxxx --test_model_dir=yyyyy 79 | ``` 80 | 81 | The model will loaded from the `${test_model_dir}`, and the `${test_json}` is the test data. The test data have the same format with the results of the Step 1. 
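For reference, a single record line can be parsed into its two sequences with a few lines of Python. This is only a sketch of the format described above — the helper name `parse_record` is illustrative and not part of this repository; `preprocess.py` performs the equivalent split on `;` and `\t` when it builds the JSON files:

```python
def parse_record(line):
    """Split one .num record into (status sequence, packet length sequence)."""
    status_part, length_part = line.strip().split(';')
    status_seq = [int(x) for x in status_part.split()]   # encoded status sequence
    length_seq = [int(x) for x in length_part.split()]   # packet length sequence
    return status_seq, length_seq

# Example record from above:
# parse_record('50\t3\t7\t5\t5\t5;2920\t167\t51\t78\t968\t38')
# -> ([50, 3, 7, 5, 5, 5], [2920, 167, 51, 78, 968, 38])
```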
82 | -------------------------------------------------------------------------------- /models/dl/fsnet/dataset.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | from tqdm import tqdm 3 | import numpy as np 4 | import json 5 | 6 | 7 | PAD_KEY = 0 8 | START_KEY = 1 9 | END_KEY = 2 10 | 11 | 12 | def read_file_generator(filename, max_len, keep_ratio=1): 13 | 14 | def gen(): 15 | with open(filename) as fp: 16 | data = json.load(fp) 17 | data_all = [] 18 | for exp in data: 19 | flow_length = len(exp['flow']) 20 | if flow_length <= max_len: 21 | flow = [START_KEY] + exp['flow'] + [END_KEY] + [PAD_KEY] * (max_len - flow_length) 22 | data_all.append((str.encode(exp['id']), exp['label'], flow)) 23 | numx = 0 24 | total_num = min(int(keep_ratio * len(data_all)), len(data_all)-1) 25 | data_all = data_all[:total_num] 26 | #print('total_num',total_num) 27 | while True: 28 | if numx == 0: 29 | np.random.shuffle(data_all) 30 | #print('numx',numx) 31 | yield data_all[numx] 32 | numx = (numx + 1) % total_num 33 | return gen 34 | 35 | 36 | def get_dataset_from_generator(file, config, max_len, keep_ratio=1): 37 | data_gen = read_file_generator(file, max_len, keep_ratio) 38 | dataset = tf.data.Dataset.from_generator( 39 | data_gen, 40 | (tf.string, tf.int32, tf.int32), 41 | (tf.TensorShape([]), tf.TensorShape([]), tf.TensorShape([max_len + 2])) 42 | ).shuffle(config.capacity).batch(config.batch_size).prefetch(4) 43 | return dataset 44 | 45 | 46 | def _get_summary(metric): 47 | summ = [] 48 | for met in metric: 49 | sx = tf.Summary(value=[tf.Summary.Value(tag=met, simple_value=metric[met])]) 50 | summ.append(sx) 51 | return summ 52 | 53 | 54 | def accuracy(model, val_num_batches, sess, handle, str_handle, name): 55 | pred_all, pred_right, losses, r_losses, c_losses = 0, 0, [], [], [] 56 | metric = {} 57 | for _ in tqdm(range(val_num_batches), desc='eval', ascii=True): 58 | loss,\ 59 | pred, label = sess.run( 60 | [model.loss, 61 | model.pred, model.label], 62 | feed_dict={handle: str_handle}) 63 | losses.append(loss) 64 | #r_losses.append(r_loss) 65 | #c_losses.append(c_loss) 66 | pred_all += len(pred) 67 | pred_right += np.sum(pred == label) 68 | loss = np.mean(losses) 69 | metric[name + '/loss/all'] = loss 70 | #metric[name + '/loss/clf'] = np.mean(c_losses) 71 | #metric[name + '/loss/rec'] = np.mean(r_losses) 72 | metric[name + '/accuracy'] = pred_right / pred_all 73 | summ = _get_summary(metric) 74 | 75 | return loss, summ, metric -------------------------------------------------------------------------------- /models/ml/appscanner/train.py: -------------------------------------------------------------------------------- 1 | __author__ = 'dk' 2 | from wf_attacks.data_utils import LoadDataApp_crossversion 3 | from wf_attacks.appscanner.feature_extractor import feature_extract 4 | import lightgbm as lgb 5 | from sklearn.metrics import accuracy_score 6 | import numpy as np 7 | from wf_attacks.appscanner.min_max import _max,_min 8 | ##原始的包长序列 9 | def main(train_set,modelpath): 10 | global _min,_max 11 | X_train_r, y_train_r, X_valid_r, y_valid_r, X_test_r, y_test_r = LoadDataApp_crossversion(train_set) 12 | saved_model = modelpath# "appscanner.model" 13 | print('before extract feature') 14 | ##提取统计特征 15 | X_train =[] 16 | X_valid =[] 17 | X_test =[] 18 | 19 | for i in range(X_train_r.shape[0]): 20 | X_train.append(feature_extract(X_train_r[i])) 21 | for i in range(X_test_r.shape[0]): 22 | X_test.append(feature_extract(X_test_r[i])) 23 | for i 
in range(X_valid_r.shape[0]): 24 | X_valid.append(feature_extract(X_valid_r[i])) 25 | print('feature extract well!') 26 | ##归一化操作 27 | X_train = np.array(X_train) 28 | X_valid = np.array(X_valid) 29 | X_test = np.array(X_test) 30 | _min = np.array(_min) 31 | _max =np.array(_max) 32 | print('_min:') 33 | print(_min) 34 | print('_max:') 35 | print(_max) 36 | 37 | X_train = (X_train-_min)/(_max-_min) 38 | X_valid = (X_valid-_min)/(_max-_min) 39 | X_test = (X_test-_min)/(_max-_min) 40 | print('normalize well!') 41 | print(X_train[0]) 42 | print(X_valid[1]) 43 | print(X_test[2]) 44 | ## 45 | y_test = np.argmax(y_test_r,1) 46 | y_train =np.argmax(y_train_r,1) 47 | y_valid =np.argmax(y_valid_r,1) 48 | print(y_test[0:10]) 49 | ##开始训练 50 | 51 | lgb_train = lgb.Dataset(data=X_train,label=y_train) 52 | lgb_eval = lgb.Dataset(data=X_valid,label=y_valid) 53 | 54 | hyper_params = { 55 | 'boosting_type': 'rf', 56 | 'objective': 'multiclass', 57 | 'num_leaves': 512, 58 | 'learning_rate': 0.05, 59 | 'feature_fraction': 0.9, 60 | 'bagging_fraction': 0.8, 61 | 'bagging_freq': 5, 62 | 'verbose': 0, 63 | 'num_class':55, 64 | 'lambda_l1':0.05, 65 | 'lambda_l2':0.15 66 | } 67 | 68 | gbm = lgb.train(params=hyper_params, 69 | train_set=lgb_train, 70 | valid_sets=lgb_eval, 71 | num_boost_round=50, 72 | early_stopping_rounds=5) 73 | #save model 74 | try: 75 | gbm.save_model(saved_model) 76 | except BaseException as exp: 77 | pass 78 | logit = gbm.predict(data=X_test) 79 | label_predict = list(map(lambda x : np.argmax(x),logit)) 80 | 81 | accuracy = accuracy_score(y_test,label_predict) 82 | print('[Appscanner test on {0} acc:{1}]'.format(train_set,accuracy)) 83 | -------------------------------------------------------------------------------- /models/dl/fsnet/eval.py: -------------------------------------------------------------------------------- 1 | from tqdm import tqdm 2 | import numpy as np 3 | import json 4 | from sklearn.metrics import classification_report 5 | ALL_ = -1 6 | TPR_KEY = 'TPR' 7 | FPR_KEY = 'FPR' 8 | FTF_KEY = 'FTF' 9 | 10 | 11 | def _fpr_trp_app(real, pred, app_ind): 12 | real_app = real == app_ind 13 | pred_app = pred == app_ind 14 | TP = 0 15 | TN = 0 16 | FP = 0 17 | FN = 0 18 | for r, p in zip(real_app, pred_app): 19 | if r and p: 20 | TP += 1 21 | elif r and not p: 22 | FN += 1 23 | elif not r and p: 24 | FP += 1 25 | else: 26 | TN += 1 27 | return TP, TN, FP, FN 28 | 29 | 30 | def _evaluate_fpr_and_tpr(real, pred): 31 | app_num = len(pred) 32 | real = np.concatenate(real) 33 | pred = np.concatenate(pred) 34 | TP = 0 35 | TN = 0 36 | FP = 0 37 | FN = 0 38 | TPR = {} 39 | FPR = {} 40 | for app_ind in tqdm(range(app_num), ascii=True, desc='Eval'): 41 | TP_app, TN_app, FP_app, FN_app = _fpr_trp_app(real, pred, app_ind) 42 | TP += TP_app 43 | TN += TN_app 44 | FP += FP_app 45 | FN += FN_app 46 | TPR[app_ind] = TP_app / (TP_app + FN_app) 47 | FPR[app_ind] = FP_app / (FP_app + TN_app) 48 | TPR[ALL_] = TP / (TP + FN) 49 | FPR[ALL_] = FP / (FP + TN) 50 | #print("Total Accuracy:",(TP+TN)/(TP+TN+FP+FN)) 51 | return TPR, FPR 52 | 53 | 54 | def _evaluate_ftf(TPR, FPR, class_num): 55 | res = 0 56 | sam_num = np.array(class_num, dtype=np.float) 57 | sam_num /= sam_num.sum() 58 | 59 | for key in TPR: 60 | if key == ALL_: 61 | continue 62 | res += sam_num[key] * TPR[key] / (1 + FPR[key]) 63 | return res 64 | 65 | 66 | def save_res(res, filename): 67 | with open(filename, 'w') as fp: 68 | json.dump(res, fp, indent=1, sort_keys=True) 69 | 70 | 71 | def evaluate(real, pred): 72 | 
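    # evaluate() expects `real` and `pred` as nested lists with one sub-list per class.
    # It flattens them, prints the overall accuracy and a sklearn classification_report,
    # and returns a dict with the per-class TPR and FPR (from _evaluate_fpr_and_tpr above,
    # with an overall entry under the key -1) plus the FTF score, i.e. the sample-weighted
    # average of TPR / (1 + FPR) across classes computed by _evaluate_ftf.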
print('real.shape:{0},len.shape{1}'.format(np.array(real).shape,np.array(pred).shape)) 73 | r=0 74 | t=0 75 | y_real =[] 76 | y_pred =[] 77 | for i in range(len(real)): 78 | for j in range(len(real[i])): 79 | y_real.append(real[i][j]) 80 | y_pred.append(pred[i][j]) 81 | if real[i][j]==pred[i][j]: 82 | r+=1 83 | t+=1 84 | 85 | example_len = [len(ix) for ix in real] 86 | TPR, FPR = _evaluate_fpr_and_tpr(real, pred) 87 | FTF = _evaluate_ftf(TPR, FPR, example_len) 88 | res = { 89 | TPR_KEY: TPR, 90 | FPR_KEY: FPR, 91 | FTF_KEY: FTF 92 | } 93 | print('Accuracy:',r*1.0/t) 94 | print(classification_report(y_true=y_real,y_pred=y_pred, digits=5)) 95 | 96 | return res 97 | -------------------------------------------------------------------------------- /models/ml/rdp/convert_to_csv.py: -------------------------------------------------------------------------------- 1 | #coding:utf-8 2 | #把生成的文件转换为csv格式的. 3 | #逐行生成,并不合并,等实际模型需要的时候再进行合并 4 | #格式 5 | #时间戳,类别id,特征1,特征2,··· 6 | ''' 7 | 类别id : 动作类别 8 | 0 : editing_doc 9 | 1 : reading_doc 10 | 2 : surfing_web 11 | 3 : installing_software 12 | 4 : transfering_file 13 | 5 : watching_video 14 | ''' 15 | class_to_id={ 16 | 'editing_doc':0, 17 | 'reading_doc':1, 18 | 'surfing_web':2, 19 | 'installing_software':3, 20 | 'transfering_file':4, 21 | 'watching_video':5 22 | } 23 | __author__ = 'jmh081701' 24 | import os 25 | import re 26 | import json 27 | import sys 28 | def get_files(appname,gap,directory=r"E:\TempWorkStation\i-know-what-are-you-doing\dataset\vector"): 29 | files=[] 30 | for _root,_subs,_files in os.walk(directory): 31 | for file in _files: 32 | if file.count(appname) and file.count("gap=%s"%gap): 33 | files.append(directory+"\\"+file) 34 | return files 35 | if __name__ == '__main__': 36 | appnames=['micrords','anydesk','realvnc','teamviewer'] 37 | gaps = [0.5,0.2,0.8] 38 | for appname in appnames: 39 | for gap in gaps: 40 | #if appname!='teamviewer' or gap!=0.2: 41 | # continue 42 | DIRECOTRY=r"E:\TempWorkStation\i-know-what-are-you-doing\dataset\vector_flowid" 43 | files = get_files(appname=appname,gap=gap,directory=DIRECOTRY) 44 | label_rule = "_(.*?)\." 
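            # label_rule captures the text between the first '_' and the following '.'
            # in the file name; the captured action name is looked up in class_to_id
            # above to obtain the numeric label written at the start of each CSV line below.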
45 | label_pattern = re.compile(label_rule) 46 | TARGET=DIRECOTRY+"\\"+"csv"+"\\"+appname+"_"+str(gap) +".txt" 47 | fp = open(TARGET,'w') 48 | for file in files: 49 | print(file) 50 | label=class_to_id[label_pattern.findall(file.split("\\")[-1] )[0] ] 51 | with open(file) as jfp: 52 | peaks_features=json.load(jfp) 53 | for i in range(peaks_features['counter']): 54 | timestamp = peaks_features['timestamps'][i] 55 | feature = peaks_features['feature'][i] 56 | flowid = peaks_features['flowids'][i] 57 | fp.writelines("%s,%d,%d,"%(timestamp,label,flowid)) 58 | #print(len(feature)) 59 | if len(feature)!=96: 60 | print(feature,i) 61 | exit() 62 | for j in range(len(feature)): 63 | fp.writelines(str(feature[j])) 64 | if j < (len(feature)-1): 65 | fp.writelines(",") 66 | fp.writelines("\n") 67 | if i %1000 ==0 : 68 | print('finished %d/%d'%(i,peaks_features['counter'])) 69 | fp.flush() 70 | fp.close() -------------------------------------------------------------------------------- /models/ml/cumul/feature_extractor.py: -------------------------------------------------------------------------------- 1 | __author__ = 'dk' 2 | import numpy as np 3 | import os 4 | import sys 5 | import copy 6 | class cumul_feature_extractor: 7 | def __init__(self, 8 | feature_length = 100, 9 | min = 0, max = 1 10 | ): 11 | self.feature_length = feature_length #cumul模型的输入向量的长度,默认是100 12 | self.equidistance = None #采样的间距 13 | #标准化的参数 14 | self.min = min 15 | self.max = max 16 | ##############训练模型使用的数据 17 | 18 | def feature_extract(self,trace_sequence, cell_size=None): 19 | """feature_extract() : 从[-1,1,1...]的cell的方向序列中,生成CUMUL模型所需的特征向量 20 | 21 | :param trace_sequence: `numpy.narray` ,形状:batch_size * trace_length 22 | 输入的[-1,1,1...]向量,-1表示ingoing的cell,+1表示outgoing的流 23 | :param cell_size: 每个cell的大小,默认None,因为最后还得归一化 24 | :return: 25 | """ 26 | 27 | if cell_size == None: 28 | cell_size = 1 29 | if not isinstance(type(trace_sequence),np.ndarray): 30 | trace_sequence = np.array(trace_sequence) 31 | shape = trace_sequence.shape 32 | culmulative_sum_a = np.zeros(shape=shape,dtype = np.float) 33 | culmulative_sum_c = np.zeros(shape=shape,dtype = np.float) 34 | xp = np.linspace(0,shape[1]-1,shape[1]) 35 | features = np.zeros(shape=(shape[0],2*self.feature_length),dtype = np.float) 36 | #计算累计和 37 | for i in range(0,shape[0]): 38 | for j in range(1,shape[1]): 39 | culmulative_sum_a[i,j] += culmulative_sum_a[i,j-1] + abs(trace_sequence[i,j]) 40 | culmulative_sum_c[i,j] += culmulative_sum_c[i,j-1] + trace_sequence[i,j] 41 | #加上cell_size 42 | culmulative_sum_a = cell_size * culmulative_sum_a 43 | culmulative_sum_c = cell_size * culmulative_sum_c 44 | 45 | #线性采样n个特征 46 | if self.equidistance != None: 47 | equidistance = self.equidistance 48 | else: 49 | equidistance = (shape[1]-1)/self.feature_length 50 | xval = np.arange(0,equidistance * self.feature_length,equidistance) 51 | for i in range(shape[0]): 52 | #print(i,culmulative_sum_a[i]) 53 | #print(i,culmulative_sum_c[i]) 54 | a_interp = (np.interp(xval,xp,culmulative_sum_a[i])-self.min)/(self.max-self.min) 55 | c_interp = (np.interp(xval,xp,culmulative_sum_c[i])-self.min)/(self.max-self.min) 56 | 57 | features[i,0:2*self.feature_length:2]=copy.deepcopy(a_interp)[:self.feature_length] 58 | features[i,1:2*self.feature_length:2]=copy.deepcopy(c_interp)[:self.feature_length] 59 | #print(i,features[i]) 60 | #print('#'*30) 61 | return features 62 | -------------------------------------------------------------------------------- /models/dl/examples.py: 
-------------------------------------------------------------------------------- 1 | 2 | __author__ = 'dk' 3 | import os 4 | #设置Tensorflow的日志等级 5 | #os.environ["TF_CPP_MIN_LOG_LEVEL"]='1' # 这是默认的显示等级,显示所有信息 6 | #os.environ["TF_CPP_MIN_LOG_LEVEL"]='2' # 只显示 warning 和 Error 7 | os.environ["TF_CPP_MIN_LOG_LEVEL"]='3' # 只显示 Error 8 | 9 | from attacks import CNN_model,DF_model,SDAE_model,LSTM_model 10 | from df import df_model_config 11 | 12 | from cnn import cnn_model_config 13 | from sdae import sdae_model_config 14 | from lstm import lstm_model_config 15 | 16 | from data_utils import LoadDataNoDefCW100,LoadDataNoDefCW 17 | 18 | #使用步骤 19 | #1. 修改各个模型的超参数,xx_model_config.py,把里面的输入向量和标签数改成自己所需要的 20 | #2. 读取数据,构造好训练集,验证集,测试集 21 | #3. build_model() 22 | #4. 调用fit() 23 | #5. 测试一下 24 | #6. 保存模型 25 | def test_cnn(X_train,y_train,X_valid,y_valid,X_test,y_test): 26 | model = CNN_model() 27 | model.build_model() 28 | model.fit(X_train=X_train,y_train=y_train,X_valid=X_valid,y_valid=y_valid, 29 | batch_size=cnn_model_config.learning_params_template['batch_size'], 30 | epochs=cnn_model_config.learning_params_template['nb_epochs']) 31 | model.save_model() 32 | score = model.evaluate(X_test=X_test,y_test=y_test) 33 | print('simple CNN accuracy :{0}'.format(score)) 34 | 35 | def test_df(X_train,y_train,X_valid,y_valid,X_test,y_test): 36 | 37 | model =DF_model() 38 | model.build_model() 39 | 40 | model.fit(X_train=X_train,y_train=y_train,X_valid=X_valid,y_valid=y_valid, 41 | batch_size=df_model_config.learning_params_template['batch_size'], 42 | epochs=df_model_config.learning_params_template['epoch']) 43 | model.save_model() 44 | score = model.evaluate(X_test=X_test,y_test=y_test) 45 | print('Deep Fingerprinting accuracy :{0}'.format(score)) 46 | 47 | 48 | 49 | def test_lstm(X_train,y_train,X_valid,y_valid,X_test,y_test): 50 | model = LSTM_model() 51 | def test_sdae(X_train,y_train,X_valid,y_valid,X_test,y_test): 52 | model = SDAE_model() 53 | model.build_model() 54 | model.pre_train(x_train=X_train,x_test=X_test) 55 | 56 | model.fit(X_train=X_train,y_train=y_train,X_valid=X_valid,y_valid=y_valid, 57 | batch_size=sdae_model_config.learning_params_template['batch_size'], 58 | epochs=sdae_model_config.learning_params_template['nb_epochs']) 59 | model.save_model() 60 | score = model.evaluate(X_test=X_test,y_test=y_test) 61 | print('sdae accuracy :{0}'.format(score)) 62 | if __name__ == '__main__': 63 | X_train, y_train, X_valid, y_valid, X_test, y_test = LoadDataNoDefCW100() 64 | #test_df(X_train=X_train,y_train=y_train,X_valid=X_valid,y_valid=y_valid,X_test=X_test,y_test=y_test) 65 | #test_cnn(X_train=X_train,y_train=y_train,X_valid=X_valid,y_valid=y_valid,X_test=X_test,y_test=y_test) 66 | 67 | test_sdae(X_train,y_train,X_valid,y_valid,X_test,y_test) -------------------------------------------------------------------------------- /models/dl/cnn_example.py: -------------------------------------------------------------------------------- 1 | __author__ = 'dk' 2 | import os 3 | #设置Tensorflow的日志等级 4 | #os.environ["TF_CPP_MIN_LOG_LEVEL"]='1' # 这是默认的显示等级,显示所有信息 5 | #os.environ["TF_CPP_MIN_LOG_LEVEL"]='2' # 只显示 warning 和 Error 6 | os.environ["TF_CPP_MIN_LOG_LEVEL"]='3' # 只显示 Error 7 | 8 | from attacks import CNN_model,DF_model,SDAE_model,LSTM_model 9 | from df import df_model_config 10 | 11 | from cnn import cnn_model_config 12 | from sdae import sdae_model_config 13 | from lstm import lstm_model_config 14 | 15 | from data_utils import LoadDataNoDefCW100,LoadDataNoDefCW,LoadDataWalkieTalkieCW,LoadDataApp 16 | 17 | 
#使用步骤 18 | #1. 修改各个模型的超参数,xx_model_config.py,把里面的输入向量和标签数改成自己所需要的 19 | #2. 读取数据,构造好训练集,验证集,测试集 20 | #3. build_model() 21 | #4. 调用fit() 22 | #5. 测试一下 23 | #6. 保存模型 24 | def test_cnn(X_train,y_train,X_valid,y_valid,X_test,y_test): 25 | model = CNN_model() 26 | model.build_model() 27 | model.fit(X_train=X_train,y_train=y_train,X_valid=X_valid,y_valid=y_valid, 28 | batch_size=cnn_model_config.learning_params_template['batch_size'], 29 | epochs=cnn_model_config.learning_params_template['nb_epochs']) 30 | model.save_model() 31 | score = model.evaluate(X_test=X_test,y_test=y_test) 32 | print('simple CNN accuracy :{0}'.format(score)) 33 | 34 | def test_df(X_train,y_train,X_valid,y_valid,X_test,y_test): 35 | 36 | model =DF_model() 37 | model.build_model() 38 | 39 | model.fit(X_train=X_train,y_train=y_train,X_valid=X_valid,y_valid=y_valid, 40 | batch_size=df_model_config.learning_params_template['batch_size'], 41 | epochs=df_model_config.learning_params_template['epoch']) 42 | model.save_model() 43 | score = model.evaluate(X_test=X_test,y_test=y_test) 44 | print('Deep Fingerprinting accuracy :{0}'.format(score)) 45 | 46 | 47 | 48 | def test_lstm(X_train,y_train,X_valid,y_valid,X_test,y_test): 49 | model = LSTM_model() 50 | def test_sdae(X_train,y_train,X_valid,y_valid,X_test,y_test): 51 | model = SDAE_model() 52 | model.build_model() 53 | model.pre_train(x_train=X_train,x_test=X_test) 54 | 55 | model.fit(X_train=X_train,y_train=y_train,X_valid=X_valid,y_valid=y_valid, 56 | batch_size=sdae_model_config.learning_params_template['batch_size'], 57 | epochs=sdae_model_config.learning_params_template['nb_epochs']) 58 | model.save_model() 59 | score = model.evaluate(X_test=X_test,y_test=y_test) 60 | print('sdae accuracy :{0}'.format(score)) 61 | if __name__ == '__main__': 62 | X_train, y_train, X_valid, y_valid, X_test, y_test = LoadDataApp()#LoadDataWalkieTalkieCW()#LoadDataNoDefCW() 63 | #test_df(X_train=X_train,y_train=y_train,X_valid=X_valid,y_valid=y_valid,X_test=X_test,y_test=y_test) 64 | test_cnn(X_train=X_train,y_train=y_train,X_valid=X_valid,y_valid=y_valid,X_test=X_test,y_test=y_test) 65 | 66 | #test_sdae(X_train,y_train,X_valid,y_valid,X_test,y_test) -------------------------------------------------------------------------------- /models/dl/df_example.py: -------------------------------------------------------------------------------- 1 | __author__ = 'dk' 2 | import os 3 | #设置Tensorflow的日志等级 4 | #os.environ["TF_CPP_MIN_LOG_LEVEL"]='1' # 这是默认的显示等级,显示所有信息 5 | #os.environ["TF_CPP_MIN_LOG_LEVEL"]='2' # 只显示 warning 和 Error 6 | os.environ["TF_CPP_MIN_LOG_LEVEL"]='3' # 只显示 Error 7 | 8 | from attacks import CNN_model,DF_model,SDAE_model,LSTM_model 9 | from df import df_model_config 10 | 11 | from cnn import cnn_model_config 12 | from sdae import sdae_model_config 13 | from lstm import lstm_model_config 14 | 15 | from data_utils import LoadDataNoDefCW100,LoadDataNoDefCW,LoadDataWalkieTalkieCW,LoadDataApp 16 | 17 | #使用步骤 18 | #1. 修改各个模型的超参数,xx_model_config.py,把里面的输入向量和标签数改成自己所需要的 19 | #2. 读取数据,构造好训练集,验证集,测试集 20 | #3. build_model() 21 | #4. 调用fit() 22 | #5. 测试一下 23 | #6. 
保存模型 24 | def test_cnn(X_train,y_train,X_valid,y_valid,X_test,y_test): 25 | model = CNN_model() 26 | model.build_model() 27 | model.fit(X_train=X_train,y_train=y_train,X_valid=X_valid,y_valid=y_valid, 28 | batch_size=cnn_model_config.learning_params_template['batch_size'], 29 | epochs=cnn_model_config.learning_params_template['nb_epochs']) 30 | model.save_model() 31 | score = model.evaluate(X_test=X_test,y_test=y_test) 32 | print('simple CNN accuracy :{0}'.format(score)) 33 | 34 | def test_df(X_train,y_train,X_valid,y_valid,X_test,y_test): 35 | 36 | model =DF_model() 37 | model.build_model() 38 | 39 | model.fit(X_train=X_train,y_train=y_train,X_valid=X_valid,y_valid=y_valid, 40 | batch_size=df_model_config.learning_params_template['batch_size'], 41 | epochs=df_model_config.learning_params_template['epoch']) 42 | model.save_model() 43 | score = model.evaluate(X_test=X_test,y_test=y_test) 44 | print('Deep Fingerprinting accuracy :{0}'.format(score)) 45 | 46 | 47 | 48 | def test_lstm(X_train,y_train,X_valid,y_valid,X_test,y_test): 49 | model = LSTM_model() 50 | def test_sdae(X_train,y_train,X_valid,y_valid,X_test,y_test): 51 | model = SDAE_model() 52 | model.build_model() 53 | model.pre_train(x_train=X_train,x_test=X_test) 54 | 55 | model.fit(X_train=X_train,y_train=y_train,X_valid=X_valid,y_valid=y_valid, 56 | batch_size=sdae_model_config.learning_params_template['batch_size'], 57 | epochs=sdae_model_config.learning_params_template['nb_epochs']) 58 | model.save_model() 59 | score = model.evaluate(X_test=X_test,y_test=y_test) 60 | print('sdae accuracy :{0}'.format(score)) 61 | if __name__ == '__main__': 62 | X_train, y_train, X_valid, y_valid, X_test, y_test = LoadDataApp()#LoadDataWalkieTalkieCW() #LoadDataNoDefCW() 63 | test_df(X_train=X_train,y_train=y_train,X_valid=X_valid,y_valid=y_valid,X_test=X_test,y_test=y_test) 64 | #test_cnn(X_train=X_train,y_train=y_train,X_valid=X_valid,y_valid=y_valid,X_test=X_test,y_test=y_test) 65 | 66 | #test_sdae(X_train,y_train,X_valid,y_valid,X_test,y_test) -------------------------------------------------------------------------------- /models/dl/sdae_example.py: -------------------------------------------------------------------------------- 1 | __author__ = 'dk' 2 | import os 3 | #设置Tensorflow的日志等级 4 | #os.environ["TF_CPP_MIN_LOG_LEVEL"]='1' # 这是默认的显示等级,显示所有信息 5 | #os.environ["TF_CPP_MIN_LOG_LEVEL"]='2' # 只显示 warning 和 Error 6 | os.environ["TF_CPP_MIN_LOG_LEVEL"]='3' # 只显示 Error 7 | 8 | from attacks import CNN_model,DF_model,SDAE_model,LSTM_model 9 | from df import df_model_config 10 | 11 | from cnn import cnn_model_config 12 | from sdae import sdae_model_config 13 | from lstm import lstm_model_config 14 | 15 | from data_utils import LoadDataNoDefCW100,LoadDataNoDefCW,LoadDataWalkieTalkieCW,LoadDataApp 16 | 17 | #使用步骤 18 | #1. 修改各个模型的超参数,xx_model_config.py,把里面的输入向量和标签数改成自己所需要的 19 | #2. 读取数据,构造好训练集,验证集,测试集 20 | #3. build_model() 21 | #4. 调用fit() 22 | #5. 测试一下 23 | #6. 
保存模型 24 | def test_cnn(X_train,y_train,X_valid,y_valid,X_test,y_test): 25 | model = CNN_model() 26 | model.build_model() 27 | model.fit(X_train=X_train,y_train=y_train,X_valid=X_valid,y_valid=y_valid, 28 | batch_size=cnn_model_config.learning_params_template['batch_size'], 29 | epochs=cnn_model_config.learning_params_template['nb_epochs']) 30 | model.save_model() 31 | score = model.evaluate(X_test=X_test,y_test=y_test) 32 | print('simple CNN accuracy :{0}'.format(score)) 33 | 34 | def test_df(X_train,y_train,X_valid,y_valid,X_test,y_test): 35 | 36 | model =DF_model() 37 | model.build_model() 38 | 39 | model.fit(X_train=X_train,y_train=y_train,X_valid=X_valid,y_valid=y_valid, 40 | batch_size=df_model_config.learning_params_template['batch_size'], 41 | epochs=df_model_config.learning_params_template['epoch']) 42 | model.save_model() 43 | score = model.evaluate(X_test=X_test,y_test=y_test) 44 | print('Deep Fingerprinting accuracy :{0}'.format(score)) 45 | 46 | 47 | 48 | def test_lstm(X_train,y_train,X_valid,y_valid,X_test,y_test): 49 | model = LSTM_model() 50 | def test_sdae(X_train,y_train,X_valid,y_valid,X_test,y_test): 51 | model = SDAE_model() 52 | model.build_model() 53 | model.pre_train(x_train=X_train,x_test=X_test) 54 | 55 | model.fit(X_train=X_train,y_train=y_train,X_valid=X_valid,y_valid=y_valid, 56 | batch_size=sdae_model_config.learning_params_template['batch_size'], 57 | epochs=sdae_model_config.learning_params_template['nb_epochs']) 58 | model.save_model() 59 | score = model.evaluate(X_test=X_test,y_test=y_test) 60 | print('sdae accuracy :{0}'.format(score)) 61 | if __name__ == '__main__': 62 | X_train, y_train, X_valid, y_valid, X_test, y_test = LoadDataApp()#LoadDataWalkieTalkieCW() #LoadDataNoDefCW() 63 | #test_df(X_train=X_train,y_train=y_train,X_valid=X_valid,y_valid=y_valid,X_test=X_test,y_test=y_test) 64 | #test_cnn(X_train=X_train,y_train=y_train,X_valid=X_valid,y_valid=y_valid,X_test=X_test,y_test=y_test) 65 | 66 | test_sdae(X_train,y_train,X_valid,y_valid,X_test,y_test) -------------------------------------------------------------------------------- /models/dl/mimetic/build_model.py: -------------------------------------------------------------------------------- 1 | __author__ = 'dk' 2 | import torch.nn as nn 3 | import torch as th 4 | 5 | class CNN_block(nn.Module): 6 | def __init__(self, kernel_size=25, filter_num=256): 7 | super(CNN_block, self).__init__() 8 | self.kernel_size = kernel_size 9 | self.filter_num = filter_num 10 | 11 | self._1conv1d = nn.Conv1d(stride=1, 12 | kernel_size=kernel_size, 13 | in_channels=1, 14 | out_channels=16, 15 | padding=kernel_size//2) 16 | self._2maxpooling = nn.MaxPool1d(kernel_size=3, stride= 1) 17 | 18 | self._3conv1d = nn.Conv1d(stride=1, 19 | kernel_size=kernel_size, 20 | in_channels=16, 21 | out_channels=32, 22 | padding=kernel_size//2) 23 | self._4maxpooling = nn.MaxPool1d(kernel_size=kernel_size, stride= 4) 24 | self._6flattern = nn.Flatten() 25 | self._5fc = nn.Linear(in_features=144, out_features=128) 26 | 27 | def forward(self, x): 28 | x = x.unsqueeze(1) 29 | 30 | x = self._1conv1d(x) 31 | x = self._2maxpooling(x) 32 | x = self._3conv1d(x) 33 | x = self._4maxpooling(x) 34 | x = self._5fc(x) 35 | x = self._6flattern(x) 36 | 37 | #print('cnn x shape',x.shape) 38 | return x 39 | 40 | class MIMETICModel(nn.Module): 41 | def __init__(self, payload_sz, packet_nb, class_nb, gru_layer_nb=2): 42 | super(MIMETICModel, self).__init__() 43 | self.payload_sz = payload_sz 44 | self.packet_nb = packet_nb 45 | 46 | self.gru_encoder = 
nn.GRU( input_size = 3, ##包长序列、到达时间序列、方向序列、 window-size 47 | hidden_size= 64, 48 | num_layers=gru_layer_nb, 49 | bidirectional=True, batch_first=True) 50 | 51 | self.cnn_encoder = CNN_block() 52 | 53 | self.fc = nn.Linear(in_features=12288, out_features= class_nb) 54 | self.dropout = nn.Dropout(p=0.1) 55 | 56 | def forward(self, field, payload): 57 | batch_size= field.shape[0] 58 | 59 | #print('packet embed shape', packet_embed.shape) 60 | packet_vector, hidden = self.gru_encoder(field) 61 | #print('packet vector shape',packet_vector.shape) 62 | packet_vector = packet_vector.reshape(batch_size, -1) 63 | 64 | payload_vector = self.cnn_encoder(payload) 65 | 66 | representation = th.cat((packet_vector, payload_vector), dim=1) 67 | representation = self.dropout(self.fc(representation)) 68 | return representation -------------------------------------------------------------------------------- /models/dl/fsnet/preprocess.py: -------------------------------------------------------------------------------- 1 | import tqdm 2 | import numpy as np 3 | import os 4 | import sys 5 | import json 6 | 7 | 8 | def eprint(*args, **kwargs): 9 | print(*args, file=sys.stderr, **kwargs) 10 | 11 | 12 | def load_origin_data(data_dir, app_num): 13 | datas = [[] for _ in range(app_num)] 14 | filenames = [filename for filename in os.listdir(data_dir) \ 15 | if os.path.isfile(os.path.join(data_dir, filename)) and filename.split(".")[-1] == "num"] 16 | filenames.sort() 17 | print(filenames,app_num) 18 | for app in tqdm.tqdm(range(app_num), ascii=True, desc='[Load Data]'): 19 | with open(os.path.join(data_dir, filenames[app])) as fp: 20 | for line in fp: 21 | 22 | _length = line.strip().split(';')[1].strip().split('\t') 23 | length =[] 24 | for ix in _length: 25 | if int(ix) !=0 : 26 | length.append(abs(int(ix))) 27 | else: 28 | break 29 | datas[app].append({'label': app, 'flow': length, 'lo': length.copy()}) 30 | return datas 31 | 32 | 33 | def _transform(datas, block, limit, max_packet): 34 | data_trans = [[] for _ in range(len(datas))] 35 | for app in tqdm.tqdm(range(len(datas)), ascii=True, desc='[Transform]'): 36 | app_data = datas[app] 37 | for idx, example in enumerate(app_data): 38 | flow = example['flow'] 39 | if len(flow) < limit: 40 | #print(flow) 41 | continue 42 | flow = [ix if ix <= max_packet else max_packet for ix in flow] 43 | flow = [ix // block + 3 for ix in flow] 44 | data_trans[app].append( 45 | {'label': example['label'], 'flow': flow, 'lo': example['lo'], 'id': str(app) + '-' + str(idx)} 46 | ) 47 | return data_trans 48 | 49 | 50 | def split_train_and_dev(datas, ratio=0.8, keep_ratio=1): 51 | train, dev = [], [] 52 | for app_data in tqdm.tqdm(datas, ascii=True, desc='[Split]'): 53 | is_keep = np.random.rand(len(app_data)) <= keep_ratio 54 | is_train = np.random.rand(len(app_data)) <= ratio 55 | for example, kp, tr in zip(app_data, is_keep, is_train): 56 | if kp and tr: 57 | train.append(example) 58 | elif kp and not tr: 59 | dev.append(example) 60 | np.random.shuffle(train) 61 | np.random.shuffle(dev) 62 | return train, dev 63 | 64 | 65 | def preprocess(config): 66 | eprint('Generate train and test.') 67 | origin = load_origin_data(config.data_dir, config.class_num) 68 | length = _transform(origin, config.length_block, config.min_length, config.max_packet_length) 69 | train, test = split_train_and_dev(length, config.split_ratio, config.keep_ratio) 70 | with open(config.train_json, 'w') as fp: 71 | json.dump(train, fp, indent=1) 72 | with open(config.test_json, 'w') as fp: 73 | json.dump(test, fp, 
indent=1) 74 | with open(config.train_meta, 'w') as fp: 75 | fp.write(str(len(train))) 76 | with open(config.test_meta, 'w') as fp: 77 | fp.write(str(len(test))) 78 | -------------------------------------------------------------------------------- /models/model_base.py: -------------------------------------------------------------------------------- 1 | 2 | import random 3 | import os 4 | import pickle 5 | import numpy as np 6 | import gzip 7 | from sklearn.metrics import multilabel_confusion_matrix, roc_auc_score, auc 8 | class abs_model: 9 | def __init__(self, name, randseed): 10 | self.database = './data/' 11 | self.name = name 12 | self.rand = random.Random(x = randseed) 13 | self.data = None 14 | self.model = None 15 | self.full_rdata = [] 16 | 17 | def data_exists(self): 18 | return os.path.exists(self.data) 19 | def model_exist(self): 20 | return os.path.exists(self.model) 21 | 22 | def train(self): 23 | pass 24 | 25 | def test(self): 26 | pass 27 | 28 | def parser_raw_data(self): 29 | ##从原始通用数据集获取自己所需格式数据集能力 30 | pass 31 | 32 | def save_model(self): 33 | pass 34 | 35 | def load_model(self): 36 | pass 37 | def save_data(self,X_train, y_train, X_valid, y_valid, X_test, y_test): 38 | fp = gzip.GzipFile(self.data + 'data.gzip','wb') 39 | pickle.dump({ 40 | 'X_train':X_train, 41 | 'y_train':y_train, 42 | 'X_valid':X_valid, 43 | 'y_valid':y_valid, 44 | 'X_test':X_test, 45 | 'y_test':y_test 46 | },file=fp) 47 | fp.close() 48 | def load_data(self): 49 | fp = gzip.GzipFile(self.data + 'data.gzip','rb') 50 | data = pickle.load(fp) 51 | fp.close() 52 | X_train = data['X_train'] 53 | y_train = data['y_train'] 54 | X_valid = data['X_valid'] 55 | y_valid = data['y_valid'] 56 | X_test = data['X_test'] 57 | y_test = data['y_test'] 58 | import random 59 | indexs = [x for x in range(len(y_test))] 60 | random.shuffle(indexs) 61 | return np.array(X_train), np.array(y_train), np.array(X_valid), np.array(y_valid), np.array(X_test)[indexs], np.array(y_test)[indexs] 62 | def num_classes(self): 63 | for _root, _dir, _files in os.walk(self.full_rdata): 64 | classes = _files 65 | return len(classes) 66 | def fpr_tpr_auc(self, y_pred, y_real,y_pred_logit=None): 67 | labels =set() 68 | for each in y_real: 69 | labels.add(each) 70 | labels =list(labels) 71 | mcm = multilabel_confusion_matrix(y_true=y_real,y_pred=y_pred,labels=labels) 72 | #print(mcm) 73 | fp ={} 74 | tp ={} 75 | fn ={} 76 | tn ={} 77 | for i in range(len(labels)): 78 | fp.setdefault(labels[i],mcm[i,0,1]) 79 | tp.setdefault(labels[i],mcm[i,1,1]) 80 | fn.setdefault(labels[i],mcm[i,1,0]) 81 | tn.setdefault(labels[i],mcm[i,0,0]) 82 | acc={} 83 | fpr={} 84 | tpr={} 85 | for each in fp: 86 | acc.setdefault(each,(tp[each]+tn[each])/(fp[each]+tn[each]+fn[each]+tp[each])) 87 | fpr.setdefault(each,fp[each]/(fp[each]+tn[each])) 88 | tpr.setdefault(each,tp[each]/(tp[each]+fn[each])) 89 | 90 | print('tpr:',tpr) 91 | 92 | print('fpr:',fpr) 93 | #auc = roc_auc_score(y_true=y_real, y_score=y_pred_logit[:,1]) 94 | #print('auc (prob):', auc) 95 | 96 | auc = roc_auc_score(y_true=y_real, y_score=y_pred) 97 | print('auc (label):', auc) -------------------------------------------------------------------------------- /models/dl/cnn/cnn_model.py: -------------------------------------------------------------------------------- 1 | import warnings 2 | from keras.models import Sequential 3 | from keras.layers.core import Dense, Dropout 4 | from keras.layers import Conv1D, MaxPooling1D, Flatten 5 | from keras.callbacks import EarlyStopping, ModelCheckpoint 6 | from 
keras.utils import np_utils 7 | from keras.optimizers import RMSprop, Adam, SGD 8 | from keras import regularizers 9 | from keras.layers.recurrent import LSTM 10 | from .cnn_model_config import learning_params_template,nb_classes_template 11 | from keras import optimizers 12 | 13 | def build_model(learn_params=learning_params_template, nb_classes=nb_classes_template): 14 | input_length = learn_params["maxlen"] 15 | input_dim = learn_params["nb_features"] 16 | layers = learn_params["layers"] 17 | 18 | model = Sequential() 19 | 20 | maxlen = input_length 21 | max_features = input_dim 22 | 23 | if len(layers) == 0: 24 | raise("No layers") 25 | 26 | first_l = layers[0] 27 | rest_l = layers[1:] 28 | 29 | # First layer 30 | if first_l["name"] == 'dropout': 31 | model.add(Dropout(input_shape=(maxlen, max_features), rate=first_l['rate'])) 32 | elif first_l["name"] == 'conv': 33 | model.add(Conv1D(filters=first_l['filters'], 34 | kernel_size=first_l['kernel_size'], 35 | padding='valid', 36 | activation=first_l['activation'], 37 | strides=first_l['stride'])) 38 | 39 | # Middle layers (conv, dropout, pooling, dense, lstm.....) 40 | for l in rest_l: 41 | if l["name"] == 'maxpooling': 42 | model.add(MaxPooling1D(pool_size=l['pool_size'], padding='valid')) 43 | elif l["name"] == 'conv': 44 | model.add(Conv1D(filters=l['filters'], 45 | kernel_size=l['kernel_size'], 46 | padding='valid', 47 | activation=l['activation'], 48 | strides=l['stride'])) 49 | elif l["name"] == 'dropout': 50 | model.add(Dropout(rate=l['rate'])) 51 | elif l["name"] == 'lstm': 52 | model.add(LSTM(l['units'])) 53 | elif l["name"] == 'flatten': 54 | model.add(Flatten()) 55 | elif l["name"] == 'dense': 56 | if l['regularization'] > 0.0: 57 | model.add(Dense(units=l['units'], activation=l['activation'], 58 | kernel_regularizer=regularizers.l2(last_l['regularization']), 59 | activity_regularizer=regularizers.l1(last_l['regularization']))) 60 | else: 61 | model.add(Dense(units=l['units'], activation=l['activation'])) 62 | 63 | 64 | learn_params = learning_params_template 65 | if learn_params['optimizer'] == "sgd": 66 | optimizer = SGD(lr=learn_params['lr'], 67 | decay=learn_params['decay'], 68 | momentum=learn_params['momentum'], 69 | nesterov=True) 70 | elif learn_params['optimizer'] == "adam": 71 | optimizer = Adam(lr=learn_params['lr'], 72 | decay=learn_params['decay']) 73 | else: # elif learn_params['optimizer'] == "rmsprop": 74 | optimizer = RMSprop(lr=learn_params['lr'], 75 | decay=learn_params['decay']) 76 | metrics=['accuracy'] 77 | model.compile(loss="categorical_crossentropy", optimizer=optimizer, metrics=metrics) 78 | return model 79 | 80 | -------------------------------------------------------------------------------- /models/dl/lstm/lstm_model.py: -------------------------------------------------------------------------------- 1 | import warnings 2 | from keras.layers.core import Dense, Dropout 3 | from keras.layers.recurrent import LSTM 4 | from keras.layers import Input 5 | from keras.models import Sequential 6 | try: 7 | import hyperas 8 | except ImportError as exp: 9 | print("Error:{0},\n\t please execute: {1}".format(exp,"pip install hyperas -i https://mirrors.aliyun.com/pypi/simple/")) 10 | raise exp 11 | from keras.optimizers import SGD, Adam, RMSprop 12 | from .lstm_model_config import learn_params_template,nb_classes_template 13 | 14 | def build_model(learn_params=learn_params_template, nb_classes=nb_classes_template): 15 | input_length = learn_params["maxlen"] 16 | input_dim = learn_params["nb_features"] 17 | 
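    # The LSTM layers below are built with input_shape=(input_length, input_dim),
    # i.e. each sample is a maxlen-step sequence of nb_features-dimensional vectors;
    # the *_example.py scripts accordingly truncate the packet-length matrix to maxlen
    # and reshape it to (samples, maxlen, 1) before calling fit().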
layers = learn_params["layers"] 18 | 19 | model = Sequential() 20 | # input_shape = (input_length, input_dim) 21 | # input_length = maxlen 22 | # input_dim = nb_features 23 | 24 | if len(layers) == 0: 25 | raise ("No layers") 26 | 27 | if len(layers) == 1: 28 | layer = layers[0] 29 | model.add(LSTM(input_shape=(input_length, input_dim), 30 | #batch_input_shape=(batch_size, input_length, input_dim), 31 | units=layer['units'], 32 | activation=layer['activation'], 33 | recurrent_activation=layer['rec_activation'], 34 | return_sequences=False, 35 | #stateful=True, 36 | dropout=layer['dropout'])) 37 | model.add(Dense(units=nb_classes, activation='softmax')) 38 | return model 39 | 40 | first_l = layers[0] 41 | last_l = layers[-1] 42 | middle_ls = layers[1:-1] 43 | # 44 | model.add(LSTM(input_shape=(input_length, input_dim), 45 | #batch_input_shape=(batch_size, input_length, input_dim), 46 | units=first_l['units'], 47 | activation=first_l['activation'], 48 | recurrent_activation=first_l['rec_activation'], 49 | return_sequences=True, 50 | #stateful=True, 51 | dropout=first_l['dropout'])) 52 | for l in middle_ls: 53 | model.add(LSTM(units=l['units'], 54 | activation=l['activation'], 55 | recurrent_activation=l['rec_activation'], 56 | return_sequences=True, 57 | #stateful=True, 58 | dropout=l['dropout'])) 59 | 60 | model.add(LSTM(units=last_l['units'], 61 | activation=last_l['activation'], 62 | recurrent_activation=last_l['rec_activation'], 63 | return_sequences=False, 64 | #stateful=True, 65 | dropout=last_l['dropout'])) 66 | 67 | model.add(Dense(units=nb_classes, activation='softmax')) 68 | 69 | if learn_params['optimizer'] == "sgd": 70 | optimizer = SGD(lr=learn_params['lr'], 71 | decay=learn_params['decay'], 72 | momentum=0.9, 73 | nesterov=True) 74 | elif learn_params['optimizer'] == "adam": 75 | optimizer = Adam(lr=learn_params['lr'], 76 | decay=learn_params['decay']) 77 | else: # elif learn_params['optimizer'] == "rmsprop": 78 | optimizer = RMSprop(lr=learn_params['lr'], 79 | decay=learn_params['decay']) 80 | metrics=['accuracy'] 81 | model.compile(loss="categorical_crossentropy", optimizer=optimizer, metrics=metrics) 82 | 83 | return model 84 | 85 | 86 | -------------------------------------------------------------------------------- /models/dl/README.md: -------------------------------------------------------------------------------- 1 | # website_fingerprinting 2 | 3 | 目前本项目支持如下模型: 4 | 5 | 6 | - Deep Fingerprinting 7 | 8 | - SDAE 9 | 10 | - LSTM 11 | 12 | - CNN 13 | 14 | 15 | 剩余两个是统计机器学习模型:【 目前这两个模型没有适配好,但是里面的特征提取是有效的】 16 | 17 | - CUMUL 18 | 19 | - AppScanner 20 | 21 | 22 | # 使用方法 23 | 24 | ## 数据准备 25 | 26 | 首先,需要准备好数据格式: 27 | ![在这里插入图片描述](https://img-blog.csdnimg.cn/20201013165821472.png?x-oss-process=image/watermark,type_ZmFuZ3poZW5naGVpdGk,shadow_10,text_aHR0cHM6Ly9ibG9nLmNzZG4ubmV0L2ptaDE5OTY=,size_16,color_FFFFFF,t_70#pic_center) 28 | 需要将网络流量整理为如上的6个文件,并放在同一个目录,文件名如上。 29 | 30 | ```python 31 | X_train_pkt_length.pkl : 包长序列,训练集。 32 | X_valid_pkt_length.pkl : 包长序列,验证集。 33 | X_test_pkt_length.pkl : 包长序列,测试集。 34 | y_train_pkt_length.pkl : 流量标签,训练集。 35 | y_valid_pkt_length.pkl : 流量标签,验证集。 36 | y_test_pkt_length.pkl : 流量标签,测试集。 37 | ``` 38 | 其中,`X_*_pkt_length.pkl` 是一个使用pickle.save()保存的numpy 矩阵,它的形状为 $m\times l$ 。其中m是样本个数,l是包长序列的长度,**同一数据集所有样本的包长序列需要填充到相同的长度** 。 39 | `y_*_pkt_length.pkl` 也是一个pickle.save()保存的numpy矩阵,它的形状为 $m\times1$,m表示样本个数,第i个元素都是整数,表示对应的训练集、验证集、测试集第i个样本的标签。 40 | 数据集的保存需要使用类似如下的步骤: 41 | 42 | ```python 43 | with gzip.GzipFile(path_dir+"/"+"X_train_"+feature_name+".pkl","wb") 
as fp: 44 | pickle.dump(X_train,fp,-1) 45 | with gzip.GzipFile(path_dir+"/"+"X_valid_"+feature_name+".pkl","wb") as fp: 46 | pickle.dump(X_valid,fp,-1) 47 | with gzip.GzipFile(path_dir+"/"+"X_test_"+feature_name+".pkl","wb") as fp: 48 | pickle.dump(X_test,fp,-1) 49 | 50 | with gzip.GzipFile(path_dir+"/"+"y_train_"+feature_name+".pkl","wb") as fp: 51 | pickle.dump(y_train,fp,-1) 52 | with gzip.GzipFile(path_dir+"/"+"y_valid_"+feature_name+".pkl","wb") as fp: 53 | pickle.dump(y_valid,fp,-1) 54 | with gzip.GzipFile(path_dir+"/"+"y_test_"+feature_name+".pkl","wb") as fp: 55 | pickle.dump(y_test,fp,-1) 56 | ``` 57 | 58 | **训练集的包长序列的样本数目需要等于训练集的流量标签序列的样本数。 59 | 验证集的包长序列的样本数目需要等于验证集的流量标签序列的样本数。 60 | 测试集的包长序列的样本数目需要等于测试集的流量标签序列的样本数。** 61 | 62 | 项目提供了一个示例数据集 app_dataset,它是一个55分类的数据集,每条样本的包长序列长度为1000,不足的填充0,超过1000的就截断。 63 | 64 | 65 | --- 66 | ## 修改数据目录 67 | 在按照上述步骤准备好数据后,需要修改数据目录。 68 | 修改`website_fingerprinting/data_utils.py` 文件里面的`NB_CLASSES` 变量 和 默认数目集目录`dataset_dir` 变量。 69 | 其中`NB_CLASSES` 变量是数据集不同标签的数目。 70 | `dataset_dir` 是默认数据集的目录 71 | 72 | ![在这里插入图片描述](https://img-blog.csdnimg.cn/20201013171642726.png?x-oss-process=image/watermark,type_ZmFuZ3poZW5naGVpdGk,shadow_10,text_aHR0cHM6Ly9ibG9nLmNzZG4ubmV0L2ptaDE5OTY=,size_16,color_FFFFFF,t_70#pic_center) 73 | 74 | --- 75 | ## 配置模型 76 | 在运行模型之前,需要先修改他们的配置文件。 77 | 目前,各个模型的配置文件以模型名命名的目录下: 78 | ![在这里插入图片描述](https://img-blog.csdnimg.cn/20201013171123714.png#pic_center) 79 | 例如,对于Deep fingerprinting模型,它的配置文件为df目录下的df_model_config.py。 80 | ![在这里插入图片描述](https://img-blog.csdnimg.cn/20201013171223270.png?x-oss-process=image/watermark,type_ZmFuZ3poZW5naGVpdGk,shadow_10,text_aHR0cHM6Ly9ibG9nLmNzZG4ubmV0L2ptaDE5OTY=,size_16,color_FFFFFF,t_70#pic_center) 81 | 修改模型文件:**修改里面的类别数目和包长序列的长度参数。** 对于里面需要修改的参数,各模式文件都做了标注。 82 | ![在这里插入图片描述](https://img-blog.csdnimg.cn/20201013171257550.png?x-oss-process=image/watermark,type_ZmFuZ3poZW5naGVpdGk,shadow_10,text_aHR0cHM6Ly9ibG9nLmNzZG4ubmV0L2ptaDE5OTY=,size_16,color_FFFFFF,t_70#pic_center) 83 | ## 运行模型 84 | ![在这里插入图片描述](https://img-blog.csdnimg.cn/20201013172217604.png?x-oss-process=image/watermark,type_ZmFuZ3poZW5naGVpdGk,shadow_10,text_aHR0cHM6Ly9ibG9nLmNzZG4ubmV0L2ptaDE5OTY=,size_16,color_FFFFFF,t_70#pic_center) 85 | 运行 `X_example.py ` 进行模型的训练,其中X可以是df,cnn,lstm,sdae。 86 | 运行`X_eval.py` 进行模型的测试,其中X可以是df,cnn,lstm,sdae. 87 | 88 | 例如: 89 | 在自带的app_dataset数据集运行的 `df_example.py` 的结果为: 90 | ![在这里插入图片描述](https://img-blog.csdnimg.cn/20201013173850869.png?x-oss-process=image/watermark,type_ZmFuZ3poZW5naGVpdGk,shadow_10,text_aHR0cHM6Ly9ibG9nLmNzZG4ubmV0L2ptaDE5OTY=,size_16,color_FFFFFF,t_70#pic_center) 91 | 92 | -------------------------------------------------------------------------------- /models/dl/lstm_eval.py: -------------------------------------------------------------------------------- 1 | __author__ = 'dk' 2 | import os 3 | #设置Tensorflow的日志等级 4 | #os.environ["TF_CPP_MIN_LOG_LEVEL"]='1' # 这是默认的显示等级,显示所有信息 5 | #os.environ["TF_CPP_MIN_LOG_LEVEL"]='2' # 只显示 warning 和 Error 6 | os.environ["TF_CPP_MIN_LOG_LEVEL"]='3' # 只显示 Error 7 | 8 | from attacks import CNN_model,DF_model,SDAE_model,LSTM_model 9 | from df import df_model_config 10 | 11 | from cnn import cnn_model_config 12 | from sdae import sdae_model_config 13 | from lstm import lstm_model_config 14 | 15 | from data_utils import LoadDataNoDefCW100,LoadDataNoDefCW,LoadDataWalkieTalkieCW,LoadDataApp,LoadDataApp_crossversion 16 | 17 | #使用步骤 18 | #1. 修改各个模型的超参数,xx_model_config.py,把里面的输入向量和标签数改成自己所需要的 19 | #2. 读取数据,构造好训练集,验证集,测试集 20 | #3. build_model() 21 | #4. 调用fit() 22 | #5. 
测试一下 23 | #6. 保存模型 24 | def test_cnn(X_train,y_train,X_valid,y_valid,X_test,y_test): 25 | model = CNN_model() 26 | model.build_model() 27 | model.fit(X_train=X_train,y_train=y_train,X_valid=X_valid,y_valid=y_valid, 28 | batch_size=cnn_model_config.learning_params_template['batch_size'], 29 | epochs=cnn_model_config.learning_params_template['nb_epochs']) 30 | model.save_model() 31 | score = model.evaluate(X_test=X_test,y_test=y_test) 32 | print('simple CNN accuracy :{0}'.format(score)) 33 | 34 | def test_df(X_train,y_train,X_valid,y_valid,X_test,y_test): 35 | 36 | model =DF_model() 37 | model.build_model() 38 | 39 | model.fit(X_train=X_train,y_train=y_train,X_valid=X_valid,y_valid=y_valid, 40 | batch_size=df_model_config.learning_params_template['batch_size'], 41 | epochs=df_model_config.learning_params_template['epoch']) 42 | model.save_model() 43 | score = model.evaluate(X_test=X_test,y_test=y_test) 44 | print('Deep Fingerprinting accuracy :{0}'.format(score)) 45 | 46 | 47 | 48 | def test_lstm(X_train,y_train,X_valid,y_valid,X_test,y_test): 49 | 50 | model = LSTM_model() 51 | model.load_model() 52 | score = model.evaluate(X_test=X_test,y_test=y_test) 53 | print('lstm Test on test dataset accuracy :{0}'.format(score)) 54 | 55 | def test_sdae(X_train,y_train,X_valid,y_valid,X_test,y_test): 56 | model = SDAE_model() 57 | model.build_model() 58 | model.pre_train(x_train=X_train,x_test=X_test) 59 | 60 | model.fit(X_train=X_train,y_train=y_train,X_valid=X_valid,y_valid=y_valid, 61 | batch_size=sdae_model_config.learning_params_template['batch_size'], 62 | epochs=sdae_model_config.learning_params_template['nb_epochs']) 63 | model.save_model() 64 | score = model.evaluate(X_test=X_test,y_test=y_test) 65 | print('sdae accuracy :{0}'.format(score)) 66 | if __name__ == '__main__': 67 | X_train, y_train, X_valid, y_valid, X_test, y_test = LoadDataApp_crossversion()##LoadDataWalkieTalkieCW() # LoadDataNoDefCW() 68 | #test_df(X_train=X_train,y_train=y_train,X_valid=X_valid,y_valid=y_valid,X_test=X_test,y_test=y_test) 69 | #test_cnn(X_train=X_train,y_train=y_train,X_valid=X_valid,y_valid=y_valid,X_test=X_test,y_test=y_test) 70 | #test_sdae(X_train,y_train,X_valid,y_valid,X_test,y_test) 71 | if X_train.shape[1] > lstm_model_config.learn_params_template['maxlen']: 72 | X_train = X_train.reshape(X_train.shape[0],X_train.shape[1]) 73 | X_valid = X_valid.reshape(X_valid.shape[0],X_valid.shape[1]) 74 | X_test = X_test.reshape(X_test.shape[0],X_test.shape[1]) 75 | 76 | X_train = X_train[:,:lstm_model_config.learn_params_template['maxlen']] 77 | X_valid = X_valid[:,:lstm_model_config.learn_params_template['maxlen']] 78 | X_test = X_test[:,:lstm_model_config.learn_params_template['maxlen']] 79 | 80 | X_train = X_train.reshape(X_train.shape[0],X_train.shape[1],1) 81 | X_valid = X_valid.reshape(X_valid.shape[0],X_valid.shape[1],1) 82 | X_test = X_test.reshape(X_test.shape[0],X_test.shape[1],1) 83 | test_lstm(X_train,y_train,X_valid,y_valid,X_test,y_test) 84 | -------------------------------------------------------------------------------- /models/dl/beauty/cnn_model.py: -------------------------------------------------------------------------------- 1 | import warnings 2 | from keras.models import Sequential 3 | from keras.layers.core import Dense, Dropout 4 | from keras.layers import Conv1D, MaxPooling1D, Flatten 5 | from keras.callbacks import EarlyStopping, ModelCheckpoint 6 | from keras.utils import np_utils 7 | from keras.optimizers import RMSprop, Adam, SGD 8 | from keras import regularizers 9 | from 
keras.layers.recurrent import LSTM 10 | from models.dl.beauty.cnn_model_config import learning_params_template 11 | from keras import optimizers 12 | 13 | def build_model(learn_params=learning_params_template, nb_classes= -1): 14 | input_length = learn_params["maxlen"] 15 | input_dim = learn_params["nb_features"] 16 | layers = learn_params["layers"] 17 | 18 | model = Sequential() 19 | 20 | maxlen = input_length 21 | max_features = input_dim 22 | 23 | if len(layers) == 0: 24 | raise("No layers") 25 | 26 | first_l = layers[0] 27 | rest_l = layers[1:] 28 | 29 | # First layer 30 | if first_l["name"] == 'dropout': 31 | model.add(Dropout(input_shape=(maxlen, max_features), rate=first_l['rate'])) 32 | elif first_l["name"] == 'conv': 33 | model.add(Conv1D(filters=first_l['filters'], 34 | kernel_size=first_l['kernel_size'], 35 | padding='valid', 36 | activation=first_l['activation'], 37 | strides=first_l['stride'], 38 | input_shape=(learn_params['input_length'],1))) 39 | 40 | # Middle layers (conv, dropout, pooling, dense, lstm.....) 41 | for l in rest_l: 42 | if l["name"] == 'maxpooling': 43 | model.add(MaxPooling1D(pool_size=l['pool_size'], padding='valid')) 44 | elif l["name"] == 'conv': 45 | model.add(Conv1D(filters=l['filters'], 46 | kernel_size=l['kernel_size'], 47 | padding='valid', 48 | activation=l['activation'], 49 | strides=l['stride'])) 50 | elif l["name"] == 'dropout': 51 | model.add(Dropout(rate=l['rate'])) 52 | elif l["name"] == 'lstm': 53 | model.add(LSTM(l['units'])) 54 | elif l["name"] == 'flatten': 55 | model.add(Flatten()) 56 | elif l["name"] == 'dense': 57 | if 'last' in l and l['last'] == True: 58 | l['units'] = nb_classes 59 | print(l['units']) 60 | if l['regularization'] > 0.0: 61 | module=Dense(units=l['units'], 62 | activation=l['activation'], 63 | kernel_regularizer=regularizers.l2(l['regularization']), 64 | activity_regularizer=regularizers.l1(l['regularization']) 65 | ) 66 | else: 67 | module=Dense(units=l['units'], activation=l['activation']) 68 | 69 | model.add(module) 70 | 71 | 72 | learn_params = learning_params_template 73 | if learn_params['optimizer'] == "sgd": 74 | optimizer = SGD(lr=learn_params['lr'], 75 | decay=learn_params['decay'], 76 | momentum=learn_params['momentum'], 77 | nesterov=True) 78 | elif learn_params['optimizer'] == "adam": 79 | optimizer = Adam(lr=learn_params['lr'], 80 | decay=learn_params['decay']) 81 | else: # elif learn_params['optimizer'] == "rmsprop": 82 | optimizer = RMSprop(lr=learn_params['lr'], 83 | decay=learn_params['decay']) 84 | metrics=['accuracy'] 85 | model.compile(loss="categorical_crossentropy", optimizer=optimizer, metrics=metrics) 86 | return model 87 | 88 | if __name__ == '__main__': 89 | import numpy as np 90 | model = build_model(nb_classes=200) 91 | print(model.summary()) -------------------------------------------------------------------------------- /models/dl/lstm_example.py: -------------------------------------------------------------------------------- 1 | __author__ = 'dk' 2 | import os 3 | #设置Tensorflow的日志等级 4 | #os.environ["TF_CPP_MIN_LOG_LEVEL"]='1' # 这是默认的显示等级,显示所有信息 5 | #os.environ["TF_CPP_MIN_LOG_LEVEL"]='2' # 只显示 warning 和 Error 6 | os.environ["TF_CPP_MIN_LOG_LEVEL"]='3' # 只显示 Error 7 | 8 | from attacks import CNN_model,DF_model,SDAE_model,LSTM_model 9 | from df import df_model_config 10 | 11 | from cnn import cnn_model_config 12 | from sdae import sdae_model_config 13 | from lstm import lstm_model_config 14 | 15 | from data_utils import 
LoadDataNoDefCW100,LoadDataNoDefCW,LoadDataWalkieTalkieCW,LoadDataApp 16 | 17 | #使用步骤 18 | #1. 修改各个模型的超参数,xx_model_config.py,把里面的输入向量和标签数改成自己所需要的 19 | #2. 读取数据,构造好训练集,验证集,测试集 20 | #3. build_model() 21 | #4. 调用fit() 22 | #5. 测试一下 23 | #6. 保存模型 24 | def test_cnn(X_train,y_train,X_valid,y_valid,X_test,y_test): 25 | model = CNN_model() 26 | model.build_model() 27 | model.fit(X_train=X_train,y_train=y_train,X_valid=X_valid,y_valid=y_valid, 28 | batch_size=cnn_model_config.learning_params_template['batch_size'], 29 | epochs=cnn_model_config.learning_params_template['nb_epochs']) 30 | model.save_model() 31 | score = model.evaluate(X_test=X_test,y_test=y_test) 32 | print('simple CNN accuracy :{0}'.format(score)) 33 | 34 | def test_df(X_train,y_train,X_valid,y_valid,X_test,y_test): 35 | 36 | model =DF_model() 37 | model.build_model() 38 | 39 | model.fit(X_train=X_train,y_train=y_train,X_valid=X_valid,y_valid=y_valid, 40 | batch_size=df_model_config.learning_params_template['batch_size'], 41 | epochs=df_model_config.learning_params_template['epoch']) 42 | model.save_model() 43 | score = model.evaluate(X_test=X_test,y_test=y_test) 44 | print('Deep Fingerprinting accuracy :{0}'.format(score)) 45 | 46 | 47 | 48 | def test_lstm(X_train,y_train,X_valid,y_valid,X_test,y_test): 49 | 50 | model = LSTM_model() 51 | model.build_model() 52 | model.fit(X_train=X_train,y_train=y_train,X_valid=X_valid,y_valid=y_valid, 53 | batch_size=lstm_model_config.learn_params_template['batch_size'], 54 | epochs=lstm_model_config.learn_params_template['nb_epochs']) 55 | model.save_model() 56 | score = model.evaluate(X_test=X_test,y_test=y_test) 57 | print('lstm accuracy :{0}'.format(score)) 58 | def test_sdae(X_train,y_train,X_valid,y_valid,X_test,y_test): 59 | model = SDAE_model() 60 | model.build_model() 61 | model.pre_train(x_train=X_train,x_test=X_test) 62 | 63 | model.fit(X_train=X_train,y_train=y_train,X_valid=X_valid,y_valid=y_valid, 64 | batch_size=sdae_model_config.learning_params_template['batch_size'], 65 | epochs=sdae_model_config.learning_params_template['nb_epochs']) 66 | model.save_model() 67 | score = model.evaluate(X_test=X_test,y_test=y_test) 68 | print('sdae accuracy :{0}'.format(score)) 69 | if __name__ == '__main__': 70 | X_train, y_train, X_valid, y_valid, X_test, y_test = LoadDataApp()#LoadDataWalkieTalkieCW() # LoadDataNoDefCW() 71 | #test_df(X_train=X_train,y_train=y_train,X_valid=X_valid,y_valid=y_valid,X_test=X_test,y_test=y_test) 72 | #test_cnn(X_train=X_train,y_train=y_train,X_valid=X_valid,y_valid=y_valid,X_test=X_test,y_test=y_test) 73 | #test_sdae(X_train,y_train,X_valid,y_valid,X_test,y_test) 74 | if X_train.shape[1] > lstm_model_config.learn_params_template['maxlen']: 75 | X_train = X_train.reshape(X_train.shape[0],X_train.shape[1]) 76 | X_valid = X_valid.reshape(X_valid.shape[0],X_valid.shape[1]) 77 | X_test = X_test.reshape(X_test.shape[0],X_test.shape[1]) 78 | 79 | X_train = X_train[:,:lstm_model_config.learn_params_template['maxlen']] 80 | X_valid = X_valid[:,:lstm_model_config.learn_params_template['maxlen']] 81 | X_test = X_test[:,:lstm_model_config.learn_params_template['maxlen']] 82 | 83 | X_train = X_train.reshape(X_train.shape[0],X_train.shape[1],1) 84 | X_valid = X_valid.reshape(X_valid.shape[0],X_valid.shape[1],1) 85 | X_test = X_test.reshape(X_test.shape[0],X_test.shape[1],1) 86 | test_lstm(X_train,y_train,X_valid,y_valid,X_test,y_test) 87 | -------------------------------------------------------------------------------- /models/dl/appnet/build_model.py: 
-------------------------------------------------------------------------------- 1 | __author__ = 'dk' 2 | import torch.nn as nn 3 | import torch as th 4 | 5 | class CNN_block(nn.Module): 6 | def __init__(self, embedding_dim, kernel_size=7, filter_num=256): 7 | super(CNN_block, self).__init__() 8 | self.kernel_size = kernel_size 9 | self.filter_num = filter_num 10 | 11 | self._1conv1d = nn.Conv1d(stride=1, 12 | kernel_size=kernel_size, 13 | in_channels=embedding_dim, 14 | out_channels=filter_num, 15 | padding=kernel_size//2) 16 | self._2maxpooling = nn.MaxPool1d(kernel_size=kernel_size, stride= kernel_size) 17 | 18 | self._3conv1d = nn.Conv1d(stride=1, 19 | kernel_size=kernel_size, 20 | in_channels=256, 21 | out_channels=filter_num, 22 | padding=kernel_size//2) 23 | 24 | self._4maxpooling = nn.MaxPool1d(kernel_size=kernel_size, stride= kernel_size) 25 | 26 | self._5conv1d = nn.Conv1d(stride=1, 27 | kernel_size=kernel_size, 28 | in_channels=256, 29 | out_channels=filter_num, 30 | padding=kernel_size//2) 31 | 32 | self._6maxpooling = nn.MaxPool1d(kernel_size=kernel_size*2, stride= 4) 33 | 34 | self._7flattern = nn.Flatten() 35 | 36 | def forward(self, x): 37 | 38 | batch_size, seq_len, embedding_dim = x.shape 39 | x = x.permute(0,2,1) 40 | 41 | x = self._1conv1d(x) 42 | x = self._2maxpooling(x) 43 | x = self._3conv1d(x) 44 | x = self._4maxpooling(x) 45 | #print(x.shape) 46 | x = self._5conv1d(x) 47 | x = self._6maxpooling(x) 48 | x = self._7flattern(x) 49 | #print('cnn x shape',x.shape) 50 | return x 51 | 52 | class AppNetModel(nn.Module): 53 | def __init__(self, payload_sz, payload_embed_sz, packet_nb, packet_embed_sz, class_nb, lstm_layer_nb=2): 54 | super(AppNetModel, self).__init__() 55 | self.payload_sz = payload_sz 56 | self.payload_embed_sz = payload_embed_sz 57 | self.packet_nb = packet_nb 58 | self.packet_embed_sz = packet_embed_sz 59 | 60 | self.payload_embed_layer = nn.Embedding(num_embeddings=512, embedding_dim=payload_embed_sz) 61 | self.packet_embed_layer = nn.Embedding(num_embeddings=3200, embedding_dim=packet_embed_sz) 62 | 63 | self.lstm_encoder = nn.LSTM(input_size = packet_embed_sz, 64 | hidden_size= packet_embed_sz, 65 | num_layers=lstm_layer_nb, 66 | bidirectional=True, batch_first=True) 67 | 68 | self.cnn_encoder = CNN_block(payload_embed_sz) 69 | 70 | self.fc = nn.Linear(in_features=5632, out_features= class_nb) 71 | self.dropout = nn.Dropout(p=0.1) 72 | 73 | def forward(self, packet_size, payload): 74 | batch_size= packet_size.shape[0] 75 | try: 76 | packet_embed = self.packet_embed_layer(packet_size) 77 | except BaseException as exp: 78 | print(packet_size) 79 | print(exp) 80 | #print('packet embed shape', packet_embed.shape) 81 | packet_vector, hidden = self.lstm_encoder(packet_embed) 82 | #print('packet vector shape',packet_vector.shape) 83 | packet_vector = packet_vector.reshape(batch_size, -1) 84 | 85 | payload_embed = self.payload_embed_layer(payload) 86 | payload_vector = self.cnn_encoder(payload_embed) 87 | 88 | representation = th.cat((packet_vector, payload_vector), dim=1) 89 | representation = self.dropout(self.fc(representation)) 90 | return representation -------------------------------------------------------------------------------- /models/dl/fsnet/main.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | import os 3 | import train 4 | import preprocess 5 | 6 | home = os.getcwd() 7 | record_dir = os.path.join(home, 'record') 8 | save_base = os.path.join(home, 'log') 9 | log_dir = 
os.path.join(save_base) 10 | data_dir = os.path.join(home, 'filter') 11 | pred_dir = os.path.join(home, 'result') 12 | 13 | for dirx in [save_base, record_dir, log_dir, data_dir, pred_dir]: 14 | if not os.path.exists(dirx): 15 | os.mkdir(dirx) 16 | 17 | train_record = os.path.join(record_dir, 'train.json') 18 | test_record = os.path.join(record_dir, 'test.json') 19 | train_meta = os.path.join(record_dir, 'train.meta') 20 | test_meta = os.path.join(record_dir, 'test.meta') 21 | status_label = os.path.join(data_dir, 'status.label') 22 | 23 | flags = tf.flags 24 | 25 | flags.DEFINE_string('train_json', train_record, 'the processed train json file') 26 | flags.DEFINE_string('test_json', test_record, 'the processed test json file') 27 | flags.DEFINE_string('train_meta', train_meta, 'the processed train number') 28 | flags.DEFINE_string('test_meta', test_meta, 'the processed test number') 29 | flags.DEFINE_string('log_dir', log_dir, 'where to save the log') 30 | flags.DEFINE_string('model_dir', log_dir, 'where to save the model') 31 | flags.DEFINE_string('data_dir', data_dir, 'where to read data') 32 | flags.DEFINE_integer('class_num', 53, 'the class number') 33 | flags.DEFINE_integer('length_block', 1, 'the length of a block') 34 | flags.DEFINE_integer('min_length', 2, 'the flow under this parameter will be filtered') 35 | flags.DEFINE_integer('max_packet_length', 1000, 'the largest packet length') 36 | flags.DEFINE_float('split_ratio', 0.8, 'ratio of train set of target app') 37 | flags.DEFINE_float('keep_ratio', 1, 'ratio of keeping the example (for small dataset test)') 38 | flags.DEFINE_integer('max_flow_length_train', 200, 'the max flow length, if larger, drop') 39 | flags.DEFINE_integer('max_flow_length_test', 1000, 'the max flow length, if larger, drop') 40 | flags.DEFINE_string('test_model_dir', log_dir, 'the model dir for test result') 41 | flags.DEFINE_string('pred_dir', pred_dir, 'the dir to save predict result') 42 | 43 | flags.DEFINE_integer('batch_size', 128, 'train batch size') 44 | flags.DEFINE_integer('hidden', 128, 'GRU dimension of hidden state') 45 | flags.DEFINE_integer('layer', 2, 'layer number of length RNN') 46 | flags.DEFINE_integer('length_dim', 16, 'dimension of length embedding') 47 | flags.DEFINE_string('length_num', 'auto', 'length_num') 48 | 49 | flags.DEFINE_float('keep_prob', 0.8, 'the keep probability for dropout') 50 | flags.DEFINE_float('learning_rate', 0.001, 'learning rate') 51 | flags.DEFINE_integer('iter_num', int(0.7e5), 'iteration number') 52 | flags.DEFINE_integer('eval_batch', 77, 'evaluated train batches') 53 | flags.DEFINE_integer('train_eval_batch', 77, 'evaluated train batches') 54 | flags.DEFINE_string('decay_step', 'auto', 'the decay step') 55 | flags.DEFINE_float('decay_rate', 0.5, 'the decay rate') 56 | 57 | flags.DEFINE_string('mode', 'train', 'model mode: train/prepro/test') 58 | flags.DEFINE_integer("capacity", int(1e3), "size of dataset shuffle") 59 | flags.DEFINE_integer("loss_save", 100, "step of saving loss") 60 | flags.DEFINE_integer("checkpoint", 5000, "checkpoint to save and evaluate the model") 61 | flags.DEFINE_float("grad_clip", 5.0, "Global Norm gradient clipping rate") 62 | 63 | flags.DEFINE_boolean('is_cudnn', True, 'whether take the cudnn gru') 64 | flags.DEFINE_float('rec_loss', 0.5, 'the parameter to control the reconstruction of length sequence') 65 | 66 | 67 | def main(_): 68 | config = flags.FLAGS 69 | if config.length_num == 'auto': 70 | config.length_num = config.max_packet_length // config.length_block + 4 71 | else: 
72 | config.length_num = int(config.length_num) 73 | if config.decay_step != 'auto': 74 | config.decay_step = int(config.decay_step) 75 | if config.mode == 'train': 76 | train.train(config) 77 | elif config.mode == 'prepro': 78 | preprocess.preprocess(config) 79 | elif config.mode == 'test': 80 | print(config.test_model_dir) 81 | train.predict(config) 82 | else: 83 | print('unknown mode, only support train now') 84 | raise Exception 85 | 86 | 87 | if __name__ == '__main__': 88 | tf.app.run() 89 | -------------------------------------------------------------------------------- /models/dl/df/generate_dataset.py: -------------------------------------------------------------------------------- 1 | __author__ = 'jmh081701' 2 | from flowcontainer.extractor import extract 3 | import os 4 | import json 5 | import requests 6 | import tqdm 7 | import threading 8 | def payload2packet_length(payload): 9 | rst = [] 10 | i = 0 11 | while i < len(payload): 12 | rst.append(int(payload[i:i+2],base=16)) 13 | i+= 2 14 | return rst 15 | 16 | def request_label(packet): 17 | url = 'http://172.31.251.82:8899/datacon' 18 | post = {"packet_length": packet} 19 | response = requests.post(url=url,json=post) 20 | #print(response.json()) 21 | return response.json()['label'] 22 | def traversal_training(dir): 23 | dataset= {} 24 | for _root, _dirs, _files in os.walk(dir): 25 | if len(_files)==0 : 26 | continue 27 | for file in tqdm.tqdm(_files): 28 | if '.pcap' not in file: 29 | continue 30 | label = file.split('_')[0] 31 | 32 | if label not in dataset: 33 | dataset[label] = [] 34 | path = _root + '/' + file 35 | flows = extract(path,extension=['tcp.payload','udp.payload'], filter='tcp or udp') 36 | for each in tqdm.tqdm(flows, desc=file): 37 | flow = flows[each] 38 | if 'tcp.payload' in flow.extension: 39 | payloads = flow.extension['tcp.payload'] 40 | else: 41 | payloads = flow.extension['udp.payload'] 42 | for payload, index in payloads: 43 | pkt_size= payload2packet_length(payload) 44 | dataset[label].append({ 45 | "packet_length": pkt_size 46 | }) 47 | 48 | for label in dataset: 49 | with open('datacon/'+label + '.json', 'w') as fp: 50 | json.dump(dataset[label],fp) 51 | print('dump ', label) 52 | 53 | def traversal_test(dir): 54 | log_file = 'test.log' 55 | rst_file = 'result.txt' 56 | for _root, _dirs, _files in os.walk(dir): 57 | if len(_files)==0 : 58 | continue 59 | for file in tqdm.tqdm(_files): 60 | if '.pcap' not in file: 61 | continue 62 | 63 | path = _root + '/' + file 64 | flows = extract(path,extension=['tcp.payload','udp.payload'], filter='tcp or udp') 65 | max_counter = 4096 66 | counter = 0 67 | label_counter = {} 68 | for each in tqdm.tqdm(flows, desc=file): 69 | flow = flows[each] 70 | if 'tcp.payload' in flow.extension: 71 | payloads = flow.extension['tcp.payload'] 72 | else: 73 | payloads = flow.extension['udp.payload'] 74 | 75 | packet_length = [] 76 | for payload, index in payloads[:256]: 77 | ##一个batch,一个batch的测试 78 | pkt_size= payload2packet_length(payload) 79 | 80 | if counter < max_counter or len(packet_length) == 0: 81 | packet_length.append(pkt_size) 82 | counter += 1 83 | 84 | _labels = request_label(packet=packet_length) 85 | for label in _labels: 86 | if label not in label_counter: 87 | label_counter[label] = 0 88 | label_counter[label] += 1 89 | 90 | label_counter = list(label_counter.items()) 91 | label_counter= sorted(label_counter, key= lambda x: x[1]) 92 | print('file: {0}, label counter: {1}, vote:{2}\n'.format(file, label_counter, label_counter[-1][0])) 93 | 94 | with 
open(log_file, 'a') as fp: 95 | fp.writelines('file: {0}, label counter: {1}, vote:{2}\n'.format(file, label_counter, label_counter[-1][0])) 96 | 97 | with open(rst_file,'a') as fp: 98 | fp.writelines('{0} {1}\n'.format(file, label_counter[-1][0])) 99 | 100 | 101 | if __name__ == '__main__': 102 | #traversal_training(r'G:\chromeDownload\datacon2021_traffic_eta_part1\part1\sample') 103 | traversal_test(dir=r'G:\chromeDownload\datacon2021_traffic_eta_part1\part1\real_data') 104 | -------------------------------------------------------------------------------- /models/dl/df_only_D/generate_dataset.py: -------------------------------------------------------------------------------- 1 | __author__ = 'jmh081701' 2 | from flowcontainer.extractor import extract 3 | import os 4 | import json 5 | import requests 6 | import tqdm 7 | import threading 8 | def payload2packet_length(payload): 9 | rst = [] 10 | i = 0 11 | while i < len(payload): 12 | rst.append(int(payload[i:i+2],base=16)) 13 | i+= 2 14 | return rst 15 | 16 | def request_label(packet): 17 | url = 'http://172.31.251.82:8899/datacon' 18 | post = {"packet_length": packet} 19 | response = requests.post(url=url,json=post) 20 | #print(response.json()) 21 | return response.json()['label'] 22 | def traversal_training(dir): 23 | dataset= {} 24 | for _root, _dirs, _files in os.walk(dir): 25 | if len(_files)==0 : 26 | continue 27 | for file in tqdm.tqdm(_files): 28 | if '.pcap' not in file: 29 | continue 30 | label = file.split('_')[0] 31 | 32 | if label not in dataset: 33 | dataset[label] = [] 34 | path = _root + '/' + file 35 | flows = extract(path,extension=['tcp.payload','udp.payload'], filter='tcp or udp') 36 | for each in tqdm.tqdm(flows, desc=file): 37 | flow = flows[each] 38 | if 'tcp.payload' in flow.extension: 39 | payloads = flow.extension['tcp.payload'] 40 | else: 41 | payloads = flow.extension['udp.payload'] 42 | for payload, index in payloads: 43 | pkt_size= payload2packet_length(payload) 44 | dataset[label].append({ 45 | "packet_length": pkt_size 46 | }) 47 | 48 | for label in dataset: 49 | with open('datacon/'+label + '.json', 'w') as fp: 50 | json.dump(dataset[label],fp) 51 | print('dump ', label) 52 | 53 | def traversal_test(dir): 54 | log_file = 'test.log' 55 | rst_file = 'result.txt' 56 | for _root, _dirs, _files in os.walk(dir): 57 | if len(_files)==0 : 58 | continue 59 | for file in tqdm.tqdm(_files): 60 | if '.pcap' not in file: 61 | continue 62 | 63 | path = _root + '/' + file 64 | flows = extract(path,extension=['tcp.payload','udp.payload'], filter='tcp or udp') 65 | max_counter = 4096 66 | counter = 0 67 | label_counter = {} 68 | for each in tqdm.tqdm(flows, desc=file): 69 | flow = flows[each] 70 | if 'tcp.payload' in flow.extension: 71 | payloads = flow.extension['tcp.payload'] 72 | else: 73 | payloads = flow.extension['udp.payload'] 74 | 75 | packet_length = [] 76 | for payload, index in payloads[:256]: 77 | ##一个batch,一个batch的测试 78 | pkt_size= payload2packet_length(payload) 79 | 80 | if counter < max_counter or len(packet_length) == 0: 81 | packet_length.append(pkt_size) 82 | counter += 1 83 | 84 | _labels = request_label(packet=packet_length) 85 | for label in _labels: 86 | if label not in label_counter: 87 | label_counter[label] = 0 88 | label_counter[label] += 1 89 | 90 | label_counter = list(label_counter.items()) 91 | label_counter= sorted(label_counter, key= lambda x: x[1]) 92 | print('file: {0}, label counter: {1}, vote:{2}\n'.format(file, label_counter, label_counter[-1][0])) 93 | 94 | with open(log_file, 'a') as 
fp: 95 | fp.writelines('file: {0}, label counter: {1}, vote:{2}\n'.format(file, label_counter, label_counter[-1][0])) 96 | 97 | with open(rst_file,'a') as fp: 98 | fp.writelines('{0} {1}\n'.format(file, label_counter[-1][0])) 99 | 100 | 101 | if __name__ == '__main__': 102 | #traversal_training(r'G:\chromeDownload\datacon2021_traffic_eta_part1\part1\sample') 103 | traversal_test(dir=r'G:\chromeDownload\datacon2021_traffic_eta_part1\part1\real_data') 104 | -------------------------------------------------------------------------------- /models/ml/cumul/util.py: -------------------------------------------------------------------------------- 1 | __author__ = 'jmh081701' 2 | #特征提取 3 | ############ 4 | import numpy as np 5 | import os 6 | import sys 7 | import copy 8 | from src.df.src.utility import LoadDataWakieTalkie_Single_DataSet 9 | class CUMUL_datagenerator: 10 | 11 | def __init__(self, 12 | feature_length=100,min=-2305,max=2305, 13 | equidistance=None,cell_size=512,is_train=False): 14 | self.feature_length = feature_length #cumul模型的输入向量的长度,默认是100 15 | self.cell_size = cell_size #Tor的cell的大小 16 | self.equidistance = None #采样的间距 17 | 18 | #标准化的参数 19 | self.min = min 20 | self.max = max 21 | ##############训练模型使用的数据 22 | 23 | self.is_train = is_train 24 | 25 | self.train_X = None 26 | self.train_y = None 27 | 28 | self.valid_X = None 29 | self.valid_y = None 30 | 31 | self.test_X = None 32 | self.test_y = None 33 | 34 | if is_train: 35 | self.load_tor_cell_sequence() 36 | 37 | def feature_extract(self,trace_sequence,cell_size=None): 38 | """feature_extract() : 从[-1,1,1...]的cell的方向序列中,生成CUMUL模型所需的特征向量 39 | 40 | :param trace_sequence: `numpy.narray` ,形状:batch_size * trace_length 41 | 输入的[-1,1,1...]向量,-1表示ingoing的cell,+1表示outgoing的流 42 | :param cell_size: 每个cell的大小,默认None,因为最后还得归一化 43 | :return: 44 | """ 45 | if cell_size == None: 46 | cell_size = 1 47 | if not isinstance(type(trace_sequence),np.ndarray): 48 | trace_sequence = np.array(trace_sequence) 49 | shape = trace_sequence.shape 50 | culmulative_sum_a = np.zeros(shape=shape,dtype = np.float) 51 | culmulative_sum_c = np.zeros(shape=shape,dtype = np.float) 52 | xp = np.linspace(0,shape[1]-1,shape[1]) 53 | features = np.zeros(shape=(shape[0],2*self.feature_length),dtype = np.float) 54 | #计算累计和 55 | for i in range(0,shape[0]): 56 | for j in range(1,shape[1]): 57 | culmulative_sum_a[i,j] += culmulative_sum_a[i,j-1] + abs(trace_sequence[i,j]) 58 | culmulative_sum_c[i,j] += culmulative_sum_c[i,j-1] + trace_sequence[i,j] 59 | #加上cell_size 60 | culmulative_sum_a = cell_size * culmulative_sum_a 61 | culmulative_sum_c = cell_size * culmulative_sum_c 62 | 63 | #线性采样n个特征 64 | if self.equidistance != None: 65 | equidistance = self.equidistance 66 | else: 67 | equidistance = (shape[1]-1)/self.feature_length 68 | xval = np.arange(0,equidistance * self.feature_length,equidistance) 69 | for i in range(shape[0]): 70 | #print(i,culmulative_sum_a[i]) 71 | #print(i,culmulative_sum_c[i]) 72 | a_interp = (np.interp(xval,xp,culmulative_sum_a[i])-self.min)/(self.max-self.min) 73 | c_interp = (np.interp(xval,xp,culmulative_sum_c[i])-self.min)/(self.max-self.min) 74 | 75 | features[i,0:2*self.feature_length:2]=copy.deepcopy(a_interp) 76 | features[i,1:2*self.feature_length:2]=copy.deepcopy(c_interp) 77 | #print(i,features[i]) 78 | #print('#'*30) 79 | return features 80 | 81 | def load_tor_cell_sequence(self): 82 | if not self.is_train : 83 | return 84 | 85 | _,__,self.train_X,self.train_y = 
LoadDataWakieTalkie_Single_DataSet('train',is_cluster=False,normalized=False) 86 | _,__,self.valid_X,self.valid_y = LoadDataWakieTalkie_Single_DataSet('valid',is_cluster=False,normalized=False) 87 | _,__,self.test_X,self.test_y = LoadDataWakieTalkie_Single_DataSet('test',is_cluster=False,normalized=False) 88 | 89 | self.train_X = self.feature_extract(self.train_X) 90 | print('feature extract....') 91 | self.valid_X =self.feature_extract(self.valid_X) 92 | self.test_X = self.feature_extract(self.test_X) 93 | 94 | print('Load tor cell sequence dataset well.') 95 | print('X train shape:',self.train_X.shape) 96 | print('y train shape:',self.train_y.shape) 97 | print('X valid shape:',self.valid_X.shape) 98 | print('y valid shape:',self.valid_y.shape) 99 | print('X test shape:',self.test_X.shape) 100 | print('y test shape:',self.test_y.shape) 101 | 102 | def trainSet(self): 103 | return self.train_X,self.train_y 104 | def validSet(self): 105 | return self.valid_X,self.valid_y 106 | def testSet(self): 107 | return self.test_X,self.test_y 108 | 109 | 110 | if __name__ == '__main__': 111 | dator = CUMUL_datagenerator(is_train=True) 112 | print(dator.test_X[1],dator.test_y[1]) 113 | -------------------------------------------------------------------------------- /models/dl/graphDapp/train.py: -------------------------------------------------------------------------------- 1 | __author__ = 'dk' 2 | import numpy as np 3 | from models.dl.graphDapp import logger_wrappers 4 | import torch as th 5 | from torch import nn 6 | from torch import optim 7 | from torch.nn import functional as F 8 | import tqdm 9 | 10 | from sklearn.metrics import classification_report 11 | from models.dl.graphDapp.model_seriealization import save,load 12 | from models.dl.graphDapp.data_builder import Dataset_fgnet 13 | from models.dl.graphDapp.DApp_Classifier import DApp_classifier 14 | from models.dl.graphDapp.graphDapp_config import config 15 | use_gpu = th.cuda.is_available() 16 | if use_gpu : 17 | device_id = config['device_id'] 18 | device = device_id 19 | else: 20 | device= "cpu" 21 | 22 | def main(dataset_name, modelpath, max_epoch=config['max_epoch']): 23 | data_loader = Dataset_fgnet(raw_dir='', dumpfile= dataset_name,renew=False) 24 | print(data_loader) 25 | model = DApp_classifier(nb_classes=len(data_loader.labelname), 26 | gin_layer_num= config['gin_layer_num'], 27 | gin_hidden_units=config['gin_hidden_units'], 28 | iteration_nums=config['iteration_nums'], 29 | iteration_first=True, 30 | device= device,use_gpu= use_gpu) 31 | loss_func = nn.CrossEntropyLoss() 32 | optimizer = optim.Adam(params=model.parameters(),lr=config['learning_rate']) 33 | #model = load(model,optimizer=optimizer,checkpoint_path=modelpath) 34 | if use_gpu: 35 | model = model.cuda(device) 36 | loss_func = loss_func.cuda(device) 37 | 38 | #训练 39 | model.train() 40 | epoch_losses = [] 41 | epoch_acces = [] 42 | batch_size = config['batch_size'] 43 | 44 | for epoch in tqdm.trange(max_epoch): 45 | epoch_loss = 0 46 | iter = 0 47 | while data_loader.epoch_num == epoch: 48 | graphs,labels= data_loader.next_train_batch(batch_size) 49 | if use_gpu : 50 | graphs = graphs.to(th.device(device)) 51 | labels = labels.to(th.device(device)) 52 | predict_label = model(graphs) 53 | #print(predict_label.size()) 54 | #print(labels.size()) 55 | loss = loss_func(predict_label,labels) 56 | optimizer.zero_grad() 57 | loss.backward() 58 | optimizer.step() 59 | if use_gpu: 60 | lv= loss.detach().item() 61 | else: 62 | lv = loss.detach().cpu().item() 63 | epoch_loss += lv 
64 | iter +=1 65 | #print('Inner loss: {:.4f},Train Watch:{}'.format(lv,data_loader.train_watch)) 66 | #epoch_losses.append(lv) 67 | epoch_loss /= (iter+0.0000001) 68 | info='Epoch {}, loss: {:.4f}'.format(epoch,epoch_loss) 69 | logger_wrappers.warning(info) 70 | epoch_losses.append(epoch_loss) 71 | #测试一下: 72 | graphs,labels = data_loader.next_valid_batch(batch_size=batch_size) 73 | if use_gpu : 74 | graphs = graphs.to(th.device(device)) 75 | labels = labels.to(th.device(device)) 76 | predict_labels = model(graphs) 77 | predict_labels = F.softmax(predict_labels,1) 78 | argmax_labels = th.argmax(predict_labels,1) 79 | #print('pred:', argmax_labels) 80 | #print('real:', labels) 81 | acc = (labels == argmax_labels).float().sum().item() / len(labels) * 100 82 | info='Accuracy of argmax predictions on the valid set: {:4f}%'.format( 83 | acc) 84 | epoch_acces.append(acc) 85 | logger_wrappers.info(info) 86 | ###保存一下模型 87 | save(model,optimizer,checkpoint_path=modelpath) 88 | model.eval() 89 | acc_list =[] 90 | y_pred= [] 91 | y_ture = [] 92 | 93 | for subset in range(len(data_loader.test_set)//batch_size): 94 | graphs,labels = data_loader.next_test_batch(batch_size=batch_size) 95 | if use_gpu : 96 | graphs = graphs.to(th.device(device)) 97 | labels = labels.to(th.device(device)) 98 | predict_labels = model(graphs) 99 | predict_labels = F.softmax(predict_labels,1) 100 | argmax_labels = th.argmax(predict_labels,1) 101 | y_pred += argmax_labels.tolist() 102 | y_ture += labels.tolist() 103 | acc = (labels == argmax_labels).float().sum().item() / len(labels) * 100 104 | acc_list.append(acc) 105 | info='Accuracy of argmax predictions on the test subset{1}: {0:4f}%'.format(acc,subset) 106 | logger_wrappers.info(info) 107 | info = 'Average Accuracy on entire test set:{:0.4f}%'.format(np.mean(acc_list)) 108 | logger_wrappers.info(info) 109 | print(classification_report(y_pred=y_pred,y_true=y_ture, digits=5)) 110 | -------------------------------------------------------------------------------- /models/ml/bind/README.md: -------------------------------------------------------------------------------- 1 | # 说明 2 | 本文件夹是对BIND论文里面提到的方法进行复现. 
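For orientation, the vectorization procedure described at the end of this README (a global per-family key dictionary trimmed to the most frequent keys, per-flow counts over those keys normalized by the flow's own total, then concatenation of the seven feature families) can be sketched as follows. This is a minimal, illustrative sketch rather than the code in this folder: the helper names `build_dictionaries`/`vectorize` and the default `top_n=512` are assumptions taken from the example in the text, and the key names follow the sample flow shown below.

```python
from collections import Counter

# the seven BIND feature families, named as in the sample flow below
FEATURE_KEYS = ['Dn-Up-size', 'Dn-Up-time', 'Up-Dn-size', 'Up-Dn-time',
                'Uni-size', 'Uni-time', 'Pkt-size']

def build_dictionaries(flows, top_n=512):
    """Scan every flow's raw BIND features once and keep, per family,
    the top_n keys that occur most often across the whole dataset."""
    dicts = {}
    for name in FEATURE_KEYS:
        counter = Counter()
        for flow in flows:                      # flow: one raw BIND feature dict
            counter.update(flow.get(name, {}))  # adds this flow's counts per key
        dicts[name] = [k for k, _ in counter.most_common(top_n)]
    return dicts

def vectorize(flow, dicts):
    """Map one flow onto the global dictionaries: for each family, a vector of
    key counts divided by the flow's total count for that family, concatenated."""
    vector = []
    for name in FEATURE_KEYS:
        feats = flow.get(name, {})
        total = float(sum(feats.values())) or 1.0   # avoid division by zero for an empty family
        vector += [feats.get(k, 0) / total for k in dicts[name]]
    return vector
```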
3 | 论文引用bibtex: 4 | ``` 5 | @inproceedings{al2016adaptive, 6 | title={Adaptive encrypted traffic fingerprinting with bi-directional dependence}, 7 | author={Al-Naami, Khaled and Chandra, Swarup and Mustafa, Ahmad and Khan, Latifur and Lin, Zhiqiang and Hamlen, Kevin and Thuraisingham, Bhavani}, 8 | booktitle={Proceedings of the 32nd Annual Conference on Computer Security Applications}, 9 | pages={177--188}, 10 | year={2016} 11 | } 12 | ``` 13 | # 数据集样式 14 | 向量化前,每条流具有的特征: 15 | ``` 16 | BIND_Feature_Raw={ 17 | 'Dn-Up-Burst-Size':{(x,y):counter},#先Down再Up的burst size依次为x,y的次数 18 | 'Dn-Up-Burst-Time':{(x,y):counter},#先Down再Up的burst 持续时间依次为x,y的次数,(只保留2位有效数字) 19 | 'Up-Dn-Burst-Size':{(x,y):counter},#先Up再Down的burst size依次为x,y的次数 20 | 'Up-Dn-Burst-Time':{(x,y):counter},#先Up再Down的burst 持续时间依次为x,y的次数,(只保留2位有效数字) 21 | 'Uni-Burst-Size':{x:counter},#单个burst size的出现次数的统计 22 | 'Uni-Burst-Time':{x:counter},#单个burst size持续时间的统计 23 | 'Pkt-Size':{x:counter},#包长的出现次数的统计 24 | } 25 | ``` 26 | 样例: 27 | ``` 28 | {'Dn-Up-size': {(1425, 176): 1, (2317, 184): 1, (2988, 191): 1, (2014, 197): 1, (1862, 173): 1, (7616, 191): 1, (4685, 192): 1, (2832, 185): 1, (2301, 197): 1, (5750, 180): 1, (2565, 200): 1, (3803, 178): 1, (3816, 181): 1, (2485, 181): 2, (2178, 200): 1, (2578, 200): 1, (2571, 199): 1, (2676, 186): 1, (2335, 185): 1, (4167, 200): 1, (5062, 200): 1, (5062, 167): 1, (2499, 167): 1, (2499, 182): 1, (2729, 200): 1, (2619, 192): 1, (4019, 211): 1, (2612, 211): 1, (2612, 210): 1, (3812, 194): 1, (3010, 194): 1, (3004, 169): 1, (2854, 177): 1, (3442, 177): 1, (3449, 178): 1, (2261, 183): 1, (1343, 183): 1, (1343, 169): 1}, 'Dn-Up-time': {(0.02, 1.03): 1, (0.08, 0.22): 1, (0.02, 0.3): 1, (0.05, 0.81): 1, (0.05, -0.62): 1, (0.01, 0.11): 2, (0.02, 0.0): 1, (0.07, 0.02): 1, (0.03, 0.76): 1, (0.0, 0.07): 1, (0.03, 0.12): 1, (0.08, 0.1): 1, (0.0, 0.04): 1, (0.01, 0.06): 1, (0.02, 0.69): 1, (0.0, -0.51): 1, (0.03, 0.05): 1, (0.04, 0.01): 1, (0.0, 0.11): 1, (0.04, 0.13): 1, (0.0, 0.0): 1, (0.11, 0.0): 2, (0.0, 0.09): 1, (0.01, 0.01): 2, (0.04, 0.09): 1, (0.3, 0.27): 1, (-0.42, 0.0): 1, (0.11, 0.01): 1, (0.1, 0.0): 1, (0.03, 0.06): 1, (0.0, 0.05): 1, (0.09, 0.0): 1, (0.0, 0.1): 1, (0.01, 0.59): 1, (0.36, -0.68): 1, (0.01, 1.32): 1}, 'Up-Dn-size': {(167, 1425): 1, (176, 2317): 1, (184, 2988): 1, (191, 2014): 1, (197, 1862): 1, (173, 7616): 1, (191, 4685): 1, (192, 2832): 1, (185, 2301): 1, (197, 5750): 1, (180, 2565): 1, (200, 3803): 1, (178, 3816): 1, (181, 2485): 2, (181, 2178): 1, (200, 2578): 1, (200, 2571): 1, (199, 2676): 1, (186, 2335): 1, (185, 4167): 1, (200, 5062): 2, (167, 2499): 2, (182, 2729): 1, (200, 2619): 1, (192, 4019): 1, (211, 2612): 2, (210, 3812): 1, (194, 3010): 1, (194, 3004): 1, (169, 2854): 1, (177, 3442): 1, (177, 3449): 1, (178, 2261): 1, (183, 1343): 2, (169, 2570): 1}, 'Up-Dn-time': {(0.0, 0.02): 1, (1.03, 0.08): 1, (0.22, 0.02): 1, (0.3, 0.05): 1, (0.81, 0.05): 1, (-0.62, 0.01): 1, (0.11, 0.01): 1, (0.11, 0.02): 1, (0.0, 0.07): 1, (0.02, 0.03): 1, (0.76, 0.0): 1, (0.07, 0.03): 1, (0.12, 0.08): 1, (0.1, 0.0): 1, (0.04, 0.01): 1, (0.06, 0.02): 1, (0.69, 0.0): 1, (-0.51, 0.03): 1, (0.05, 0.04): 1, (0.01, 0.0): 1, (0.11, 0.04): 1, (0.13, 0.0): 1, (0.0, 0.11): 2, (0.0, 0.0): 2, (0.09, 0.01): 1, (0.01, 0.04): 1, (0.09, 0.3): 1, (0.27, -0.42): 1, (0.0, 0.01): 1, (0.01, 0.11): 1, (0.01, 0.1): 1, (0.0, 0.03): 1, (0.06, 0.0): 1, (0.05, 0.09): 1, (0.1, 0.01): 1, (0.59, 0.36): 1, (-0.68, 0.01): 1, (1.32, 0.02): 1}, 'Uni-size': {167: 3, 1425: 1, 176: 1, 2317: 1, 184: 1, 2988: 1, 191: 2, 2014: 1, 197: 2, 
1862: 1, 173: 1, 7616: 1, 4685: 1, 192: 2, 2832: 1, 185: 2, 2301: 1, 5750: 1, 180: 1, 2565: 1, 200: 6, 3803: 1, 178: 2, 3816: 1, 181: 3, 2485: 2, 2178: 1, 2578: 1, 2571: 1, 199: 1, 2676: 1, 186: 1, 2335: 1, 4167: 1, 5062: 2, 2499: 2, 182: 1, 2729: 1, 2619: 1, 4019: 1, 211: 2, 2612: 2, 210: 1, 3812: 1, 194: 2, 3010: 1, 3004: 1, 169: 2, 2854: 1, 177: 2, 3442: 1, 3449: 1, 2261: 1, 183: 2, 1343: 2, 2570: 1}, 'Uni-time': {0.0: 16, 0.02: 6, 1.03: 1, 0.08: 2, 0.22: 1, 0.3: 2, 0.05: 4, 0.81: 1, -0.62: 1, 0.01: 11, 0.11: 6, 0.07: 2, 0.03: 4, 0.76: 1, 0.12: 1, 0.1: 3, 0.04: 4, 0.06: 2, 0.69: 1, -0.51: 1, 0.13: 1, 0.09: 3, 0.27: 1, -0.42: 1, 0.59: 1, 0.36: 1, -0.68: 1, 1.32: 1}, 'Pkt-size': {167: 3, 1425: 1, 176: 1, 1448: 56, 4: 18, 8: 2, 857: 1, 184: 1, 9: 15, 78: 1, 5: 15, 191: 2, 566: 1, 197: 2, 414: 1, 173: 1, 14: 1, 1265: 1, 545: 1, 333: 1, 192: 2, 1375: 1, 185: 2, 600: 8, 253: 1, 206: 1, 180: 1, 1103: 1, 200: 6, 889: 1, 178: 2, 320: 1, 181: 3, 1023: 1, 1033: 1, 721: 1, 1121: 1, 1114: 1, 199: 1, 1219: 1, 186: 1, 878: 1, 1258: 1, 16: 1, 689: 1, 702: 1, 1042: 1, 1037: 1, 182: 1, 1267: 1, 1157: 1, 519: 1, 211: 2, 1150: 2, 210: 1, 861: 1, 903: 1, 194: 2, 17: 2, 93: 1, 956: 1, 169: 2, 806: 1, 177: 2, 542: 1, 532: 1, 799: 1, 183: 2, 1343: 2, 1113: 1}} 29 | 30 | ``` 31 | 向量化的过程: 32 | - 1. 获取全局的字典: 33 | 线性遍历每条流的BIND-Feature-Raw,获取7类特征的字典,字典反映了各类特征可能出现那些取值。 34 | 35 | 如果取值太大了,可能安装频率的高低只保留频率TOPN的一些取值。 36 | 37 | - 2. 向量化 38 | 根据全局的特征字典,只保留每条流里面存在于全局特征字典key里面的key. 39 | 例如,假设全局化字典里面的'Dn-Up-Burst-Size'特征,一共保留了512个key. 40 | 那么每条流的'Dn-Up-Burst-Size'特征就是一个512维的向量,其中第i个向量的取值表示第i个key在这条流出现的次数。 41 | 为了防止流的长度为特征取值的影响,这512维特征会除以这条流Dn-Up-Burst-Size里面所有value的总和。 42 | 43 | 最后把7类特征拼接起来,得到最终的向量特征。 44 | -------------------------------------------------------------------------------- /models/dl/df/df_main_model.py: -------------------------------------------------------------------------------- 1 | __author__ = 'dk' 2 | from models.dl.attacks import DF_model, parser_raw_data 3 | from models.dl.df import df_model_config 4 | from models.model_base import abs_model 5 | import os 6 | from config import raw_dataset_base 7 | from keras.utils import np_utils 8 | import numpy as np 9 | os.environ['CUDA_VISBALE_DEIVCES'] ='cuda:2' 10 | class model(abs_model): 11 | def __init__(self, dataset, randseed, splitrate): 12 | super(model,self).__init__('df',randseed= randseed) 13 | if os.path.exists(self.database) == False: 14 | os.makedirs(self.database,exist_ok=True) 15 | 16 | self.dataset = dataset 17 | self.model = self.database + '/'+ self.name + '_' + dataset + '_model' 18 | self.data = self.database + '/'+ self.name + '_' + dataset + '/' 19 | self.splitrate = splitrate 20 | #原始数据集目录 21 | full_rdata = raw_dataset_base + self.dataset 22 | self.full_rdata = full_rdata 23 | 24 | if self.data_exists() == False: 25 | self.parser_raw_data() 26 | 27 | self.df_model = None 28 | def parser_raw_data(self): 29 | full_rdata = self.full_rdata 30 | if os.path.exists(full_rdata) == False: 31 | raise OSError('Dataset {0} (full path: {1}) does not exist!'.format(self.dataset,full_rdata)) 32 | os.makedirs(self.data, exist_ok=True) 33 | ##从原始数据集构建DF所需的数据集 34 | X_train,y_train, X_valid, y_valid, X_test, y_test = parser_raw_data(self, self.full_rdata, max_len = df_model_config.learning_params_template['in_dim']) 35 | 36 | self.save_data(X_train,y_train, X_valid, y_valid, X_test, y_test) 37 | 38 | 39 | def train(self): 40 | X_train,y_train, X_valid, y_valid, X_test, y_test = self.load_data() 41 | num_class = self.num_classes() 42 | 
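# the class count configured in df_model_config is overwritten here with the number of labels actually present in the prepared dataset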
df_model_config.nb_classes_template = num_class 43 | 44 | y_train = np_utils.to_categorical(y_train, num_classes=num_class) 45 | y_valid = np_utils.to_categorical(y_valid, num_classes=num_class) 46 | y_test = np_utils.to_categorical(y_test, num_classes= self.num_classes()) 47 | 48 | X_train = X_train[:, :,np.newaxis] 49 | X_valid = X_valid[:, :,np.newaxis] 50 | X_test = X_test[:, :,np.newaxis] 51 | 52 | df_model = DF_model(num_class = num_class) 53 | df_model.build_model() 54 | 55 | df_model.fit(X_train=X_train,y_train=y_train, 56 | X_valid= X_valid, y_valid = y_valid, 57 | batch_size= df_model_config.learning_params_template['batch_size'], 58 | epochs=df_model_config.learning_params_template['epoch']) 59 | 60 | df_model.save_model(path=self.model) 61 | score = df_model.evaluate(X_test=X_test, y_test = y_test) 62 | print('[Deep Fingerprinting Test on {0} accuracy {1}'.format(self.dataset, score)) 63 | def test(self): 64 | X_train,y_train, X_valid, y_valid, X_test, y_test = self.load_data() 65 | y_test = np_utils.to_categorical(y_test, num_classes= self.num_classes()) 66 | X_test = X_test[:, :,np.newaxis] 67 | 68 | df_model = DF_model(num_class= self.num_classes()) 69 | df_model.load_model(self.model) 70 | score = df_model.evaluate(X_test=X_test,y_test=y_test) 71 | print('Deep Fingerprinting Test on {0} accuracy :{1}'.format(self.dataset,score)) 72 | 73 | def predict(self,pkt_size): 74 | def pad_sequence(x, max_len, pad_value=0): 75 | r = x + [pad_value] * (max_len - len(x)) 76 | return r[:max_len] 77 | 78 | if self.df_model == None: 79 | self.df_model = DF_model(num_class= self.num_classes()) 80 | self.df_model.load_model(self.model) 81 | 82 | x = [pad_sequence(_pkt_size, max_len= df_model_config.learning_params_template['in_dim']) for _pkt_size in pkt_size] 83 | x = np.array(x)[:, :,np.newaxis] 84 | y_logit = self.df_model.predict(x, actual_lable=True) 85 | return y_logit.tolist() 86 | def get_feature(self): 87 | X_train,y_train, X_valid, y_valid, X_test, y_test = self.load_data() 88 | #y_test = np_utils.to_categorical(y_test, num_classes= self.num_classes()) 89 | X_test = X_test[:5000] 90 | X_test = X_test[:, :,np.newaxis] 91 | 92 | df_model = DF_model(num_class= self.num_classes()) 93 | df_model.load_model(self.model) 94 | logit, feature = df_model.predict(X_test=X_test,actual_lable=False, return_feature=True) 95 | print(feature.shape, logit.shape) 96 | logit = logit.tolist() 97 | feature = feature.tolist() 98 | #feature = logit 99 | y_true = y_test[:5000].tolist() 100 | feature_set = {} 101 | feature_vector = [] 102 | for i in range(len(y_true)): 103 | if y_true[i] not in feature_set: 104 | feature_set[y_true[i]] = [] 105 | feature_set[y_true[i]].append([feature[i]]) 106 | import pickle 107 | with open('feature_set_D1_53_DF.pkl','wb') as fp: 108 | pickle.dump(feature_set, fp) 109 | print(y_true[-1],logit[-1]) 110 | print(feature[-1]) 111 | if __name__ == '__main__': 112 | for test_rate in [0.1]: 113 | print(test_rate) 114 | dataset='app150' 115 | df_model = model(dataset, randseed= 128, splitrate=test_rate) 116 | #df_model.parser_raw_data() 117 | df_model.train() 118 | df_model.test() 119 | print(dataset) 120 | print(test_rate) 121 | #import os 122 | #os.remove(df_model.model) 123 | #df_model.get_feature() 124 | break 125 | -------------------------------------------------------------------------------- /models/dl/df/df_model.py: -------------------------------------------------------------------------------- 1 | # DF model, 2 | # This code is to implement deep fingerprinting model for 
website fingerprinting attacks 3 | # ACM Reference Formant 4 | # Payap Sirinam, Mohsen Imani, Marc Juarez, and Matthew Wright. 2018. 5 | # Deep Fingerprinting: Undermining Website Fingerprinting Defenses with Deep Learning. 6 | # In 2018 ACM SIGSAC Conference on Computer and Communications Security (CCS ’18), 7 | # October 15–19, 2018, Toronto, ON, Canada. ACM, New York, NY, USA, 16 pages. 8 | # https://doi.org/10.1145/3243734.3243768 9 | from keras.models import Sequential 10 | from keras.layers import Conv1D, MaxPooling1D, BatchNormalization 11 | from keras.layers.core import Activation, Flatten, Dense, Dropout 12 | from keras.layers.advanced_activations import ELU 13 | from keras.initializers import glorot_uniform 14 | 15 | from .df_model_config import learning_params_template,nb_classes_template 16 | from keras.optimizers import Adamax 17 | def build_model(input_shape=(learning_params_template['in_dim'],1), classes=nb_classes_template): 18 | model = Sequential() 19 | #Block1 20 | filter_num = ['None',32,64,128,256] 21 | kernel_size = ['None',8,8,8,8] 22 | conv_stride_size = ['None',1,1,1,1] 23 | pool_stride_size = ['None',4,4,4,4] 24 | pool_size = ['None',8,8,8,8] 25 | 26 | model.add(Conv1D(filters=filter_num[1], kernel_size=kernel_size[1], input_shape=input_shape, 27 | strides=conv_stride_size[1], padding='same', 28 | name='block1_conv1')) 29 | model.add(BatchNormalization(axis=-1)) 30 | model.add(ELU(alpha=1.0, name='block1_adv_act1')) 31 | model.add(Conv1D(filters=filter_num[1], kernel_size=kernel_size[1], 32 | strides=conv_stride_size[1], padding='same', 33 | name='block1_conv2')) 34 | model.add(BatchNormalization(axis=-1)) 35 | model.add(ELU(alpha=1.0, name='block1_adv_act2')) 36 | model.add(MaxPooling1D(pool_size=pool_size[1], strides=pool_stride_size[1], 37 | padding='same', name='block1_pool')) 38 | model.add(Dropout(0.1, name='block1_dropout')) 39 | 40 | model.add(Conv1D(filters=filter_num[2], kernel_size=kernel_size[2], 41 | strides=conv_stride_size[2], padding='same', 42 | name='block2_conv1')) 43 | model.add(BatchNormalization()) 44 | model.add(Activation('relu', name='block2_act1')) 45 | 46 | model.add(Conv1D(filters=filter_num[2], kernel_size=kernel_size[2], 47 | strides=conv_stride_size[2], padding='same', 48 | name='block2_conv2')) 49 | model.add(BatchNormalization()) 50 | model.add(Activation('relu', name='block2_act2')) 51 | model.add(MaxPooling1D(pool_size=pool_size[2], strides=pool_stride_size[3], 52 | padding='same', name='block2_pool')) 53 | model.add(Dropout(0.1, name='block2_dropout')) 54 | 55 | model.add(Conv1D(filters=filter_num[3], kernel_size=kernel_size[3], 56 | strides=conv_stride_size[3], padding='same', 57 | name='block3_conv1')) 58 | model.add(BatchNormalization()) 59 | model.add(Activation('relu', name='block3_act1')) 60 | model.add(Conv1D(filters=filter_num[3], kernel_size=kernel_size[3], 61 | strides=conv_stride_size[3], padding='same', 62 | name='block3_conv2')) 63 | model.add(BatchNormalization()) 64 | model.add(Activation('relu', name='block3_act2')) 65 | model.add(MaxPooling1D(pool_size=pool_size[3], strides=pool_stride_size[3], 66 | padding='same', name='block3_pool')) 67 | model.add(Dropout(0.1, name='block3_dropout')) 68 | 69 | model.add(Conv1D(filters=filter_num[4], kernel_size=kernel_size[4], 70 | strides=conv_stride_size[4], padding='same', 71 | name='block4_conv1')) 72 | model.add(BatchNormalization()) 73 | model.add(Activation('relu', name='block4_act1')) 74 | model.add(Conv1D(filters=filter_num[4], kernel_size=kernel_size[4], 75 | 
strides=conv_stride_size[4], padding='same', 76 | name='block4_conv2')) 77 | model.add(BatchNormalization()) 78 | model.add(Activation('relu', name='block4_act2')) 79 | model.add(MaxPooling1D(pool_size=pool_size[4], strides=pool_stride_size[4], 80 | padding='same', name='block4_pool')) 81 | model.add(Dropout(0.1, name='block4_dropout')) 82 | 83 | model.add(Flatten(name='flatten')) 84 | model.add(Dense(512, kernel_initializer=glorot_uniform(seed=0), name='fc1')) 85 | #model.add(BatchNormalization()) 86 | model.add(Activation('relu', name='fc1_act')) 87 | 88 | model.add(Dropout(0.7, name='fc1_dropout')) 89 | 90 | model.add(Dense(512, kernel_initializer=glorot_uniform(seed=0), name='fc2')) 91 | #model.add(BatchNormalization()) 92 | model.add(Activation('relu', name='fc2_act')) 93 | 94 | model.add(Dropout(0.5, name='fc2_dropout')) 95 | 96 | model.add(Dense(classes, kernel_initializer=glorot_uniform(seed=0), name='fc3')) 97 | model.add(Activation('softmax', name="softmax")) 98 | 99 | 100 | OPTIMIZER = Adamax(lr=learning_params_template['lr'], 101 | beta_1=learning_params_template['beta_1'], 102 | beta_2=learning_params_template['beta_2'], 103 | epsilon=learning_params_template['epsilon'], 104 | decay=learning_params_template['decay']) 105 | model.compile(loss="categorical_crossentropy", optimizer=OPTIMIZER,metrics=["accuracy"]) 106 | return model 107 | -------------------------------------------------------------------------------- /models/dl/df_only_D/df_model.py: -------------------------------------------------------------------------------- 1 | # DF model, 2 | # This code is to implement deep fingerprinting model for website fingerprinting attacks 3 | # ACM Reference Formant 4 | # Payap Sirinam, Mohsen Imani, Marc Juarez, and Matthew Wright. 2018. 5 | # Deep Fingerprinting: Undermining Website Fingerprinting Defenses with Deep Learning. 6 | # In 2018 ACM SIGSAC Conference on Computer and Communications Security (CCS ’18), 7 | # October 15–19, 2018, Toronto, ON, Canada. ACM, New York, NY, USA, 16 pages. 
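# Note: this df_only_D copy keeps the same architecture as models/dl/df/df_model.py; the difference is upstream, where df_only_D/df_main_model.py applies np.sign to the inputs so the network only sees +1/-1 direction sequences.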
8 | # https://doi.org/10.1145/3243734.3243768 9 | from keras.models import Sequential 10 | from keras.layers import Conv1D, MaxPooling1D, BatchNormalization 11 | from keras.layers.core import Activation, Flatten, Dense, Dropout 12 | from keras.layers.advanced_activations import ELU 13 | from keras.initializers import glorot_uniform 14 | 15 | from .df_model_config import learning_params_template,nb_classes_template 16 | from keras.optimizers import Adamax 17 | def build_model(input_shape=(learning_params_template['in_dim'],1), classes=nb_classes_template): 18 | model = Sequential() 19 | #Block1 20 | filter_num = ['None',32,64,128,256] 21 | kernel_size = ['None',8,8,8,8] 22 | conv_stride_size = ['None',1,1,1,1] 23 | pool_stride_size = ['None',4,4,4,4] 24 | pool_size = ['None',8,8,8,8] 25 | 26 | model.add(Conv1D(filters=filter_num[1], kernel_size=kernel_size[1], input_shape=input_shape, 27 | strides=conv_stride_size[1], padding='same', 28 | name='block1_conv1')) 29 | model.add(BatchNormalization(axis=-1)) 30 | model.add(ELU(alpha=1.0, name='block1_adv_act1')) 31 | model.add(Conv1D(filters=filter_num[1], kernel_size=kernel_size[1], 32 | strides=conv_stride_size[1], padding='same', 33 | name='block1_conv2')) 34 | model.add(BatchNormalization(axis=-1)) 35 | model.add(ELU(alpha=1.0, name='block1_adv_act2')) 36 | model.add(MaxPooling1D(pool_size=pool_size[1], strides=pool_stride_size[1], 37 | padding='same', name='block1_pool')) 38 | model.add(Dropout(0.1, name='block1_dropout')) 39 | 40 | model.add(Conv1D(filters=filter_num[2], kernel_size=kernel_size[2], 41 | strides=conv_stride_size[2], padding='same', 42 | name='block2_conv1')) 43 | model.add(BatchNormalization()) 44 | model.add(Activation('relu', name='block2_act1')) 45 | 46 | model.add(Conv1D(filters=filter_num[2], kernel_size=kernel_size[2], 47 | strides=conv_stride_size[2], padding='same', 48 | name='block2_conv2')) 49 | model.add(BatchNormalization()) 50 | model.add(Activation('relu', name='block2_act2')) 51 | model.add(MaxPooling1D(pool_size=pool_size[2], strides=pool_stride_size[3], 52 | padding='same', name='block2_pool')) 53 | model.add(Dropout(0.1, name='block2_dropout')) 54 | 55 | model.add(Conv1D(filters=filter_num[3], kernel_size=kernel_size[3], 56 | strides=conv_stride_size[3], padding='same', 57 | name='block3_conv1')) 58 | model.add(BatchNormalization()) 59 | model.add(Activation('relu', name='block3_act1')) 60 | model.add(Conv1D(filters=filter_num[3], kernel_size=kernel_size[3], 61 | strides=conv_stride_size[3], padding='same', 62 | name='block3_conv2')) 63 | model.add(BatchNormalization()) 64 | model.add(Activation('relu', name='block3_act2')) 65 | model.add(MaxPooling1D(pool_size=pool_size[3], strides=pool_stride_size[3], 66 | padding='same', name='block3_pool')) 67 | model.add(Dropout(0.1, name='block3_dropout')) 68 | 69 | model.add(Conv1D(filters=filter_num[4], kernel_size=kernel_size[4], 70 | strides=conv_stride_size[4], padding='same', 71 | name='block4_conv1')) 72 | model.add(BatchNormalization()) 73 | model.add(Activation('relu', name='block4_act1')) 74 | model.add(Conv1D(filters=filter_num[4], kernel_size=kernel_size[4], 75 | strides=conv_stride_size[4], padding='same', 76 | name='block4_conv2')) 77 | model.add(BatchNormalization()) 78 | model.add(Activation('relu', name='block4_act2')) 79 | model.add(MaxPooling1D(pool_size=pool_size[4], strides=pool_stride_size[4], 80 | padding='same', name='block4_pool')) 81 | model.add(Dropout(0.1, name='block4_dropout')) 82 | 83 | model.add(Flatten(name='flatten')) 84 | 
model.add(Dense(512, kernel_initializer=glorot_uniform(seed=0), name='fc1')) 85 | #model.add(BatchNormalization()) 86 | model.add(Activation('relu', name='fc1_act')) 87 | 88 | model.add(Dropout(0.7, name='fc1_dropout')) 89 | 90 | model.add(Dense(512, kernel_initializer=glorot_uniform(seed=0), name='fc2')) 91 | #model.add(BatchNormalization()) 92 | model.add(Activation('relu', name='fc2_act')) 93 | 94 | model.add(Dropout(0.5, name='fc2_dropout')) 95 | 96 | model.add(Dense(classes, kernel_initializer=glorot_uniform(seed=0), name='fc3')) 97 | model.add(Activation('softmax', name="softmax")) 98 | 99 | 100 | OPTIMIZER = Adamax(lr=learning_params_template['lr'], 101 | beta_1=learning_params_template['beta_1'], 102 | beta_2=learning_params_template['beta_2'], 103 | epsilon=learning_params_template['epsilon'], 104 | decay=learning_params_template['decay']) 105 | model.compile(loss="categorical_crossentropy", optimizer=OPTIMIZER,metrics=["accuracy"]) 106 | return model 107 | -------------------------------------------------------------------------------- /models/dl/sdae/sdae_model.py: -------------------------------------------------------------------------------- 1 | import warnings 2 | from keras.layers import Dense, Dropout 3 | from keras.layers import Input 4 | from keras.models import Model 5 | import keras.utils.np_utils as npu 6 | import numpy as np 7 | from keras.optimizers import SGD, Adam, RMSprop 8 | 9 | from .sdae_model_config import learning_params_template, nb_classes_template 10 | 11 | global encoded_layers 12 | def make_layer(layer, x_train, x_test, steps=0, gen=False): 13 | in_dim = layer['in_dim'] 14 | out_dim = layer['out_dim'] 15 | epochs = layer['epochs'] 16 | batch_size = layer['batch_size'] 17 | optimizer = layer['optimizer'] 18 | enc_act = layer['enc_activation'] 19 | dec_act = layer['dec_activation'] 20 | 21 | if optimizer == "sgd": 22 | optimizer = SGD(lr=layer['lr'], 23 | decay=layer['decay'], 24 | momentum=layer['momentum']) 25 | elif optimizer == "adam": 26 | optimizer = Adam(lr=layer['lr'], 27 | decay=layer['decay']) 28 | elif optimizer == "rmsprop": 29 | optimizer = RMSprop(lr=layer['lr'], 30 | decay=layer['decay']) 31 | 32 | 33 | # this is our input placeholder 34 | input_data = Input(shape=(in_dim,)) 35 | # "encoded" is the encoded representation of the input_data 36 | encoded = Dense(out_dim, activation=enc_act)(input_data) 37 | # "decoded" is the lossy reconstruction of the input_data 38 | decoded = Dense(in_dim, activation=dec_act)(encoded) 39 | 40 | # this model maps an input_data to its reconstruction 41 | autoencoder = Model(input_data, decoded) 42 | 43 | # this model maps an input_data to its encoded representation 44 | encoder = Model(input_data, encoded) 45 | 46 | autoencoder.compile(optimizer=optimizer, loss='mean_squared_error') 47 | 48 | # train layer 1 49 | if gen: 50 | (train_steps, test_steps) = steps 51 | autoencoder.fit_generator(x_train, steps_per_epoch=train_steps, epochs=epochs) 52 | else: 53 | autoencoder.fit(x_train, x_train, epochs=epochs, batch_size=batch_size) 54 | 55 | # encode and decode some digits 56 | # note that we take them from the *test* set 57 | 58 | if gen: 59 | (train_steps, test_steps) = steps 60 | new_x_train1 = encoder.predict_generator(x_train, steps=train_steps) 61 | new_x_test1 = encoder.predict_generator(x_test, steps=test_steps) 62 | else: 63 | new_x_train1 = encoder.predict(x_train) 64 | new_x_test1 = encoder.predict(x_test) 65 | 66 | weights = encoder.layers[1].get_weights() 67 | 68 | return new_x_train1, new_x_test1, 
weights 69 | 70 | def build_model(learn_params=learning_params_template, nb_classes=nb_classes_template): 71 | ##注意输入的数据是迭代器 72 | #(x_train, y_train), (x_test, y_test) = train, test 73 | layers = learn_params["layers"] 74 | 75 | # Building SAE 76 | input_data = Input(shape=(layers[0]['in_dim'],)) 77 | prev_layer = input_data 78 | 79 | i = 0 80 | global encoded_layers 81 | encoded_layers = [] 82 | for l in layers: 83 | encoded = Dense(l['out_dim'], activation=l['enc_activation'])(prev_layer) #多个自编码层之间用了一个全连接层 84 | i += 1 85 | encoded_layers.append(i) 86 | dropout = l["dropout"] 87 | if dropout > 0.0: 88 | drop = Dropout(dropout)(encoded) 89 | i += 1 90 | prev_layer = drop 91 | else: 92 | prev_layer = encoded 93 | 94 | softmax = Dense(nb_classes, activation='softmax')(prev_layer) #最后一层是个全连接层 95 | sae = Model(input_data, softmax) 96 | ''' 97 | if pre_train: 98 | #这里是在预训练自编码器的encoder-decoder,于是应该提供X的数据 99 | # Pre-training AEs 100 | prev_x_train = None 101 | prev_x_test = None 102 | for i, l in enumerate(layers): 103 | if i == 0: 104 | prev_x_train, prev_x_test, weights = make_layer(l, train_gen, test_gen, steps=steps, gen=True) 105 | else: 106 | prev_x_train, prev_x_test, weights = make_layer(l, prev_x_train, prev_x_test) 107 | sae.layers[encoded_layers[i]].set_weights(weights) 108 | #print(sae.get_weights()) 109 | ''' 110 | if learn_params['optimizer'] == "sgd": 111 | optimizer = SGD(lr=learn_params['lr'], 112 | decay=learn_params['decay'], 113 | momentum=0.9, 114 | nesterov=True) 115 | elif learn_params['optimizer'] == "adam": 116 | optimizer = Adam(lr=learn_params['lr'], 117 | decay=learn_params['decay']) 118 | else: # elif learn_params['optimizer'] == "rmsprop": 119 | optimizer = RMSprop(lr=learn_params['lr'], 120 | decay=learn_params['decay']) 121 | metrics=['accuracy'] 122 | sae.compile(loss="categorical_crossentropy", optimizer=optimizer, metrics=metrics) 123 | return sae 124 | 125 | def pre_train(model,x_train, x_test, learn_params=learning_params_template): 126 | #这里是在预训练自编码器的encoder-decoder,于是应该提供X的数据 127 | # Pre-training AEs 128 | global encoded_layers 129 | prev_x_train = None 130 | prev_x_test = None 131 | layers = learn_params['layers'] 132 | for i, l in enumerate(layers): 133 | if i == 0: 134 | prev_x_train, prev_x_test, weights = make_layer(l, x_train, x_test,gen=False) 135 | else: 136 | prev_x_train, prev_x_test, weights = make_layer(l, prev_x_train, prev_x_test) 137 | model.layers[encoded_layers[i]].set_weights(weights) 138 | #print(sae.get_weights()) 139 | 140 | return model -------------------------------------------------------------------------------- /models/dl/beauty/beauty_main_model.py: -------------------------------------------------------------------------------- 1 | __author__ = 'dk' 2 | from models.dl.attacks import Beauty_model as CNN_model, parser_raw_data 3 | from models.dl.beauty import cnn_model_config 4 | from models.model_base import abs_model 5 | import os 6 | from config import raw_dataset_base 7 | from keras.utils import np_utils 8 | import numpy as np 9 | class model(abs_model): 10 | def __init__(self, dataset, randseed, splitrate): 11 | super(model,self).__init__('beauty',randseed= randseed) 12 | if os.path.exists(self.database) == False: 13 | os.makedirs(self.database,exist_ok=True) 14 | 15 | self.dataset = dataset 16 | self.model = self.database + '/'+ self.name + '_' + dataset + '_model' 17 | self.data = self.database + '/'+ self.name + '_' + dataset + '/' 18 | self.splitrate = splitrate 19 | #原始数据集目录 20 | full_rdata = raw_dataset_base + 
self.dataset 21 | self.full_rdata = full_rdata 22 | 23 | if self.data_exists() == False: 24 | self.parser_raw_data() 25 | 26 | self.cnn_model = None 27 | def parser_raw_data(self): 28 | full_rdata = self.full_rdata 29 | if os.path.exists(full_rdata) == False: 30 | raise OSError('Dataset {0} (full path: {1}) does not exist!'.format(self.dataset,full_rdata)) 31 | os.makedirs(self.data, exist_ok=True) 32 | ##从原始数据集构建DF所需的数据集 33 | X_train,y_train, X_valid, y_valid, X_test, y_test = parser_raw_data(self, self.full_rdata, max_len = cnn_model_config.learning_params_template['input_length'],burstification=True) 34 | 35 | self.save_data(X_train,y_train, X_valid, y_valid, X_test, y_test) 36 | 37 | 38 | def train(self): 39 | X_train,y_train, X_valid, y_valid, X_test, y_test = self.load_data() 40 | num_class = self.num_classes() 41 | cnn_model_config.nb_classes_template = num_class 42 | print(num_class) 43 | y_train = np_utils.to_categorical(y_train, num_classes=num_class) 44 | y_valid = np_utils.to_categorical(y_valid, num_classes=num_class) 45 | y_test = np_utils.to_categorical(y_test, num_classes= self.num_classes()) 46 | 47 | X_train = X_train[:, :,np.newaxis] 48 | X_valid = X_valid[:, :,np.newaxis] 49 | X_test = X_test[:, :,np.newaxis] 50 | 51 | cnn_model = CNN_model(num_class = num_class) 52 | cnn_model.build_model() 53 | #cnn_model.model.summary() 54 | cnn_model.fit(X_train=X_train,y_train=y_train, 55 | X_valid= X_valid, y_valid = y_valid, 56 | batch_size= cnn_model_config.learning_params_template['batch_size'], 57 | epochs=cnn_model_config.learning_params_template['epoch']) 58 | 59 | cnn_model.save_model(path=self.model) 60 | score = cnn_model.evaluate(X_test=X_test, y_test = y_test) 61 | print('[Beauty Test on {0} accuracy {1}'.format(self.dataset, score)) 62 | def test(self): 63 | X_train,y_train, X_valid, y_valid, X_test, y_test = self.load_data() 64 | y_test = np_utils.to_categorical(y_test, num_classes= self.num_classes()) 65 | X_test = X_test[:, :,np.newaxis] 66 | 67 | cnn_model = CNN_model(num_class= self.num_classes()) 68 | cnn_model.load_model(self.model) 69 | score = cnn_model.evaluate(X_test=X_test,y_test=y_test) 70 | print('Beauty Test on {0} accuracy :{1}'.format(self.dataset,score)) 71 | 72 | def predict(self,pkt_size): 73 | def pad_sequence(x, max_len, pad_value=0): 74 | r = x + [pad_value] * (max_len - len(x)) 75 | return r[:max_len] 76 | 77 | if self.cnn_model == None: 78 | self.cnn_model = CNN_model(num_class= self.num_classes()) 79 | self.cnn_model.load_model(self.model) 80 | 81 | x = [pad_sequence(_pkt_size, max_len= cnn_model_config.learning_params_template['in_dim']) for _pkt_size in pkt_size] 82 | x = np.array(x)[:, :,np.newaxis] 83 | y_logit = self.cnn_model.predict(x, actual_lable=True) 84 | return y_logit.tolist() 85 | def get_feature(self): 86 | X_train,y_train, X_valid, y_valid, X_test, y_test = self.load_data() 87 | #y_test = np_utils.to_categorical(y_test, num_classes= self.num_classes()) 88 | X_test = X_test[:5000] 89 | X_test = X_test[:, :,np.newaxis] 90 | 91 | cnn_model = CNN_model(num_class= self.num_classes()) 92 | cnn_model.load_model(self.model) 93 | logit, feature = cnn_model.predict(X_test=X_test,actual_lable=False, return_feature=True) 94 | print(feature.shape, logit.shape) 95 | logit = logit.tolist() 96 | feature = feature.tolist() 97 | #feature = logit 98 | y_true = y_test[:5000].tolist() 99 | feature_set = {} 100 | feature_vector = [] 101 | for i in range(len(y_true)): 102 | if y_true[i] not in feature_set: 103 | feature_set[y_true[i]] = [] 104 | 
feature_set[y_true[i]].append([feature[i]]) 105 | import pickle 106 | with open('feature_set_D1_53_DF.pkl','wb') as fp: 107 | pickle.dump(feature_set, fp) 108 | print(y_true[-1],logit[-1]) 109 | print(feature[-1]) 110 | if __name__ == '__main__': 111 | for test_rate in [0.1]: 112 | print(test_rate) 113 | dataset='app60' 114 | cnn_model = model(dataset, randseed= 128, splitrate=test_rate) 115 | #cnn_model.parser_raw_data() 116 | #cnn_model.train() 117 | cnn_model.test() 118 | print(dataset) 119 | print(test_rate) 120 | #import os 121 | break 122 | #os.remove(df_model.model) 123 | #df_model.get_feature() 124 | -------------------------------------------------------------------------------- /models/dl/df_only_D/df_main_model.py: -------------------------------------------------------------------------------- 1 | __author__ = 'dk' 2 | from models.dl.attacks import DF_model, parser_raw_data 3 | from models.dl.df_only_D import df_model_config 4 | from models.model_base import abs_model 5 | import os 6 | from config import raw_dataset_base 7 | from keras.utils import np_utils 8 | import numpy as np 9 | class model(abs_model): 10 | def __init__(self, dataset, randseed, splitrate): 11 | super(model,self).__init__('df',randseed= randseed) 12 | if os.path.exists(self.database) == False: 13 | os.makedirs(self.database,exist_ok=True) 14 | 15 | self.dataset = dataset 16 | self.model = self.database + '/'+ self.name + '_' + dataset + '_model' 17 | self.data = self.database + '/'+ self.name + '_' + dataset + '/' 18 | self.splitrate = splitrate 19 | #原始数据集目录 20 | full_rdata = raw_dataset_base + self.dataset 21 | self.full_rdata = full_rdata 22 | 23 | if self.data_exists() == False: 24 | self.parser_raw_data() 25 | 26 | self.df_model = None 27 | def parser_raw_data(self): 28 | full_rdata = self.full_rdata 29 | if os.path.exists(full_rdata) == False: 30 | raise OSError('Dataset {0} (full path: {1}) does not exist!'.format(self.dataset,full_rdata)) 31 | os.makedirs(self.data, exist_ok=True) 32 | ##从原始数据集构建DF所需的数据集 33 | X_train,y_train, X_valid, y_valid, X_test, y_test = parser_raw_data(self, self.full_rdata, max_len = df_model_config.learning_params_template['in_dim']) 34 | 35 | ##只使用包的方向 36 | X_train = np.sign(X_train) 37 | X_valid = np.sign(X_valid) 38 | X_test = np.sign(X_test) 39 | 40 | self.save_data(X_train,y_train, X_valid, y_valid, X_test, y_test) 41 | 42 | 43 | def train(self): 44 | X_train,y_train, X_valid, y_valid, X_test, y_test = self.load_data() 45 | num_class = self.num_classes() 46 | df_model_config.nb_classes_template = num_class 47 | 48 | y_train = np_utils.to_categorical(y_train, num_classes=num_class) 49 | y_valid = np_utils.to_categorical(y_valid, num_classes=num_class) 50 | y_test = np_utils.to_categorical(y_test, num_classes= self.num_classes()) 51 | 52 | X_train = X_train[:, :,np.newaxis] 53 | X_valid = X_valid[:, :,np.newaxis] 54 | X_test = X_test[:, :,np.newaxis] 55 | 56 | df_model = DF_model(num_class = num_class) 57 | df_model.build_model() 58 | 59 | df_model.fit(X_train=X_train,y_train=y_train, 60 | X_valid= X_valid, y_valid = y_valid, 61 | batch_size= df_model_config.learning_params_template['batch_size'], 62 | epochs=df_model_config.learning_params_template['epoch']) 63 | 64 | df_model.save_model(path=self.model) 65 | score = df_model.evaluate(X_test=X_test, y_test = y_test) 66 | print('[Deep Fingerprinting (only direction) Test on {0} accuracy {1}'.format(self.dataset, score)) 67 | def test(self): 68 | X_train,y_train, X_valid, y_valid, X_test, y_test = self.load_data() 69 | 
y_test = np_utils.to_categorical(y_test, num_classes= self.num_classes()) 70 | X_test = X_test[:, :,np.newaxis] 71 | 72 | df_model = DF_model(num_class= self.num_classes()) 73 | df_model.load_model(self.model) 74 | score = df_model.evaluate(X_test=X_test,y_test=y_test) 75 | print('Deep Fingerprinting (only direction) Test on {0} accuracy :{1}'.format(self.dataset,score)) 76 | 77 | def predict(self,pkt_size): 78 | def pad_sequence(x, max_len, pad_value=0): 79 | r = x + [pad_value] * (max_len - len(x)) 80 | return r[:max_len] 81 | 82 | if self.df_model == None: 83 | self.df_model = DF_model(num_class= self.num_classes()) 84 | self.df_model.load_model(self.model) 85 | 86 | x = [pad_sequence(_pkt_size, max_len= df_model_config.learning_params_template['in_dim']) for _pkt_size in pkt_size] 87 | x = np.array(x)[:, :,np.newaxis] 88 | y_logit = self.df_model.predict(x, actual_lable=True) 89 | return y_logit.tolist() 90 | def get_feature(self): 91 | X_train,y_train, X_valid, y_valid, X_test, y_test = self.load_data() 92 | #y_test = np_utils.to_categorical(y_test, num_classes= self.num_classes()) 93 | X_test = X_test[:5000] 94 | X_test = X_test[:, :,np.newaxis] 95 | 96 | df_model = DF_model(num_class= self.num_classes()) 97 | df_model.load_model(self.model) 98 | logit, feature = df_model.predict(X_test=X_test,actual_lable=False, return_feature=True) 99 | print(feature.shape, logit.shape) 100 | logit = logit.tolist() 101 | feature = feature.tolist() 102 | #feature = logit 103 | y_true = y_test[:5000].tolist() 104 | feature_set = {} 105 | feature_vector = [] 106 | for i in range(len(y_true)): 107 | if y_true[i] not in feature_set: 108 | feature_set[y_true[i]] = [] 109 | feature_set[y_true[i]].append([feature[i]]) 110 | import pickle 111 | with open('feature_set_D1_53_DF.pkl','wb') as fp: 112 | pickle.dump(feature_set, fp) 113 | print(y_true[-1],logit[-1]) 114 | print(feature[-1]) 115 | if __name__ == '__main__': 116 | for test_rate in [0.1]: 117 | print(test_rate) 118 | dataset='app60' 119 | df_model = model(dataset, randseed= 128, splitrate=test_rate) 120 | df_model.parser_raw_data() 121 | df_model.train() 122 | #df_model.test() 123 | print(dataset) 124 | print(test_rate) 125 | #import os 126 | #os.remove(df_model.model) 127 | #df_model.get_feature() 128 | -------------------------------------------------------------------------------- /models/dl/graphDapp/DApp_Classifier.py: -------------------------------------------------------------------------------- 1 | __author__ = 'dk' 2 | import numpy as np 3 | import torch as th 4 | import dgl 5 | import torch.nn as nn 6 | from dgl.nn.pytorch import GINConv 7 | from models.dl.graphDapp.data_builder import Dataset_fgnet 8 | class DApp_MLP(nn.Module): 9 | def __init__(self,in_feats,out_feats=64, layer_nums = 3): 10 | super(DApp_MLP,self).__init__() 11 | self.linear_layers =nn.ModuleList() 12 | for each in range(layer_nums): 13 | if each == 0 : 14 | in_features= in_feats 15 | else: 16 | in_features = out_feats 17 | self.linear_layers.append(nn.Linear(in_features= in_features,out_features=out_feats)) 18 | self.activate = nn.ReLU() 19 | self.batchnorm = nn.BatchNorm1d(out_feats) 20 | self.dropout = nn.Dropout(p=0.0) 21 | 22 | def forward(self, x): 23 | x1 = x 24 | for mod in self.linear_layers : 25 | x1 = mod(x1) 26 | x1 = self.activate(x1) 27 | 28 | x2 = self.batchnorm(x1) 29 | x3 = self.dropout(x2) 30 | return x3 31 | 32 | class DApp_classifier(nn.Module): 33 | def __init__(self, nb_classes=53, gin_layer_num=3, gin_hidden_units=64, iteration_nums = 3, 
graph_pooling_type='sum', 34 | neighbor_pooling_type='sum',use_gpu=False, device='cpu', iteration_first=True, embedding= True): 35 | #DApp: 3 GIN layers cascaded sequentially 36 | super(DApp_classifier,self).__init__() 37 | 38 | self.nb_classes = nb_classes 39 | self.gin_layer_num = gin_layer_num 40 | self.gin_hidden_uints = gin_hidden_units 41 | self.iteration_nums = iteration_nums 42 | 43 | self.graph_pooling_type = graph_pooling_type 44 | self.neighbor_pooling_type= neighbor_pooling_type 45 | 46 | self.use_gpu = use_gpu 47 | self.device = device 48 | 49 | self.gin_layers = [] 50 | self.interation_first = iteration_first 51 | self.embedding = embedding 52 | self.embedding_dim = gin_hidden_units #the embedding dimension is set to the number of GIN hidden units 53 | 54 | if embedding : 55 | self.embedding_layer = th.nn.Embedding(num_embeddings= 3100, embedding_dim= self.embedding_dim) 56 | #add the GIN layers 57 | if iteration_first == False: 58 | for each in range(gin_layer_num): 59 | if each == 0: 60 | in_feats = self.embedding_dim if self.embedding == True else 1 61 | else: 62 | in_feats = gin_hidden_units 63 | mlp = DApp_MLP(in_feats, out_feats= gin_hidden_units, layer_nums= self.gin_layer_num) 64 | print(mlp) 65 | if use_gpu : 66 | mlp = mlp.to(th.device(device)) 67 | gin_layer =GINConv( 68 | apply_func= mlp, 69 | aggregator_type= self.neighbor_pooling_type, 70 | learn_eps=True 71 | ) 72 | if use_gpu: 73 | gin_layer = gin_layer.to(th.device(device)) 74 | self.gin_layers.append(gin_layer) 75 | else: 76 | if embedding == False: 77 | mlp = DApp_MLP(1,out_feats=gin_hidden_units,layer_nums= self.gin_layer_num) 78 | else: 79 | mlp = DApp_MLP(self.embedding_dim, gin_hidden_units, layer_nums= self.gin_layer_num) 80 | if use_gpu: 81 | mlp = mlp.to(th.device(device)) 82 | print(mlp) 83 | gin_layer = GINConv( 84 | apply_func=mlp, 85 | aggregator_type= self.neighbor_pooling_type, 86 | learn_eps=True 87 | ) 88 | if use_gpu: 89 | gin_layer = gin_layer.to(th.device(device)) 90 | self.gin_layers.append(gin_layer) 91 | #final fully-connected classification layer 92 | self.linear = nn.Linear(in_features=iteration_nums * gin_hidden_units,out_features=nb_classes) 93 | 94 | 95 | def forward(self, g): 96 | 97 | node_feature = g.ndata['pkt_length'] 98 | 99 | if self.embedding == True: 100 | node_feature = self.embedding_layer(th.reshape(node_feature.long(),(-1,)) + Dataset_fgnet.MTU) 101 | 102 | graph_feature_history = [] 103 | ##gin 104 | if self.interation_first == False: 105 | for layer in self.gin_layers: 106 | node_feature = layer(g, node_feature.to(th.device(self.device))) 107 | g.ndata['iterated_feature'] = node_feature 108 | if self.graph_pooling_type == 'sum': 109 | graph_feature = dgl.sum_nodes(g,'iterated_feature') 110 | elif self.graph_pooling_type == 'mean': 111 | graph_feature = dgl.mean_nodes(g,'iterated_feature') 112 | 113 | graph_feature_history.append(graph_feature) 114 | else: 115 | layer = self.gin_layers[-1] 116 | # only one MLP, shared across all iterations 117 | for i in range(self.iteration_nums): 118 | node_feature = layer(g, node_feature.to(th.device(self.device))) 119 | g.ndata['iterated_feature'] = node_feature 120 | if self.graph_pooling_type == 'sum': 121 | graph_feature = dgl.sum_nodes(g,'iterated_feature') 122 | elif self.graph_pooling_type == 'mean': 123 | graph_feature = dgl.mean_nodes(g,'iterated_feature') 124 | 125 | graph_feature_history.append(graph_feature) 126 | 127 | ##concatenate the pooled graph features from every iteration 128 | 129 | graph_features = th.cat(graph_feature_history,-1) 130 | 131 | #fully-connected classification 132 | power = self.linear(graph_features) 133 | return power 134 | 135 | 136 | 137 | 138 | 139 | 140 | 141 |
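A minimal usage sketch for DApp_classifier (not part of the repository): it builds a toy DGL graph whose nodes carry signed packet lengths under the 'pkt_length' key, exactly as forward() expects, and runs a single forward pass. It assumes a DGL release that provides dgl.graph/dgl.batch, that models.dl.graphDapp.data_builder.Dataset_fgnet is importable, and that Dataset_fgnet.MTU shifts negative lengths into the range of the 3100-entry embedding table; the node values and nb_classes below are illustrative only.

import dgl
import torch as th
from models.dl.graphDapp.DApp_Classifier import DApp_classifier

# a 4-node chain graph; each node holds one signed packet length
g = dgl.graph(([0, 1, 2], [1, 2, 3]), num_nodes=4)
g.ndata['pkt_length'] = th.tensor([[120.0], [-1448.0], [64.0], [-536.0]])

clf = DApp_classifier(nb_classes=5, use_gpu=False, device='cpu')
# the GIN blocks live in a plain Python list, so .eval()/.to() would not reach them;
# a graph with more than one node keeps their BatchNorm1d layers usable in train mode
with th.no_grad():
    logits = clf(dgl.batch([g]))   # one graph in the batch -> shape (1, nb_classes)
print(logits.shape)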
-------------------------------------------------------------------------------- /models/ml/rdp/util.py: -------------------------------------------------------------------------------- 1 | __author__ = 'jmh081701' 2 | import sys 3 | import os 4 | import json 5 | import numpy as np 6 | from models.ml.rdp import statistic_tractor 7 | def read_txt(filename): 8 | ''' 9 | :param filename: text file that records packet length and timestamp. 10 | :return: 11 | client to server : upload traffic, length > 0 12 | server to client : download traffic , length <0 13 | ''' 14 | with open(filename,"r") as fp: 15 | while True: 16 | line=fp.readline() 17 | if line: 18 | rst=[] 19 | line = line.split(",") 20 | line = line[0:3]+line[-2:-1] 21 | rst.append(float(line[0])) 22 | rst.append(int(line[3],10)) 23 | try: 24 | if line[1].count('107c') > 0: 25 | #client to server > 0 26 | rst[1]*=1 27 | else: 28 | #server to client < 0 29 | rst[1]*= -1 30 | except: 31 | if str(line[1]).count("124.16.") > 0: 32 | #client to server >0 33 | rst[1] *=1 34 | else: 35 | #server to client < 0 36 | rst[1] *=-1 37 | yield rst 38 | else: 39 | break 40 | def gather_peak_up_down(filename,gap=0.1): 41 | file_reader = read_txt(filename) 42 | peaks=[[[0,0]]] 43 | index = 0 44 | for each in file_reader: 45 | if abs(peaks[index][-1][0]-each[0]) < gap ... > 0 and abs(peaks[index][-1][0]-each[0]) < ... -------------------------------------------------------------------------------- /models/dl/awf_dataset_util/data.py: -------------------------------------------------------------------------------- ... >= minlen: 84 | new_data.append(x) 85 | new_labels.append(y) 86 | num_traces[y] = count + 1 87 | data = np.array(new_data) 88 | labels = np.array(new_labels) 89 | del new_data, new_labels 90 | if not data.size: 91 | raise ValueError('After filtering, no sequence left.') 92 | del num_traces 93 | 94 | # Pad if traces are of various length or if their uniform length is not equal to maxlen 95 | if maxlen: 96 | if len(data.shape) == 1 or data.shape[1] != maxlen: 97 | print("Pad/trunc with maxlen={}".format(maxlen)) 98 | #data = data[:, :maxlen] 99 | # Old way: 100 | data = sq.pad_sequences(data, maxlen=maxlen, padding='post', truncating='post', dtype="float64") 101 | 102 | if dnn_type == "lstm" or dnn_type == "cnn": 103 | data = data.reshape(data.shape[0], data.shape[1], 1) 104 | if type == "sdae" and len(data.shape) > 2: 105 | print("WEIRD!
data.shape={}".format(data.shape)) 106 | data = data.reshape(data.shape[0], data.shape[1]) 107 | 108 | if not openw: 109 | print("Categorize") 110 | labels = categorize(labels) 111 | 112 | print("Data {}, labels {}".format(data.shape, labels.shape)) 113 | 114 | return data, labels 115 | 116 | 117 | def split_dataset(x, y, val_split=0.05, test_split=0.05): 118 | ''' Split the dataset according to the given probabilities; the data is shuffled at the same time. 119 | :param x: 120 | :param y: 121 | :param val_split: 122 | :param test_split: 123 | :return: 124 | ''' 125 | x_train =[] 126 | x_val =[] 127 | x_test =[] 128 | 129 | y_train =[] 130 | y_val =[] 131 | y_test=[] 132 | 133 | num = x.shape[0] 134 | for i in range(num): 135 | rnd = random.random() 136 | if rnd < ... >=percentage: 34 | return i*bin + bin/2.0 35 | 36 | 37 | def length_distribution(packet_length,bin=30): 38 | distribution = np.zeros(shape =(1500//bin,)) 39 | for each in packet_length: 40 | each = int(abs(each)) 41 | if each >= 1500 : 42 | each = 1499 43 | distribution[each//bin] +=1 44 | distribution =distribution / len(packet_length) 45 | #print(distribution) 46 | return distribution 47 | def poison_generate_function(data): 48 | rst = [] 49 | for x in np.arange(1.1,10,2): 50 | sum =0 51 | for n in range(0,len(data)): 52 | sum =sum + data[n]*np.math.sin(2*x*np.math.pi /31 *n) 53 | rst.append(round(sum,3)) 54 | return rst 55 | def _poison_generate_function(data): 56 | rst =[] 57 | sum =0 58 | for n in range(0,len(data)): 59 | sum =sum + data[n]* np.math.pow(np.math.e,-x) * np.math.pow(x,1+n)/np.math.factorial(1+n) 60 | rst.append(round(sum,3)) 61 | return rst 62 | def L_generate_function(data): 63 | rst = [] 64 | for x in np.arange(1.1,10,2): 65 | sum =0 66 | for n in range(0,len(data)): 67 | sum =sum + data[n]*np.math.cos(2*x*np.math.pi /91 *n) 68 | rst.append(round(sum,3)) 69 | return rst 70 | def _L_generate_function(data): 71 | rst = [] 72 | for x in np.arange(1.5,3.5,0.5): 73 | sum =0 74 | for n in range(0,len(data)): 75 | fac=np.math.pow(x,n+1) 76 | sum =sum + data[n]*fac/(1-fac) 77 | rst.append(round(sum,3)) 78 | return rst 79 | def generate_function(data): 80 | rst=[] 81 | rst+=poison_generate_function(data) 82 | rst+=L_generate_function(data) 83 | return rst 84 | def peak_pkt_length_feature(_peak): 85 | if len(_peak) == 0: 86 | return [0] * 10 + [0] *17 87 | peak = np.mat(_peak) 88 | packet_length_data = list(map(lambda x : x[0],peak[:,1].tolist())) #take only the packet-length column 89 | gen_features = generate_function(packet_length_data) 90 | 91 | mom_features = [0] * 17 92 | #0-4 order central moments 93 | mom_features[0] = moment(1,packet_length_data,c=1) 94 | mom_features[1] = moment(2,packet_length_data,c=1) 95 | mom_features[2] = moment(3,packet_length_data,c=1) 96 | mom_features[3] = moment(4,packet_length_data,c=1) 97 | mom_features[4] = moment(5,packet_length_data,c=1) 98 | #1-3 order raw (origin) moments 99 | for each in packet_length_data: 100 | mom_features[5] +=abs(each) 101 | mom_features[6] +=abs(each)**2 102 | mom_features[7] +=abs(each)**3 103 | mom_features[6] =(mom_features[6]/len(packet_length_data)) **(1/2) 104 | mom_features[7] =(mom_features[7]/len(packet_length_data)) **(1/3) 105 | #10%-90% percentiles 106 | 107 | for i in range(8,17): 108 | mom_features[i] = length_percentile(packet_length_data,percentage=(i-8+1)*0.1) 109 | return gen_features + mom_features 110 | def peak_relative_arrive_time_feature(_peak): 111 | if len(_peak) == 0: 112 | return [0] * 5 113 | peak = np.mat(_peak) 114 | arrive_time_data = list(map(lambda x : x[0],peak[:,0].tolist()) ) 115 | #gen_features = generate_function(arrive_time_data) 116 | mom_features = [0] * 5
117 | for i in range(0,5): 118 | for each in arrive_time_data: 119 | mom_features[i] +=each** i 120 | if i != 0: 121 | mom_features[i]= round((mom_features[i] **(1/i) ).real,3) 122 | return mom_features 123 | def peak_feature(peak): 124 | up_peak=[] 125 | down_peak=[] 126 | total_peak=[peak[0]] 127 | if total_peak[0][1] >0 : 128 | up_peak.append((total_peak[0][0],total_peak[0][1])) 129 | else: 130 | down_peak.append((total_peak[0][0],-total_peak[0][1])) 131 | for i in range(1,len(peak)): 132 | total_peak.append((peak[i][0]-peak[0][0],peak[i][1])) 133 | #upstream/downstream packets  #NOTE: still need to verify whether total_peak itself should keep the sign 134 | if peak[i][1]>0: 135 | #upload (client-to-server) packets 136 | up_peak.append((total_peak[i][0],total_peak[i][1])) 137 | else: 138 | #download (server-to-client) packets 139 | down_peak.append((total_peak[i][0],-total_peak[i][1])) 140 | features=[] 141 | #pkt length 142 | features += peak_pkt_length_feature(total_peak) 143 | #print('total peak pkt length feature:',len(features)) 144 | features += peak_pkt_length_feature(up_peak) 145 | #print('up peak pkt length feature:',len(features)) 146 | features += peak_pkt_length_feature(down_peak) 147 | #print('down peak pkt length feature:',len(features)) 148 | #relative arrive time 149 | features += peak_relative_arrive_time_feature(total_peak) 150 | #print('total peak pkt arrive time feature:',len(features)) 151 | features += peak_relative_arrive_time_feature(up_peak) 152 | #print('up peak pkt arrive time feature:',len(features)) 153 | features += peak_relative_arrive_time_feature(down_peak) 154 | #print('down peak pkt arrive time feature:',len(features)) 155 | return features 156 | 157 | if __name__ == '__main__': 158 | packet_lengths=[(0,40),(0,53),(0,53),(0,1074),(0,73),(0,40),(0,217),(0,131),(0,209),(0,73),(0,40),(0,254),(0,73)] 159 | print(peak_pkt_length_feature(packet_lengths)) 160 | 161 | 162 | 163 | -------------------------------------------------------------------------------- /models/ml/appscanner/model.py: -------------------------------------------------------------------------------- 1 | __author__ = 'dk' 2 | from models.model_base import abs_model 3 | import os 4 | import shutil 5 | import json 6 | from config import raw_dataset_base, min_flow_len 7 | from models.ml.appscanner import feature_extractor 8 | import numpy as np 9 | from models.ml.appscanner import min_max 10 | import pickle 11 | import lightgbm as lgb 12 | import tqdm 13 | from sklearn.metrics import accuracy_score,classification_report 14 | from models.ml.appscanner.hyper_params import hyper_params 15 | 16 | class model(abs_model): 17 | def __init__(self, dataset, randseed, splitrate): 18 | super(model,self).__init__('appscanner',randseed= randseed) 19 | if os.path.exists(self.database) == False: 20 | os.makedirs(self.database,exist_ok=True) 21 | 22 | self.dataset = dataset 23 | self.model = self.database + '/'+ self.name + '_' + dataset + '_model' 24 | self.data = self.database + '/'+ self.name + '_' + dataset + '/' 25 | self.splitrate = splitrate 26 | #raw dataset directory 27 | full_rdata = raw_dataset_base + self.dataset 28 | self.full_rdata = full_rdata 29 | 30 | if self.data_exists() == False: 31 | self.parser_raw_data() 32 | 33 | 34 | def parser_raw_data(self): 35 | full_rdata = self.full_rdata 36 | if os.path.exists(full_rdata) == False: 37 | raise OSError('Dataset {0} (full path: {1}) does not exist!'.format(self.dataset,full_rdata)) 38 | #build the dataset required by AppScanner from the raw dataset directory 39 | X = [] 40 | y = [] 41 | for _root, _dirs, _files in os.walk(full_rdata): 42 | labels = [] 43 | for file in _files: 44 | labels.append(file) 45 | labels.sort()
46 | for file in tqdm.trange(len(_files)): 47 | file = _files[file] 48 | label = labels.index(file) 49 | file = _root + '/' + file 50 | 51 | with open(file) as fp: 52 | rdata = json.load(fp) 53 | 54 | for each in rdata : 55 | pkt_size= each['packet_length'] 56 | if len(pkt_size) < min_flow_len : 57 | continue 58 | x = feature_extractor.feature_extract(pkt_size) 59 | X.append(x) 60 | y.append(label) 61 | X = np.array(X) 62 | _max = np.array(min_max._max) 63 | _min = np.array(min_max._min) 64 | #归一化 65 | X = (X - _min)/(_max - _min) 66 | X = X.tolist() 67 | 68 | X_train = [] 69 | y_train = [] 70 | X_valid = [] 71 | y_valid = [] 72 | X_test = [] 73 | y_test = [] 74 | for i in range(len(X)): 75 | r = self.rand.uniform(0,1) 76 | if r < self.splitrate: 77 | X_test.append(X[i]) 78 | y_test.append(y[i]) 79 | elif r < self.splitrate * (2 - self.splitrate) : 80 | X_valid.append(X[i]) 81 | y_valid.append(y[i]) 82 | else: 83 | X_train.append(X[i]) 84 | y_train.append(y[i]) 85 | os.makedirs(self.data,exist_ok=True) 86 | 87 | with open(self.data + 'X_train.pkl','wb') as fp: 88 | pickle.dump(X_train, fp) 89 | 90 | with open(self.data + 'y_train.pkl','wb') as fp: 91 | pickle.dump(y_train,fp) 92 | 93 | with open(self.data + 'X_valid.pkl', 'wb') as fp: 94 | pickle.dump(X_valid,fp) 95 | 96 | with open(self.data + 'y_valid.pkl', 'wb') as fp: 97 | pickle.dump(y_valid, fp) 98 | 99 | with open(self.data + 'X_test.pkl', 'wb') as fp : 100 | pickle.dump(X_test, fp) 101 | 102 | with open(self.data + 'y_test.pkl' ,'wb') as fp: 103 | pickle.dump(y_test, fp) 104 | 105 | def load_data(self): 106 | with open(self.data + 'X_train.pkl','rb') as fp: 107 | X_train = pickle.load(fp) 108 | 109 | with open(self.data + 'y_train.pkl','rb') as fp: 110 | y_train = pickle.load(fp) 111 | 112 | with open(self.data + 'X_valid.pkl','rb') as fp: 113 | X_valid = pickle.load(fp) 114 | 115 | with open(self.data + 'y_valid.pkl','rb') as fp: 116 | y_valid = pickle.load(fp) 117 | 118 | with open(self.data + 'X_test.pkl','rb') as fp : 119 | X_test = pickle.load(fp) 120 | 121 | with open(self.data + 'y_test.pkl','rb') as fp: 122 | y_test = pickle.load(fp) 123 | 124 | return np.array(X_train), np.array(y_train), np.array(X_valid), np.array(y_valid), np.array(X_test), np.array(y_test) 125 | 126 | def train(self): 127 | X_train, y_train, X_valid, y_valid, X_test, y_test = self.load_data() 128 | lgb_train = lgb.Dataset(data=X_train,label=y_train) 129 | lgb_eval = lgb.Dataset(data=X_valid,label=y_valid) 130 | 131 | hyper_params['num_class'] = self.num_classes() 132 | gbm = lgb.train(params=hyper_params, 133 | train_set=lgb_train, 134 | valid_sets=lgb_eval, 135 | num_boost_round=50, 136 | early_stopping_rounds=5) 137 | #save model 138 | try: 139 | gbm.save_model(self.model) 140 | except BaseException as exp: 141 | pass 142 | logit = gbm.predict(data=X_test) 143 | label_predict = list(map(lambda x : np.argmax(x),logit)) 144 | 145 | accuracy = accuracy_score(y_test,label_predict) 146 | print('[Appscanner Test on {0} accuracy:{1}]'.format(self.dataset,accuracy)) 147 | 148 | def test(self): 149 | X_train, y_train, X_valid, y_valid, X_test, y_test = self.load_data() 150 | #load model 151 | try: 152 | gbm = lgb.Booster(model_file= self.model) 153 | except BaseException as exp: 154 | raise exp 155 | logit = gbm.predict(data=X_test) 156 | label_predict = list(map(lambda x : np.argmax(x),logit)) 157 | 158 | accuracy = accuracy_score(y_test,label_predict) 159 | report = classification_report(y_true=y_test,y_pred=label_predict) 160 | 161 | print("[Appscanner] 
Test on {0}, accuracy is {1}. ".format(self.dataset,accuracy)) 162 | print(report) 163 | 164 | if __name__ == '__main__': 165 | appscanner = model('website113', 128, 0.1) 166 | #appscanner.parser_raw_data() 167 | appscanner.train() 168 | appscanner.test() 169 | --------------------------------------------------------------------------------
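The class above only exposes train() and test() over the pre-split pickles. Below is a hedged sketch (not in the original file) of scoring a single raw flow with a booster saved by train(), reusing the same feature extraction and min-max scaling as parser_raw_data(); the packet-length list and model path in the commented call are purely illustrative.

import numpy as np
import lightgbm as lgb
from models.ml.appscanner import feature_extractor, min_max

def classify_flow(pkt_sizes, model_path):
    # same pipeline as parser_raw_data(): statistical features, then min-max scaling
    x = np.array(feature_extractor.feature_extract(pkt_sizes), dtype=float)
    x = (x - np.array(min_max._min)) / (np.array(min_max._max) - np.array(min_max._min))
    gbm = lgb.Booster(model_file=model_path)      # model file written by train()
    logit = gbm.predict(data=x.reshape(1, -1))    # shape (1, num_class)
    return int(np.argmax(logit, axis=1)[0])

# illustrative call with a hypothetical signed packet-length sequence and model path:
# label = classify_flow([64, -1448, -1448, 120, 583, -60, 1448, -52], 'appscanner_app60_model')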