# Repository layout:
#   classification.py, link ELF dataset.txt, link PSI_graph data.txt,
#   model.pkl, model.py, settings.json, train_CFG.py
#
# /classification.py:
"""Train and evaluate the TextCNN classifier on malware/benign embeddings."""
from torch.autograd import Variable
import torch
import torch.optim as optim
import torch.nn as nn

from sklearn.utils import shuffle
from sklearn.metrics import f1_score
from sklearn.metrics import roc_curve, auc

from gensim.models.keyedvectors import KeyedVectors
import numpy as np
import argparse
import copy
import pandas
import csv
import random
import pickle
from model import TextCNN

from utils import utils
import matplotlib.pyplot as plt


def prepareData(malFile, norFile):
    """Load both embedding files and label their samples.

    Every entry read from ``malFile`` gets label 0 and every entry read from
    ``norFile`` gets label 1.

    NOTE(review): ``utils().readFile`` is defined outside this file; it is
    presumably a mapping from sample name to a 1024-value vector -- confirm.

    Returns:
        (x, y): list of feature vectors and the parallel list of int labels.
    """
    reader = utils()
    x = []
    y = []
    for label, path in enumerate((malFile, norFile)):
        data = reader.readFile(path)
        for item in data:
            x.append(data[item])
            y.append(label)
    return x, y


def test(model, X, Y, batch_size=52):
    """Evaluate ``model`` on ``(X, Y)``: print F1 and AUC, then plot the ROC.

    Args:
        model: trained network called as ``model(batch, batch_len)`` and
            returning one row of two class scores per sample.
        X: list of feature vectors.
        Y: list of ground-truth labels (0 or 1), same length as ``X``.
        batch_size: number of samples per forward pass.
    """
    model.eval()
    # Run on whatever device the model already lives on (CPU or GPU).
    device = next(model.parameters()).device
    predLabel = []
    scores = []
    with torch.no_grad():
        # Step by batch_size so the final, shorter batch is evaluated too;
        # otherwise the predictions would be shorter than Y.
        for start in range(0, len(X), batch_size):
            batch_x = torch.FloatTensor(X[start:start + batch_size]).to(device)
            pred = model(batch_x, len(batch_x))
            predLabel.extend(pred.max(1)[1].cpu().numpy())
            # Softmax yields class probabilities whether the model emits
            # logits or log-probabilities.
            probs = nn.functional.softmax(pred, dim=1)
            scores.extend(probs[:, 1].cpu().numpy())
    print(f1_score(Y, predLabel))
    print(Y)
    # Draw ROC, AUC. The curve needs continuous scores for the positive
    # class; hard 0/1 predictions collapse it to a single operating point.
    fpr, tpr, thresholds = roc_curve(Y, np.array(scores), pos_label=1)
    aucValue = auc(fpr, tpr)
    print(aucValue)
    plt.figure()
    lw = 2
    plt.plot(fpr, tpr, color='darkorange', lw=lw,
             label='ROC curve (area = %0.2f)' % aucValue)
    plt.plot([0, 1], [0, 1], color='navy', lw=lw, linestyle='--')
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('Receiver operating characteristic example')
    plt.legend(loc="lower right")
    plt.show()


def train(x, y, epochs=100, batch_size=64, lr=1e-3):
    """Train a fresh TextCNN with SGD and return it.

    Args:
        x: list of feature vectors.
        y: list of int class labels, same length as ``x``.
        epochs: number of passes over the data.
        batch_size: number of samples per optimisation step.
        lr: SGD learning rate.

    Returns:
        The trained model (on GPU when one is available, else on CPU).
    """
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = TextCNN().to(device)
    # A real list: a bare filter() is exhausted after its first use on
    # Python 3, which silently disabled clipping from the second step on.
    parameters = [p for p in model.parameters() if p.requires_grad]
    optimizer = optim.SGD(model.parameters(), lr=lr)
    criterion = nn.CrossEntropyLoss(reduction="sum")

    for epoch in range(epochs):
        model.train()
        total = 0
        # Include the trailing partial batch so every sample is used and the
        # accuracy denominator len(x) matches the samples actually seen.
        for start in range(0, len(x), batch_size):
            batch_x = torch.FloatTensor(x[start:start + batch_size]).to(device)
            batch_y = torch.LongTensor(y[start:start + batch_size]).to(device)
            optimizer.zero_grad()
            pred = model(batch_x, len(batch_x))
            loss = criterion(pred, batch_y)
            loss.backward()
            nn.utils.clip_grad_norm_(parameters, max_norm=3)
            optimizer.step()
            total += int((pred.max(1)[1] == batch_y).sum())
        print("epoch ", epoch + 1, " acc: ", float(total)/len(x))
    return model


if __name__ == '__main__':
    data_x, data_y = prepareData('/media/aisu/Others/Hoang/SOIS_2018/CNN_ATrung/data/malware_dims_1024_epochs_100_lr_0.3_embeddings.txt', '/media/aisu/Others/Hoang/SOIS_2018/CNN_ATrung/data/benign_dims_1024_epochs_100_lr_0.3_embeddings.txt')
    data_x, data_y = shuffle(data_x, data_y)
    data_x_train = data_x[:3968]
    data_y_train = data_y[:3968]
    data_x_test = data_x[3968:]
    data_y_test = data_y[3968:]
    # Fit on the training split only; fitting on the full set would leak the
    # held-out samples into training and inflate the reported metrics.
    model = train(data_x_train, data_y_train)
    test(model, data_x_test, data_y_test)
    with open(r"model.pkl", "wb") as output_file:
        pickle.dump(model, output_file)

# /link ELF dataset.txt:
#   https://drive.google.com/file/d/1NK4DMFP5x75BbXZtw4Xgv7K8cBQcoPCW/view?usp=sharing
# /link PSI_graph data.txt:
# link PSI_graph data.txt:
#   https://drive.google.com/file/d/1eGny59CVvmPcQ9CpKZFQwGdL3fpTr3HW/view?usp=sharing
# /model.py:
import torch
import torch.nn as nn
import torch.nn.functional as F


class TextCNN(nn.Module):
    """1-D convolutional classifier over a 1024-value input vector.

    Six Conv1d+ReLU stages (the first, second and last followed by
    max-pooling) feed a single linear layer with two outputs.
    """

    def __init__(self):
        super(TextCNN, self).__init__()

        self.conv1 = nn.Sequential(
            nn.Conv1d(1, 256, kernel_size=7, stride=1),
            nn.ReLU(),
            nn.MaxPool1d(kernel_size=3, stride=3)
        )

        self.conv2 = nn.Sequential(
            nn.Conv1d(256, 256, kernel_size=7, stride=1),
            nn.ReLU(),
            nn.MaxPool1d(kernel_size=3, stride=3)
        )

        self.conv3 = nn.Sequential(
            nn.Conv1d(256, 256, kernel_size=3, stride=1),
            nn.ReLU()
        )

        self.conv4 = nn.Sequential(
            nn.Conv1d(256, 256, kernel_size=3, stride=1),
            nn.ReLU()
        )

        self.conv5 = nn.Sequential(
            nn.Conv1d(256, 256, kernel_size=3, stride=1),
            nn.ReLU()
        )

        self.conv6 = nn.Sequential(
            nn.Conv1d(256, 256, kernel_size=3, stride=1),
            nn.ReLU(),
            nn.MaxPool1d(kernel_size=3, stride=3)
        )

        # 8704 = 256 channels * 34 positions left after the conv/pool stack
        # is applied to a length-1024 input.
        self.fc = nn.Linear(8704, 2)

    def forward(self, x, num=None):
        """Return per-class log-probabilities of shape ``(batch, 2)``.

        Args:
            x: tensor holding ``batch * 1024`` values; it is reshaped to
                ``(batch, 1, 1024)``.
            num: batch size. Optional; when omitted it is inferred from the
                number of elements in ``x``.
        """
        batch = -1 if num is None else num
        x = x.view(batch, 1, 1024)
        x = self.conv1(x)
        x = self.conv2(x)
        x = self.conv3(x)
        x = self.conv4(x)
        x = self.conv5(x)
        x = self.conv6(x)

        x = x.view(x.size(0), -1)
        x = self.fc(x)
        # Normalise explicitly over the class axis; the implicit-dim form is
        # deprecated and emits a warning.
        return F.log_softmax(x, dim=1)

# /settings.json:
#   { "python.pythonPath": "/usr/bin/python" }
# /train_CFG.py:
"""Data loading and evaluation helpers for TextCNN on PSI-graph embeddings."""
from torch.autograd import Variable
import torch
import torch.optim as optim
import torch.nn as nn

from sklearn.utils import shuffle
from sklearn.metrics import f1_score
from sklearn.metrics import roc_curve, auc

from gensim.models.keyedvectors import KeyedVectors
import numpy as np
import argparse
import copy
import pandas
import csv
import random
import pickle
from model import TextCNN

from utils import utils
import matplotlib.pyplot as plt


def getClass(item, prefix_len=25, threshold=2838):
    """Derive the class label from a sample name.

    The first ``prefix_len`` characters are skipped and the text up to the
    first ``.`` is parsed as an integer index; indices below ``threshold``
    map to class 1, all others to class 0.

    NOTE(review): the 25-character prefix and the 2838 cut-off are tied to
    the naming scheme of the embedding file, which is not visible here --
    confirm against the data.

    Raises:
        ValueError: if the extracted text is not an integer.
    """
    # partition() keeps the whole remainder when there is no dot, whereas
    # find() returns -1 and would silently chop off the last digit.
    index_text = item[prefix_len:].partition('.')[0]
    return 1 if int(index_text) < threshold else 0


def prepareData(file):
    """Load one embedding file and label each entry via ``getClass``.

    NOTE(review): ``utils().readFile`` is defined outside this file; it is
    presumably a mapping from sample name to a 1024-value vector -- confirm.

    Returns:
        (x, y): list of feature vectors and the parallel list of int labels.
    """
    reader = utils()
    data = reader.readFile(file)
    x = []
    y = []
    for item in data:
        x.append(data[item])
        y.append(getClass(item))
    return x, y


def test(model, X, Y, batch_size=52):
    """Evaluate ``model`` on ``(X, Y)``: print F1 and AUC, then plot the ROC.

    Args:
        model: trained network called as ``model(batch, batch_len)`` and
            returning one row of two class scores per sample.
        X: list of feature vectors.
        Y: list of ground-truth labels (0 or 1), same length as ``X``.
        batch_size: number of samples per forward pass.
    """
    model.eval()
    # Run on whatever device the model already lives on (CPU or GPU).
    device = next(model.parameters()).device
    predLabel = []
    scores = []
    with torch.no_grad():
        # Step by batch_size so the final, shorter batch is evaluated too;
        # otherwise the predictions would be shorter than Y.
        for start in range(0, len(X), batch_size):
            batch_x = torch.FloatTensor(X[start:start + batch_size]).to(device)
            pred = model(batch_x, len(batch_x))
            predLabel.extend(pred.max(1)[1].cpu().numpy())
            # Softmax yields class probabilities whether the model emits
            # logits or log-probabilities.
            probs = nn.functional.softmax(pred, dim=1)
            scores.extend(probs[:, 1].cpu().numpy())
    print(f1_score(Y, predLabel))
    print(Y)
    # Draw ROC, AUC. The curve needs continuous scores for the positive
    # class; hard 0/1 predictions collapse it to a single operating point.
    fpr, tpr, thresholds = roc_curve(Y, np.array(scores), pos_label=1)
    aucValue = auc(fpr, tpr)
    print(aucValue)
    plt.figure()
    lw = 2
    plt.plot(fpr, tpr, color='darkorange', lw=lw,
             label='ROC curve (area = %0.2f)' % aucValue)
    plt.plot([0, 1], [0, 1], color='navy', lw=lw, linestyle='--')
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('Receiver operating characteristic example')
    plt.legend(loc="lower right")
    plt.show()

def train(x, y, epochs=50, batch_size=64, lr=1e-3):
    """Train a fresh TextCNN with Adam and return it.

    Args:
        x: list of feature vectors.
        y: list of int class labels, same length as ``x``.
        epochs: number of passes over the data.
        batch_size: number of samples per optimisation step.
        lr: Adam learning rate.

    Returns:
        The trained model (on GPU when one is available, else on CPU).
    """
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = TextCNN().to(device)
    # A real list: a bare filter() is exhausted after its first use on
    # Python 3, which silently disabled clipping from the second step on.
    parameters = [p for p in model.parameters() if p.requires_grad]
    optimizer = optim.Adam(model.parameters(), lr=lr)
    criterion = nn.CrossEntropyLoss(reduction="sum")

    for epoch in range(epochs):
        model.train()
        total = 0
        # Include the trailing partial batch so every sample is used and the
        # accuracy denominator len(x) matches the samples actually seen.
        for start in range(0, len(x), batch_size):
            batch_x = torch.FloatTensor(x[start:start + batch_size]).to(device)
            batch_y = torch.LongTensor(y[start:start + batch_size]).to(device)
            optimizer.zero_grad()
            pred = model(batch_x, len(batch_x))
            loss = criterion(pred, batch_y)
            loss.backward()
            nn.utils.clip_grad_norm_(parameters, max_norm=3)
            optimizer.step()
            total += int((pred.max(1)[1] == batch_y).sum())
        print("epoch ", epoch + 1, " acc: ", float(total)/len(x))
    return model


if __name__ == '__main__':
    data_x, data_y = prepareData('/media/aisu/Others/Hoang/SOIS_2018/CNN_ATrung/data/PSI_dims_1024_epochs_100_lr_0.3_embeddings.txt')
    data_x, data_y = shuffle(data_x, data_y)
    data_x_train = data_x[:3968]
    data_y_train = data_y[:3968]
    data_x_test = data_x[3968:]
    data_y_test = data_y[3968:]
    print(len(data_x_test), len(data_y_test))
    # Fit on the training split only; fitting on the full set would leak the
    # held-out samples into training and inflate the reported metrics.
    model = train(data_x_train, data_y_train)
    test(model, data_x_test, data_y_test)
    with open(r"model.pkl", "wb") as output_file:
        pickle.dump(model, output_file)