├── classification.py
├── link ELF dataset.txt
├── link PSI_graph data.txt
├── model.pkl
├── model.py
├── settings.json
└── train_CFG.py


/classification.py:
--------------------------------------------------------------------------------
  1 | from torch.autograd import Variable
  2 | import torch
  3 | import torch.optim as optim
  4 | import torch.nn as nn
  5 | 
  6 | from sklearn.utils import shuffle
  7 | from sklearn.metrics import f1_score
  8 | from sklearn.metrics import roc_curve, auc
  9 | 
 10 | from gensim.models.keyedvectors import KeyedVectors
 11 | import numpy as np
 12 | import argparse
 13 | import copy
 14 | import pandas
 15 | import csv
 16 | import random
 17 | import pickle
 18 | from model import TextCNN
 19 | 
 20 | from utils import utils
 21 | import matplotlib.pyplot as plt
 22 | 
 23 | def prepareData(malFile, norFile):
 24 |     a = utils()
 25 |     data = a.readFile(malFile)
 26 |     x = []
 27 |     y = []
 28 |     for item in data:
 29 |         x.append(data[item])
 30 |         y.append(0)
 31 |     data = a.readFile(norFile)
 32 |     for item in data:
 33 |         x.append(data[item])
 34 |         y.append(1)
 35 |     return x, y
 36 | 
 37 | def test(model, X, Y):
 38 |     model.eval()
 39 |     predLabel = []
 40 |     for i in range(len(X)/52):
 41 |         batch_x = X[i * 52 : (i + 1) * 52]
 42 |         batch_x = Variable(torch.FloatTensor(batch_x)).cuda()
 43 |         pred = model(batch_x, len(batch_x))
 44 |         predLabel.extend(pred.data.max(1)[1].cpu().numpy())
 45 |     print(f1_score(Y, predLabel))
 46 |     print(Y)
 47 |     predLabel = np.array(predLabel)
 48 |     # Draw ROC, AUC
 49 |     fpr, tpr, thresholds = roc_curve(Y, predLabel.round(decimals=3), pos_label = 1)
 50 |     aucValue = auc(fpr, tpr)
 51 |     print(aucValue)
 52 |     plt.figure()
 53 |     lw =2
 54 |     plt.plot(fpr, tpr, color='darkorange', lw=lw, label='ROC curve (area = %0.2f)' % auc(fpr, tpr))
 55 |     plt.plot([0, 1], [0, 1], color='navy', lw=lw, linestyle='--')
 56 |     plt.xlim([0.0, 1.0])
 57 |     plt.ylim([0.0, 1.05])
 58 |     plt.xlabel('False Positive Rate')
 59 |     plt.ylabel('True Positive Rate')
 60 |     plt.title('Receiver operating characteristic example')
 61 |     plt.legend(loc="lower right")
 62 |     plt.show()
 63 | 
 64 | def train(x, y):
 65 |     model = TextCNN()
 66 |     model = model.cuda()
 67 |     parameters = filter(lambda p: p.requires_grad, model.parameters())
 68 |     optimizer = optim.SGD(model.parameters(), lr=1e-3)
 69 |     criterion = nn.CrossEntropyLoss(size_average=False)
 70 | 
 71 |     for epoch in range(100):
 72 |         total = 0
 73 |         for i in range(0, len(x)/64):
 74 |             batch_x = x[i*64:(i+1)*64]
 75 |             batch_y = y[i*64:(i+1)*64]
 76 |             batch_x = Variable(torch.FloatTensor(batch_x)).cuda()
 77 |             batch_y = Variable(torch.LongTensor(batch_y)).cuda()
 78 |             optimizer.zero_grad()
 79 |             model.train()
 80 |             pred = model(batch_x, 64)
 81 |             loss = criterion(pred, batch_y)
 82 |             #print(loss)
 83 |             loss.backward()
 84 |             nn.utils.clip_grad_norm(parameters, max_norm=3)
 85 |             total += np.sum(pred.data.max(1)[1].cpu().numpy() == batch_y.data.cpu().numpy())
 86 |             optimizer.step()
 87 |         print("epoch ", epoch + 1, " acc: ", float(total)/len(x))
 88 |     return model
 89 | 
 90 | if __name__ == '__main__':
 91 |     data_x, data_y = prepareData('/media/aisu/Others/Hoang/SOIS_2018/CNN_ATrung/data/malware_dims_1024_epochs_100_lr_0.3_embeddings.txt', '/media/aisu/Others/Hoang/SOIS_2018/CNN_ATrung/data/benign_dims_1024_epochs_100_lr_0.3_embeddings.txt')
 92 |     data_x, data_y = shuffle(data_x, data_y)
 93 |     data_x_train = data_x[:3968]
 94 |     data_y_train = data_y[:3968]
 95 |     data_x_test = data_x[3968:]
 96 |     data_y_test = data_y[3968:]
 97 |     #print(len(data_x_test))
 98 |     model = train(data_x, data_y)
 99 |     test(model, data_x_test, data_y_test)
100 |     with open(r"model.pkl", "wb") as output_file:
101 |         pickle.dump(model, output_file)


--------------------------------------------------------------------------------
/link ELF dataset.txt:
--------------------------------------------------------------------------------
1 | https://drive.google.com/file/d/1NK4DMFP5x75BbXZtw4Xgv7K8cBQcoPCW/view?usp=sharing


--------------------------------------------------------------------------------
/link PSI_graph data.txt:
--------------------------------------------------------------------------------
1 | https://drive.google.com/file/d/1eGny59CVvmPcQ9CpKZFQwGdL3fpTr3HW/view?usp=sharing


--------------------------------------------------------------------------------
/model.py:
--------------------------------------------------------------------------------
 1 | import torch
 2 | import torch.nn as nn
 3 | import torch.nn.functional as F
 4 | 
 5 | 
 6 | class TextCNN(nn.Module):
 7 |     def __init__(self):
 8 |         super(TextCNN, self).__init__()
 9 | 
10 |         self.conv1 = nn.Sequential(
11 |             nn.Conv1d(1, 256, kernel_size=7, stride=1),
12 |             nn.ReLU(),
13 |             nn.MaxPool1d(kernel_size=3, stride=3)
14 |         )
15 | 
16 |         self.conv2 = nn.Sequential(
17 |             nn.Conv1d(256, 256, kernel_size=7, stride=1),
18 |             nn.ReLU(),
19 |             nn.MaxPool1d(kernel_size=3, stride=3)
20 |         )
21 | 
22 |         self.conv3 = nn.Sequential(
23 |             nn.Conv1d(256, 256, kernel_size=3, stride=1),
24 |             nn.ReLU()
25 |         )
26 | 
27 |         self.conv4 = nn.Sequential(
28 |             nn.Conv1d(256, 256, kernel_size=3, stride=1),
29 |             nn.ReLU()
30 |         )
31 | 
32 |         self.conv5 = nn.Sequential(
33 |             nn.Conv1d(256, 256, kernel_size=3, stride=1),
34 |             nn.ReLU()
35 |         )
36 | 
37 |         self.conv6 = nn.Sequential(
38 |             nn.Conv1d(256, 256, kernel_size=3, stride=1),
39 |             nn.ReLU(),
40 |             nn.MaxPool1d(kernel_size=3, stride=3)
41 |         )
42 | 
43 |         self.fc = nn.Linear(8704, 2)
44 | 
45 |     def forward(self, x, num):
46 |         x = x.view(num, 1, 1024)
47 |         x = self.conv1(x)
48 |         x = self.conv2(x)
49 |         x = self.conv3(x)
50 |         x = self.conv4(x)
51 |         x = self.conv5(x)
52 |         x = self.conv6(x)
53 | 
54 |         x = x.view(x.size(0), -1)
55 |         x = self.fc(x)
56 |         return F.log_softmax(x)


--------------------------------------------------------------------------------
/settings.json:
--------------------------------------------------------------------------------
1 | {
2 |     "python.pythonPath": "/usr/bin/python"
3 | }


--------------------------------------------------------------------------------
/train_CFG.py:
--------------------------------------------------------------------------------
  1 | from torch.autograd import Variable
  2 | import torch
  3 | import torch.optim as optim
  4 | import torch.nn as nn
  5 | 
  6 | from sklearn.utils import shuffle
  7 | from sklearn.metrics import f1_score
  8 | from sklearn.metrics import roc_curve, auc
  9 | 
 10 | from gensim.models.keyedvectors import KeyedVectors
 11 | import numpy as np
 12 | import argparse
 13 | import copy
 14 | import pandas
 15 | import csv
 16 | import random
 17 | import pickle
 18 | from model import TextCNN
 19 | 
 20 | from utils import utils
 21 | import matplotlib.pyplot as plt
 22 | 
 23 | def getClass(item):
 24 |     item = item[25:]
 25 |     item = item[:item.find('.')]
 26 |     if (int(item) < 2838):
 27 |         return 1
 28 |     else:
 29 |         return 0
 30 | 
 31 | def prepareData(file):
 32 |     a = utils()
 33 |     data = a.readFile(file)
 34 |     x = []
 35 |     y = []
 36 |     for item in data:
 37 |         #print(item)
 38 |         x.append(data[item])
 39 |         y.append(getClass(item))
 40 |     return x, y
 41 | 
 42 | def test(model, X, Y):
 43 |     model.eval()
 44 |     predLabel = []
 45 |     #Y = Y[:1872]
 46 |     for i in range(len(X)/52):
 47 |         batch_x = X[i * 52 : (i + 1) * 52]
 48 |         batch_x = Variable(torch.FloatTensor(batch_x)).cuda()
 49 |         pred = model(batch_x, len(batch_x))
 50 |         predLabel.extend(pred.data.max(1)[1].cpu().numpy())
 51 |     print(f1_score(Y, predLabel))
 52 |     print(Y)
 53 |     predLabel = np.array(predLabel)
 54 |     # Draw ROC, AUC
 55 |     fpr, tpr, thresholds = roc_curve(Y, predLabel.round(decimals=3), pos_label = 1)
 56 |     aucValue = auc(fpr, tpr)
 57 |     print(aucValue)
 58 |     plt.figure()
 59 |     lw =2
 60 |     plt.plot(fpr, tpr, color='darkorange', lw=lw, label='ROC curve (area = %0.2f)' % auc(fpr, tpr))
 61 |     plt.plot([0, 1], [0, 1], color='navy', lw=lw, linestyle='--')
 62 |     plt.xlim([0.0, 1.0])
 63 |     plt.ylim([0.0, 1.05])
 64 |     plt.xlabel('False Positive Rate')
 65 |     plt.ylabel('True Positive Rate')
 66 |     plt.title('Receiver operating characteristic example')
 67 |     plt.legend(loc="lower right")
 68 |     plt.show()
 69 | 
 70 | def train(x, y):
 71 |     model = TextCNN()
 72 |     model = model.cuda()
 73 |     parameters = filter(lambda p: p.requires_grad, model.parameters())
 74 |     optimizer = optim.Adam(model.parameters(), lr=1e-3)
 75 |     criterion = nn.CrossEntropyLoss(size_average=False)
 76 | 
 77 |     for epoch in range(50):
 78 |         total = 0
 79 |         for i in range(0, len(x)/64):
 80 |             batch_x = x[i*64:(i+1)*64]
 81 |             batch_y = y[i*64:(i+1)*64]
 82 |             batch_x = Variable(torch.FloatTensor(batch_x)).cuda()
 83 |             batch_y = Variable(torch.LongTensor(batch_y)).cuda()
 84 |             optimizer.zero_grad()
 85 |             model.train()
 86 |             pred = model(batch_x, 64)
 87 |             loss = criterion(pred, batch_y)
 88 |             #print(loss)
 89 |             loss.backward()
 90 |             nn.utils.clip_grad_norm(parameters, max_norm=3)
 91 |             total += np.sum(pred.data.max(1)[1].cpu().numpy() == batch_y.data.cpu().numpy())
 92 |             optimizer.step()
 93 |         print("epoch ", epoch + 1, " acc: ", float(total)/len(x))
 94 |     return model
 95 | 
 96 | if __name__ == '__main__':
 97 |     data_x, data_y = prepareData('/media/aisu/Others/Hoang/SOIS_2018/CNN_ATrung/data/PSI_dims_1024_epochs_100_lr_0.3_embeddings.txt')
 98 |     data_x, data_y = shuffle(data_x, data_y)
 99 |     data_x_train = data_x[:3968]
100 |     data_y_train = data_y[:3968]
101 |     data_x_test = data_x[3968:]
102 |     data_y_test = data_y[3968:]
103 |     print(len(data_x_test), len(data_y_test))
104 |     #print(len(data_x_test))
105 |     model = train(data_x, data_y)
106 |     test(model, data_x_test, data_y_test)
107 |     with open(r"model.pkl", "wb") as output_file:
108 |         pickle.dump(model, output_file)


--------------------------------------------------------------------------------