├── README.md
├── feature_based_original_dataset
│   ├── README.md
│   ├── adversarial_train.py
│   ├── defensive_distillation.py
│   ├── ensemble.py
│   ├── evaluate_models.py
│   ├── fgsm.py
│   ├── incremental_learning.py
│   ├── jsma.py
│   ├── label_encoding.py
│   ├── models.py
│   ├── models_grid_search.py
│   ├── neural_network.py
│   ├── nn_grid_search.py
│   ├── set_onehot_encoding.py
│   ├── testing_set_1000.txt
│   ├── testing_set_1500.txt
│   ├── train_models.py
│   ├── train_random_subsampling.py
│   └── training_set_1000.txt
├── feature_based_reduced_dataset
│   ├── README.md
│   ├── count_feature_variance.py
│   ├── detector.py
│   ├── eliminate_features.py
│   ├── eliminate_low_high_support_features.py
│   ├── eliminated_variance.csv
│   ├── jsma.py
│   ├── label_encoding.py
│   ├── models.py
│   ├── neural_network.py
│   ├── set_onehot_encoding.py
│   ├── testing_set_1000.txt
│   ├── testing_set_1500.txt
│   ├── testing_set_8500.txt
│   ├── train_models.py
│   ├── train_random_subsampl.py
│   ├── training_set_1500.txt
│   └── training_set_8500.txt
├── preprocessing
│   ├── README.md
│   ├── count_features_for_each_class.py
│   ├── extract_feature_occurrences.py
│   ├── extract_feature_types.py
│   ├── features_counter.csv
│   └── mean_features.py
└── requirements.txt
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | ## Research on Android malware detection based on ML models & the weakness of DNNs in adversarial examples.
2 | 
3 | 
4 | The experiments are carried out on the [Drebin dataset](https://www.sec.cs.tu-bs.de/~danarp/drebin/).
5 | 
6 | 
7 | 1) The preprocessing folder contains scripts related to the dataset and its features.
8 | 2) The feature_based_original_dataset folder contains scripts for the experiments on the whole feature space.
9 | 3) The feature_based_reduced_dataset folder contains scripts for the experiments on the reduced feature space.
10 | 
11 | 
12 | Each folder contains a README file to help you through the experiments.
13 | 
14 | 
--------------------------------------------------------------------------------
/feature_based_original_dataset/adversarial_train.py:
--------------------------------------------------------------------------------
1 | import tensorflow as tf
2 | import numpy as np
3 | from tensorflow import keras
4 | from sklearn.metrics import confusion_matrix
5 | import set_onehot_encoding as onehot
6 | import os
7 | import neural_network as NN
8 | import random
9 | 
10 | 
11 | def create_random_sets(set_size=1500, malware_ratio=0.3):
12 |     print("Generating set...")
13 |     testing_set = onehot.generate_set(set_size, malware_ratio)  # generate random set
14 |     print("Generating input...")
15 |     # shuffle the set randomly and perform one-hot encoding
16 |     test_data, test_labels = onehot.generate_input(testing_set, total_features)
17 |     return test_data, test_labels
18 | 
19 | 
20 | """
21 | functions to compute the Jacobian with numpy.
22 | https://medium.com/unit8-machine-learning-publication/computing-the-jacobian-matrix-of-a-neural-network-in-python-4f162e5db180
23 | First we specify the forward and backward passes of each layer to implement backpropagation manually.
24 | """
25 | 
26 | 
27 | def affine_forward(x, w, b):
28 |     """
29 |     Forward pass of an affine layer
30 |     :param x: input of dimension (I, )
31 |     :param w: weights matrix of dimension (I, O)
32 |     :param b: bias vector of dimension (O, )
33 |     :return output of dimension (O, ), and cache needed for backprop
34 |     """
35 |     out = np.dot(x, w) + b
36 |     cache = (x, w)
37 |     return out, cache
38 | 
39 | 
40 | def affine_backward(dout, cache):
41 |     """
42 |     Backward pass for an affine layer.
43 |     :param dout: Upstream Jacobian, of shape (M, O)
44 |     :param cache: Tuple of:
45 |       - x: Input data, of shape (I, )
46 |       - w: Weights, of shape (I, O)
47 |     :return the jacobian matrix containing derivatives of the M neural network outputs with respect to
48 |             this layer's inputs, evaluated at x, of shape (M, I)
49 |     """
50 |     x, w = cache
51 |     dx = np.dot(dout, w.T)
52 |     return dx
53 | 
54 | 
55 | def relu_forward(x):
56 |     """ Forward ReLU
57 |     """
58 |     out = np.maximum(np.zeros(x.shape), x)
59 |     cache = x
60 |     return out, cache
61 | 
62 | 
63 | def relu_backward(dout, cache):
64 |     """
65 |     Backward pass of ReLU
66 |     :param dout: Upstream Jacobian
67 |     :param cache: the cached input for this layer
68 |     :return: the jacobian matrix containing derivatives of the M neural network outputs with respect to
69 |              this layer's inputs, evaluated at x.
70 |     """
71 |     x = cache
72 |     dx = dout * np.where(x > 0, np.ones(x.shape), np.zeros(x.shape))
73 |     return dx
74 | 
75 | 
76 | def softmax_forward(x):
77 |     """ Forward softmax
78 |     """
79 |     exps = np.exp(x - np.max(x))
80 |     s = exps / exps.sum()
81 |     return s, s
82 | 
83 | 
84 | def softmax_backward(dout, cache):
85 |     """
86 |     Backward pass for softmax
87 |     :param dout: Upstream Jacobian
88 |     :param cache: contains the cache (in this case the output) for this layer
89 |     """
90 |     s = cache
91 |     ds = np.diag(s) - np.outer(s, s.T)
92 |     dx = np.dot(dout, ds)
93 |     return dx
94 | 
95 | 
96 | def get_activations(model, layer_id, X):
97 |     """
98 |     Computes outputs of intermediate layers
99 |     :param model: the trained model
100 |     :param layer_id: the id of the layer that we want the output from
101 |     :param X: input feature vector
102 |     :return: output of layer (layer_id)
103 |     """
104 |     intermediate_layer_model = keras.models.Model(inputs=model.input,
105 |                                                   outputs=model.layers[layer_id].output)
106 |     intermediate_output = intermediate_layer_model.predict(X)
107 |     return intermediate_output
108 | 
109 | 
110 | def forward_backward(model, x):
111 |     """
112 |     computes the forward derivative for the given input
113 |     :param model: the trained model
114 |     :param x: input feature vector
115 |     :return: prediction result and forward derivative
116 |     """
117 |     layer_to_cache = dict()  # for each layer, we store the cache needed for backward pass
118 |     forward_values = []
119 | 
120 |     for i in range(0, len(model.layers), 2):
121 |         values = {}
122 |         w, b = model.layers[i].get_weights()
123 |         values['w'] = w
124 |         values['b'] = b
125 |         forward_values.append(values)
126 | 
127 |     # Forward pass
128 |     a1, cache_a1 = affine_forward(x, forward_values[0]['w'], forward_values[0]['b'])
129 |     _, cache_r1 = relu_forward(a1)
130 |     r1 = get_activations(model, 0, x)
131 |     forward_values[0]['a'] = a1
132 |     forward_values[0]['cache_a'] = cache_a1
133 |     forward_values[0]['r'] = r1
134 |     forward_values[0]['cache_r'] = cache_r1
135 | 
136 |     for i, layer_index in zip(range(1, len(forward_values) - 1), range(2, len(model.layers), 2)):
137 |         a, cache_a = affine_forward(forward_values[i - 1]['r'], forward_values[i]['w'], forward_values[i]['b'])
138 |         _, cache_r = relu_forward(a)
139 |         r = get_activations(model, layer_index, x)
140 |         forward_values[i]['a'] = a
141 |         forward_values[i]['cache_a'] = cache_a
142 |         forward_values[i]['r'] = r
143 |         forward_values[i]['cache_r'] = cache_r
144 | 
145 |     a, cache_a = affine_forward(forward_values[len(forward_values) - 2]['r'],
146 |                                 forward_values[len(forward_values) - 1]['w'],
147 |                                 forward_values[len(forward_values) - 1]['b'])
148 |     forward_values[len(forward_values) - 1]['a'] = a
149 |     forward_values[len(forward_values) - 1]['cache_a'] = cache_a
150 |     out, cache_out = softmax_forward(a)
151 | 
152 |     # backward pass
153 |     dout = np.diag(np.ones(out.size, ))  # the derivatives of each output w.r.t. each output.
154 |     dout = softmax_backward(dout, cache_out)
155 |     dout = affine_backward(dout, forward_values[len(forward_values) - 1]['cache_a'])
156 | 
157 |     for i in range(len(forward_values) - 2, 0, -1):
158 |         dout = relu_backward(dout, forward_values[i]['cache_r'])
159 |         dout = affine_backward(dout, forward_values[i]['cache_a'])
160 | 
161 |     dout = relu_backward(dout, forward_values[0]['cache_r'])
162 |     dx = affine_backward(dout, forward_values[0]['cache_a'])
163 | 
164 |     return out, dx
165 | 
166 | 
167 | def craft_adversarial_samples(x, y, F, k):
168 |     """
169 |     :param x: input feature vector
170 |     :param y: target class
171 |     :param F: the trained model
172 |     :param k: maximum allowed distortion (upper bound on the L1 norm of the perturbation)
173 |     :return: adversarial sample based on feature vector x
174 |     """
175 |     x_adv = x
176 |     gamma = [1] * len(x)
177 |     delta_x = [0]
178 |     changes = 0
179 | 
180 |     if np.argmax(F.predict(x_adv), 1) == 0:  # if misclassification achieved return adv_x
181 |         return x_adv, -1
182 | 
183 |     while np.argmax(F.predict(x_adv), 1) != y and np.linalg.norm(delta_x, ord=1) < k and changes < 20:
184 |         # compute forward derivative (Jacobian)
185 |         prob, forward_derivative = forward_backward(F, x_adv)
186 | 
187 |         tmp = np.multiply(forward_derivative[0], gamma)
188 |         for i, feature in enumerate(x_adv[0]):
189 |             if feature == 1:
190 |                 tmp[i] = 0
191 |         i_max = np.argmax(tmp)
192 |         if i_max <= 0:
193 |             raise ValueError('FAILURE: We can only add features to an application!')
194 | 
195 |         x_adv[0][i_max] = 1
196 |         delta_x = np.subtract(x_adv, x)
197 |         # print(i_max)
198 |         if i_max not in changes_dict:
199 |             changes_dict[i_max] = 1
200 |         else:
201 |             changes_dict[i_max] += 1
202 |         changes += 1
203 |         print("Changes:", changes)
204 | 
205 |     return x_adv, changes
206 | 
207 | 
208 | def adversarial_training():
209 |     NN.train_neural_network(trained_model, 4, 15, val_data, val_labels, verbose=2)
210 |     trained_model.save('Adam_adversarial_training_adv_1500_0.3.h5')
211 | 
212 |     predictions = trained_model.predict(val_data)
213 |     confusion = confusion_matrix(val_labels, np.argmax(predictions, axis=1))
214 |     print(confusion)
215 |     TP = confusion[1, 1]
216 |     TN = confusion[0, 0]
217 |     FP = confusion[0, 1]
218 |     FN = confusion[1, 0]
219 |     FNR = FN / float(FN + TP) * 100
220 |     FPR = FP / float(FP + TN) * 100
221 |     accuracy = ((TP + TN) / float(TP + TN + FP + FN)) * 100
222 |     print("Adversarial FP:", FP, "- FN:", FN, "- TP:", TP, "- TN", TN)
223 |     print("Adversarial Accuracy:", accuracy, "- FPR:", FPR, "- FNR:", FNR)
224 | 
225 | 
226 | if __name__ == "__main__":
227 |     total_features = 545333  # total unique features
228 |     print("Creating data-labels...")
229 |     onehot.create_list_of_apps()  # function from set_onehot_encoding.py
230 | 
231 |     changes_dict = {}  # dictionary for perturbations (added features)
232 | 
233 |     trained_model = tf.keras.models.load_model('Adam_adversarial_training_adv_1500_0.3_.h5')
234 | 
235 |     averageChanges = 0
236 |     val_data, val_labels = create_random_sets(set_size=800, malware_ratio=0.3)
237 | 
238 |     average_changes = 0
239 |     amount_malwares = 0
240 |     adv_counter = 0
241 | 
242 |     for i in range(len(val_data)):
243 | 
244 |         if val_labels[i] == 1:
245 | 
246 |             x = val_data[i:i + 1]
247 |             # print("x: ", x)
248 |             # print(x.shape)
249 |             try:
250 |                 adv_x, changes = craft_adversarial_samples(x, 0, trained_model, 1)
251 |                 # print(adv_x)
252 |                 val_data[i] = adv_x
253 | 
254 |                 if changes >= 0:
255 |                     average_changes += changes
256 |                     amount_malwares += 1
257 |             except NameError:
258 |                 pass
259 |             except ValueError:
260 |                 pass
261 | 
262 |     if amount_malwares > 0:
263 |         averageChanges += (average_changes / float(amount_malwares))
264 | 
265 |     adversarial_training()
266 | 
--------------------------------------------------------------------------------
/feature_based_original_dataset/defensive_distillation.py:
--------------------------------------------------------------------------------
1 | from keras import Sequential
2 | from keras.callbacks import TensorBoard, EarlyStopping, ModelCheckpoint
3 | from keras.layers import Dense, Dropout
4 | from keras.optimizers import Adam
5 | import tensorflow as tf
6 | import set_onehot_encoding as onehot
7 | import os
8 | import keras
9 | 
10 | total_features = 545333  # total unique features
11 | path = "defensive_distillation/"
12 | 
13 | if not os.path.exists(path):  # check if path exists
14 |     os.mkdir(path)
15 | print("Creating data-labels...")
16 | onehot.create_list_of_apps()  # function from set_onehot_encoding.py
17 | 
18 | 
19 | def create_training_input():
20 |     if os.path.isfile("training_set_1500.txt") is False:
21 |         set_size = 1500
22 |         malware_ratio = 0.3
23 |         print("Creating data-labels...")
24 |         print("Generating TRAINING set...")
25 |         training_set = onehot.generate_set(set_size, malware_ratio)  # generate random training set
26 |         with open("training_set_1500.txt", "w") as file:
27 |             for item in training_set:
28 |                 file.write(str(item) + "\n")
29 |     training_set = []  # the list of training set apps
30 |     with open("training_set_1500.txt", "r") as file:  # read training set file and append applications to list
31 |         for line in file:
32 |             line.strip()
33 |             line = line[:-1]
34 |             training_set.append(line)
35 |     print("Generating TRAINING input...")
36 |     data, labels = onehot.generate_input(training_set, total_features)  # perform one-hot encoding
37 |     return data, labels
38 | 
39 | 
40 | def create_testing_input():
41 |     if os.path.isfile("testing_set_1500.txt") is False:
42 |         set_size = 1500
43 |         malware_ratio = 0.3
44 |         print("Creating data-labels...")
45 |         print("Generating TESTING set...")
46 |         testing_set = onehot.generate_set(set_size, malware_ratio)  # generate random testing set
47 |         with open("testing_set_1500.txt", "w") as file:
48 |             for item in testing_set:
49 |                 file.write(str(item) + "\n")
50 |     testing_set = []  # the list of testing set apps
51 |     with open("testing_set_1500.txt", "r") as file:  # read testing set file and append applications to list
52 |         for line in file:
53 |             line.strip()
54 |             line = line[:-1]
55 |             testing_set.append(line)
56 |     print("Generating TESTING input...")
57 |     test_data, test_labels = onehot.generate_input(testing_set, total_features)  # perform one-hot encoding
58 |     return test_data, test_labels
59 | 
60 | 
61 | def train(train_data, train_labels, test_data, test_labels, file_name,
62 |           epochs=4, batch_size=150, train_temp=1, init=None, callbacks=False):
63 |     # neural net parameters
64 |     units = [200, 200]
65 |     activation_function = "relu"
66 |     kernel = "glorot_uniform"
67 |     bias = "zeros"
68 |     dropout = 0.2
69 |     learn_rate = 0.001
70 | 
71 |     model = Sequential()  # neural net init
72 |     model.add(Dense(units=units[0], activation=activation_function, input_dim=total_features, kernel_initializer=kernel,
73 |                     bias_initializer=bias))
74 |     model.add(Dropout(dropout))  # add dropout rate
75 | 
76 |     for hidden_layer_units in units[1:]:  # add hidden layers with the units defined above
77 |         model.add(Dense(units=hidden_layer_units, activation=activation_function, kernel_initializer=kernel,
78 |                         bias_initializer=bias))
79 |         model.add(Dropout(dropout))
80 | 
81 |     model.add(Dense(2))  # output layer, with two neurons and no activation function (raw logits)
82 | 
83 |     if init is not None:
84 |         model.load_weights(init)
85 | 
86 |     def fn(correct, predicted):
87 |         return tf.nn.softmax_cross_entropy_with_logits(labels=correct, logits=(predicted / train_temp))
88 | 
89 |     # the loss is the fn method defined above; Adam optimizer
90 |     model.compile(loss=fn,
91 |                   optimizer=Adam(lr=learn_rate),
92 |                   metrics=["accuracy"])
93 | 
94 |     if callbacks:
95 |         log_dir = path + "log/dir/DNN"
96 |         tensorboard_callback = TensorBoard(log_dir=log_dir, histogram_freq=1, write_graph=True, write_images=True)
97 |         early_stopping_callback = EarlyStopping(monitor='val_loss', mode='min', patience=10, verbose=2)
98 |         model_checkpoint_callback = ModelCheckpoint(file_name, monitor='val_accuracy',
99 |                                                     mode='max',
100 |                                                     verbose=2, save_best_only=True)
101 |         model.fit(train_data, train_labels,
102 |                   epochs=epochs,
103 |                   batch_size=batch_size,
104 |                   validation_data=(test_data, test_labels),
105 |                   callbacks=[tensorboard_callback, early_stopping_callback, model_checkpoint_callback],
106 |                   verbose=2)
107 |     else:
108 |         model.fit(train_data, train_labels,
109 |                   epochs=epochs,
110 |                   batch_size=batch_size,
111 |                   validation_data=(test_data, test_labels),
112 |                   verbose=2)
113 | 
114 |     if file_name is not None:
115 |         model.save(file_name)
116 | 
117 |     return model
118 | 
119 | 
120 | def train_distillation(features, labels, file_name, epochs=4, batch_size=150, train_temp=1):
121 |     """
122 |     :param features: the train data
123 |     :param labels: the train labels
124 |     :param file_name: the file to save teacher and student
125 |     :param epochs: number of epochs
126 |     :param batch_size: batch size
127 |     :param train_temp: temperature
128 |     :return:
129 |     """
130 |     if not os.path.exists(file_name + "_init"):
131 |         # train for one epoch to get a starting point
132 |         train(features, labels, test_data, test_labels, file_name + "_init", 1, batch_size)
133 | 
134 |     # train the teacher at the given temperature
135 |     print("Temperature:", train_temp)
136 |     teacher = train(features, labels, test_data, test_labels, file_name + "_teacher", epochs, batch_size,
137 |                     train_temp, init=file_name + "_init")
138 | 
139 |     predicted = teacher.predict(features)  # evaluate the labels at the given temperature
140 |     print(predicted)
141 | 
142 |     with tf.compat.v1.Session() as sess:
143 |         y = sess.run(tf.nn.softmax(predicted / train_temp))
144 |         print(y)
145 |         train_labels = y
146 | 
147 |     # train the student at temperature t
148 |     student = train(features, train_labels, test_data, test_labels, file_name, epochs, batch_size,
149 |                     train_temp, init=file_name + "_init")
150 |     # predict at temperature 1
151 |     predicted = student.predict(features)
152 |     # print(predicted)
153 | 
154 | 
155 | data, labels = create_training_input()  # init train data-labels
156 | test_data, test_labels = create_testing_input()  # init test data-labels
157 | # we use categorical cross-entropy, so the labels must be one-hot encoded
158 | labels = keras.utils.to_categorical(labels, 2)
159 | test_labels = keras.utils.to_categorical(test_labels, 2)
160 | 
161 | # first train with original temperature (= 1)
162 | train(data, labels, test_data, test_labels, path + "original", epochs=30, callbacks=True)
163 | # train teacher and student networks with a predefined temperature
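# (illustration, not part of the original code) a higher temperature flattens the
# soft labels the student is trained on: for logits [4.0, 0.0], the softmax at
# T=1 is ~[0.982, 0.018], while at T=120 it is ~[0.508, 0.492], so the student
# sees much softer targets than the hard 0/1 labels.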
164 | train_distillation(data, labels, path + "distilled-100", epochs=7, batch_size=150, train_temp=120)
165 | 
--------------------------------------------------------------------------------
/feature_based_original_dataset/ensemble.py:
--------------------------------------------------------------------------------
1 | import tensorflow as tf
2 | import numpy as np
3 | from tensorflow import keras
4 | from sklearn.metrics import confusion_matrix
5 | from joblib import load
6 | import set_onehot_encoding as onehot
7 | import os
8 | 
9 | def create_set():
10 |     if os.path.isfile("testing_set_200.txt") is False:
11 |         set_size = 200
12 |         malware_ratio = 0.5
13 |         print("Creating data-labels...")
14 |         print("Generating TESTING set...")
15 |         testing_set = onehot.generate_set(set_size, malware_ratio)  # generate random testing set
16 |         with open("testing_set_200.txt", "w") as file:
17 |             for item in testing_set:
18 |                 file.write(str(item) + "\n")
19 |     testing_set = []  # the list of testing set apps
20 |     with open("testing_set_200.txt", "r") as file:  # read testing set file and append applications to list
21 |         for line in file:
22 |             line.strip()
23 |             line = line[:-1]
24 |             testing_set.append(line)
25 |     print("Generating TESTING input...")
26 |     test_data, test_labels = onehot.generate_input(testing_set, total_features)  # perform one-hot encoding
27 |     return test_data, test_labels
28 | 
29 | 
30 | """
31 | functions to compute the Jacobian with numpy.
32 | https://medium.com/unit8-machine-learning-publication/computing-the-jacobian-matrix-of-a-neural-network-in-python-4f162e5db180
33 | First we specify the forward and backward passes of each layer to implement backpropagation manually.
34 | """
35 | 
36 | 
37 | def affine_forward(x, w, b):
38 |     """
39 |     Forward pass of an affine layer
40 |     :param x: input of dimension (I, )
41 |     :param w: weights matrix of dimension (I, O)
42 |     :param b: bias vector of dimension (O, )
43 |     :return output of dimension (O, ), and cache needed for backprop
44 |     """
45 |     out = np.dot(x, w) + b
46 |     cache = (x, w)
47 |     return out, cache
48 | 
49 | 
50 | def affine_backward(dout, cache):
51 |     """
52 |     Backward pass for an affine layer.
53 |     :param dout: Upstream Jacobian, of shape (M, O)
54 |     :param cache: Tuple of:
55 |       - x: Input data, of shape (I, )
56 |       - w: Weights, of shape (I, O)
57 |     :return the jacobian matrix containing derivatives of the M neural network outputs with respect to
58 |             this layer's inputs, evaluated at x, of shape (M, I)
59 |     """
60 |     x, w = cache
61 |     dx = np.dot(dout, w.T)
62 |     return dx
63 | 
64 | 
65 | def relu_forward(x):
66 |     """ Forward ReLU
67 |     """
68 |     out = np.maximum(np.zeros(x.shape), x)
69 |     cache = x
70 |     return out, cache
71 | 
72 | 
73 | def relu_backward(dout, cache):
74 |     """
75 |     Backward pass of ReLU
76 |     :param dout: Upstream Jacobian
77 |     :param cache: the cached input for this layer
78 |     :return: the jacobian matrix containing derivatives of the M neural network outputs with respect to
79 |              this layer's inputs, evaluated at x.
80 |     """
81 |     x = cache
82 |     dx = dout * np.where(x > 0, np.ones(x.shape), np.zeros(x.shape))
83 |     return dx
84 | 
85 | 
86 | def softmax_forward(x):
87 |     """ Forward softmax
88 |     """
89 |     exps = np.exp(x - np.max(x))
90 |     s = exps / exps.sum()
91 |     return s, s
92 | 
93 | 
94 | def softmax_backward(dout, cache):
95 |     """
96 |     Backward pass for softmax
97 |     :param dout: Upstream Jacobian
98 |     :param cache: contains the cache (in this case the output) for this layer
99 |     """
100 |     s = cache
101 |     ds = np.diag(s) - np.outer(s, s.T)
102 |     dx = np.dot(dout, ds)
103 |     return dx
104 | 
105 | 
106 | def get_activations(model, layer_id, X):
107 |     """
108 |     Computes outputs of intermediate layers
109 |     :param model: the trained model
110 |     :param layer_id: the id of the layer that we want the output from
111 |     :param X: input feature vector
112 |     :return: output of layer (layer_id)
113 |     """
114 |     intermediate_layer_model = keras.models.Model(inputs=model.input,
115 |                                                   outputs=model.layers[layer_id].output)
116 |     intermediate_output = intermediate_layer_model.predict(X)
117 |     return intermediate_output
118 | 
119 | 
120 | def forward_backward(model, x):
121 |     """
122 |     computes the forward derivative for the given input
123 |     :param model: the trained model
124 |     :param x: input feature vector
125 |     :return: prediction result and forward derivative
126 |     """
127 |     layer_to_cache = dict()  # for each layer, we store the cache needed for backward pass
128 |     forward_values = []
129 | 
130 |     for i in range(0, len(model.layers), 2):
131 |         values = {}
132 |         w, b = model.layers[i].get_weights()
133 |         values['w'] = w
134 |         values['b'] = b
135 |         forward_values.append(values)
136 | 
137 |     # Forward pass
138 |     a1, cache_a1 = affine_forward(x, forward_values[0]['w'], forward_values[0]['b'])
139 |     _, cache_r1 = relu_forward(a1)
140 |     r1 = get_activations(model, 0, x)
141 |     forward_values[0]['a'] = a1
142 |     forward_values[0]['cache_a'] = cache_a1
143 |     forward_values[0]['r'] = r1
144 |     forward_values[0]['cache_r'] = cache_r1
145 | 
146 |     for i, layer_index in zip(range(1, len(forward_values) - 1), range(2, len(model.layers), 2)):
147 |         a, cache_a = affine_forward(forward_values[i - 1]['r'], forward_values[i]['w'], forward_values[i]['b'])
148 |         _, cache_r = relu_forward(a)
149 |         r = get_activations(model, layer_index, x)
150 |         forward_values[i]['a'] = a
151 |         forward_values[i]['cache_a'] = cache_a
152 |         forward_values[i]['r'] = r
153 |         forward_values[i]['cache_r'] = cache_r
154 | 
155 |     a, cache_a = affine_forward(forward_values[len(forward_values) - 2]['r'],
156 |                                 forward_values[len(forward_values) - 1]['w'],
157 |                                 forward_values[len(forward_values) - 1]['b'])
158 |     forward_values[len(forward_values) - 1]['a'] = a
159 |     forward_values[len(forward_values) - 1]['cache_a'] = cache_a
160 |     out, cache_out = softmax_forward(a)
161 | 
162 |     # backward pass
163 |     dout = np.diag(np.ones(out.size, ))  # the derivatives of each output w.r.t. each output.
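    # (note) seeding the backward pass with the identity matrix means that row i
    # of dout accumulates d(out_i)/d(layer input) as it propagates through the
    # softmax_backward, affine_backward and relu_backward calls below.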
164 |     dout = softmax_backward(dout, cache_out)
165 |     dout = affine_backward(dout, forward_values[len(forward_values) - 1]['cache_a'])
166 | 
167 |     for i in range(len(forward_values) - 2, 0, -1):
168 |         dout = relu_backward(dout, forward_values[i]['cache_r'])
169 |         dout = affine_backward(dout, forward_values[i]['cache_a'])
170 | 
171 |     dout = relu_backward(dout, forward_values[0]['cache_r'])
172 |     dx = affine_backward(dout, forward_values[0]['cache_a'])
173 | 
174 |     return out, dx
175 | 
176 | 
177 | def craft_adversarial_samples(x, y, F, k):
178 | 
179 |     x_adv = x
180 |     gamma = [1] * len(x)
181 |     delta_x = [0]
182 |     changes = 0
183 | 
184 |     if np.argmax(F.predict(x_adv), 1) == 0:  # if misclassification achieved return adv_x
185 |         return x_adv, -1
186 | 
187 |     while np.argmax(F.predict(x_adv), 1) != y and np.linalg.norm(delta_x, ord=1) < k and changes < 20:
188 |         # compute forward derivative (Jacobian)
189 |         prob, forward_derivative = forward_backward(F, x_adv)
190 | 
191 |         tmp = np.multiply(forward_derivative[0], gamma)
192 |         for i, feature in enumerate(x_adv[0]):
193 |             if feature == 1:
194 |                 tmp[i] = 0
195 |         i_max = np.argmax(tmp)
196 |         if i_max <= 0:
197 |             raise ValueError('FAILURE: We can only add features to an application!')
198 | 
199 |         x_adv[0][i_max] = 1
200 |         delta_x = np.subtract(x_adv, x)
201 |         # print(i_max)
202 |         if i_max not in changes_dict:
203 |             changes_dict[i_max] = 1
204 |         else:
205 |             changes_dict[i_max] += 1
206 |         changes += 1
207 |         print("Changes:", changes)
208 | 
209 |     return x_adv, changes
210 | 
211 | 
212 | def load_models():
213 |     """
214 |     load saved models (classic ml & neural nets with different optimizers)
215 |     """
216 |     adam = tf.keras.models.load_model(path + "model_Adam.h5")
217 |     sgd_mom = tf.keras.models.load_model(path + "model_SGD_mom.h5")
218 | 
219 |     # classic ml
220 |     rf = load('models/model_RandomForestClassifier.joblib')
221 |     lr = load('models/model_LogisticRegression.joblib')
222 | 
223 |     return adam, sgd_mom, rf, lr
224 | 
225 | 
226 | def final_prediction(adam, sgd, rf, lr):
227 |     sum_pred = []
228 |     for i in range(len(adam)):
229 |         sum_pred.append([])
230 |         for j in range(len(adam[i])):
231 |             sum_pred[i].append((adam[i][j] + sgd[i][j] + rf[i][j] + lr[i][j]) / 4)  # average all four class probabilities
232 |     return sum_pred
233 | 
234 | 
235 | def evaluate_without_adv():
236 | 
237 |     val_data, val_labels = create_set()
238 |     adam, sgd_mom, rf, lr = load_models()
239 | 
240 |     adam_pred = adam.predict(val_data)
241 |     sgd_mom_pred = sgd_mom.predict(val_data)
242 |     rf_pred = rf.predict_proba(val_data)
243 |     lr_pred = lr.predict_proba(val_data)
244 |     predictions = final_prediction(adam_pred, sgd_mom_pred, rf_pred, lr_pred)
245 | 
246 |     confusion = confusion_matrix(val_labels, np.argmax(predictions, axis=1))
247 |     print(confusion)
248 |     TP = confusion[1, 1]
249 |     TN = confusion[0, 0]
250 |     FP = confusion[0, 1]
251 |     FN = confusion[1, 0]
252 |     FNR = FN / float(FN + TP) * 100
253 |     FPR = FP / float(FP + TN) * 100
254 |     accuracy = ((TP + TN) / float(TP + TN + FP + FN)) * 100
255 |     print("FP:", FP, "- FN:", FN, "- TP:", TP, "- TN", TN)
256 |     print("Accuracy:", accuracy, "- FPR:", FPR, "- FNR:", FNR)
257 | 
258 | 
259 | def evaluate_adam():
260 |     trained_model = tf.keras.models.load_model(path + "model_Adam.h5")
261 |     val_data, val_labels = create_set()
262 | 
263 |     averageChanges = 0
264 |     predict_original = trained_model.predict(val_data)
265 |     confusion = confusion_matrix(val_labels, np.argmax(predict_original, axis=1))
266 |     TP = confusion[1, 1]
267 |     TN = confusion[0, 0]
268 |     FP = confusion[0, 1]
269 |     FN = confusion[1, 0]
270 |     FNR_original = FN / float(FN + TP) * 100
271 |     FPR = FP / float(FP + TN) * 100
272 |     accuracy = ((TP + TN) / float(TP + TN + FP + FN)) * 100
273 |     print(confusion)
274 |     print("Original FP:", FP, "- FN:", FN, "- TP:", TP, "- TN", TN)
275 |     print("Original Accuracy:", accuracy, "- FPR:", FPR, "- FNR:", FNR_original)
276 |     del predict_original
277 |     average_changes = 0
278 |     amount_malwares = 0
279 | 
280 |     for i in range(len(val_data)):
281 | 
282 |         if val_labels[i] == 1:
283 | 
284 |             x = val_data[i:i + 1]
285 |             # print("x: ", x)
286 |             # print(x.shape)
287 |             try:
288 |                 adv_x, changes = craft_adversarial_samples(x, 0, trained_model, 1)
289 |                 # print(adv_x)
290 |                 val_data[i] = adv_x
291 |                 if changes >= 0:
292 |                     average_changes += changes
293 |                     amount_malwares += 1
294 |             except NameError:
295 |                 pass
296 |             except ValueError:
297 |                 pass
298 | 
299 |     if amount_malwares > 0:
300 |         averageChanges += (average_changes / float(amount_malwares))
301 |     # print(val_data.shape)
302 | 
303 |     # evaluate the model on adversarial examples
304 |     predictions = trained_model.predict(val_data)
305 |     confusion = confusion_matrix(val_labels, np.argmax(predictions, axis=1))
306 |     print(confusion)
307 |     TP = confusion[1, 1]
308 |     TN = confusion[0, 0]
309 |     FP = confusion[0, 1]
310 |     FN = confusion[1, 0]
311 |     FNR = FN / float(FN + TP) * 100
312 |     FPR = FP / float(FP + TN) * 100
313 |     accuracy = ((TP + TN) / float(TP + TN + FP + FN)) * 100
314 |     print("Adversarial FP:", FP, "- FN:", FN, "- TP:", TP, "- TN", TN)
315 |     print("Adversarial Accuracy:", accuracy, "- FPR:", FPR, "- FNR:", FNR)
316 |     print("Misclassification Rate:", FNR - FNR_original)
317 |     print("Distortion:", averageChanges)
318 |     print(changes_dict)
319 | 
320 | 
321 | def evaluate_ensembles():
322 |     trained_model = tf.keras.models.load_model(path + "model_Adam.h5")
323 |     val_data, val_labels = create_set()
324 | 
325 |     averageChanges = 0
326 |     average_changes = 0
327 |     amount_malwares = 0
328 | 
329 |     for i in range(len(val_data)):
330 | 
331 |         if val_labels[i] == 1:
332 | 
333 |             x = val_data[i:i + 1]
334 |             # print("x: ", x)
335 |             # print(x.shape)
336 |             try:
337 |                 adv_x, changes = craft_adversarial_samples(x, 0, trained_model, 1)
338 |                 # print(adv_x)
339 |                 val_data[i] = adv_x
340 |                 if changes >= 0:
341 |                     average_changes += changes
342 |                     amount_malwares += 1
343 |             except NameError:
344 |                 pass
345 |             except ValueError:
346 |                 pass
347 | 
348 |     if amount_malwares > 0:
349 |         averageChanges += (average_changes / float(amount_malwares))
350 |     # print(val_data.shape)
351 | 
352 |     # evaluate the models on adversarial examples
353 |     adam, sgd_mom, rf, lr = load_models()
354 |     adam_pred = adam.predict(val_data)
355 |     sgd_mom_pred = sgd_mom.predict(val_data)
356 |     rf_pred = rf.predict_proba(val_data)
357 |     lr_pred = lr.predict_proba(val_data)
358 | 
359 |     predictions = final_prediction(adam_pred, sgd_mom_pred, rf_pred, lr_pred)  # pass the probability outputs, not the model objects
360 | 
361 |     confusion = confusion_matrix(val_labels, np.argmax(predictions, axis=1))
362 |     print(confusion)
363 |     TP = confusion[1, 1]
364 |     TN = confusion[0, 0]
365 |     FP = confusion[0, 1]
366 |     FN = confusion[1, 0]
367 |     FNR = FN / float(FN + TP) * 100
368 |     FPR = FP / float(FP + TN) * 100
369 |     accuracy = ((TP + TN) / float(TP + TN + FP + FN)) * 100
370 |     print("FP:", FP, "- FN:", FN, "- TP:", TP, "- TN", TN)
371 |     print("Accuracy:", accuracy, "- FPR:", FPR, "- FNR:", FNR)
372 | 
373 | 
374 | if __name__ == "__main__":
375 |     path = "models_incremental_learning/"
376 |     total_features = 545333  # total unique features
377 |     print("Creating data-labels...")
378 |     onehot.create_list_of_apps()  # function from set_onehot_encoding.py
379 | 
380 |     changes_dict = {}  # dictionary for perturbations (added features)
381 | 
382 |     # evaluate_adam()
383 |     evaluate_without_adv()
384 |     evaluate_ensembles()
385 | 
--------------------------------------------------------------------------------
/feature_based_original_dataset/evaluate_models.py:
--------------------------------------------------------------------------------
1 | import set_onehot_encoding as onehot
2 | import models
3 | import neural_network as NN
4 | import numpy as np
5 | import pickle
6 | import keras.optimizers
7 | from keras.models import load_model
8 | import tensorflow as tf
9 | import joblib
10 | 
11 | 
12 | def create_random_sets():
13 |     print("Generating TESTING set...")
14 |     testing_set = onehot.generate_set(testing_set_size, malware_ratio)  # generate random testing set
15 |     print("Generating TESTING input...")
16 |     test_data, test_labels = onehot.generate_input(testing_set, total_features)  # perform one-hot encoding
17 |     return test_data, test_labels
18 | 
19 | 
20 | def create_train_set():
21 |     training_set = []  # the list of training set apps
22 | 
23 |     with open("training_set_1500.txt", "r") as file:  # read training set file and append applications to list
24 |         for line in file:
25 |             line.strip()  # remove whitespace
26 |             line = line[:-1]  # remove \n
27 |             training_set.append(line)  # add item to list
28 |     print("Generating TRAINING input...")
29 |     data, labels = onehot.generate_input(training_set, total_features)  # perform one-hot encoding
30 | 
31 |     return data, labels
32 | 
33 | 
34 | def create_test_set():
35 |     testing_set = []  # the list of testing set apps
36 | 
37 |     with open("testing_set_1500.txt", "r") as file:  # read testing set file and append applications to list
38 |         for line in file:
39 |             line.strip()
40 |             line = line[:-1]
41 |             testing_set.append(line)
42 | 
43 |     print("Generating TESTING input...")
44 |     test_data, test_labels = onehot.generate_input(testing_set, total_features)  # perform one-hot encoding
45 |     return test_data, test_labels
46 | 
47 | 
48 | def evaluate_models(runs):
49 | 
50 |     for i in range(runs):
51 |         val_data, val_labels = create_random_sets()
52 | 
53 |         # loaded_model = joblib.load("models/model_GaussianNB.sav")
54 |         # GNB.test_gaussian_naive_bayes_classifier(loaded_model, val_data, val_labels)
55 | 
56 |         # loaded_model = joblib.load("models/model_MultinomialNB.sav")
57 |         # MNB.test_multi_naive_bayes_classifier(loaded_model, val_data, val_labels)
58 | 
59 |         # loaded_model = joblib.load("models/model_ComplementNB.sav")
60 |         # CNB.test_complement_naive_bayes_classifier(loaded_model, val_data, val_labels)
61 | 
62 |         # loaded_model = joblib.load("models/model_BernoulliNB.sav")
63 |         # BNB.test_bernoulli_naive_bayes_classifier(loaded_model, val_data, val_labels)
64 | 
65 |         # loaded_model = joblib.load("models/model_DecisionTreeClassifier.sav")
66 |         # DT.test_decision_tree_classifier(loaded_model, val_data, val_labels)
67 | 
68 |         # loaded_model = joblib.load("models/model_RandomForestClassifier.sav")
69 |         # RF.test_random_forest_classifier(loaded_model, val_data, val_labels)
70 | 
71 |         # loaded_model = joblib.load("models/model_LogisticRegression.sav")
72 |         # LR.test_logistic_regression_classifier(loaded_model, val_data, val_labels)
73 | 
74 |         # loaded_model = joblib.load("models/model_SVC.sav")
75 |         # SVM.test_svm_classifier(loaded_model, val_data, val_labels)
76 | 
77 |         loaded_model = load_model("models/best_model_DNN_Adam.h5")
78 |         NN.test_neural_network(loaded_model, val_data, val_labels)
79 | 
80 | 
81 |     # GNB.get_average_metrics(runs)
82 |     # MNB.get_average_metrics(runs)
83 |     # CNB.get_average_metrics(runs)
84 |     # BNB.get_average_metrics(runs)
85 |     # DT.get_average_metrics(runs)
86 |     # RF.get_average_metrics(runs)
87 |     # LR.get_average_metrics(runs)
88 |     # SVM.get_average_metrics(runs)
89 |     NN.get_average_metrics(runs)
90 | 
91 | 
92 | def evaluate_on_test_set():
93 |     val_data, val_labels = create_test_set()
94 | 
95 |     # loaded_model = joblib.load("models/model_GaussianNB.joblib")
96 |     # GNB.test_gaussian_naive_bayes_classifier(loaded_model, val_data, val_labels)
97 | 
98 |     # loaded_model = joblib.load("models/model_MultinomialNB.joblib")
99 |     # MNB.test_multi_naive_bayes_classifier(loaded_model, val_data, val_labels)
100 | 
101 |     # loaded_model = joblib.load("models/model_ComplementNB.joblib")
102 |     # CNB.test_complement_naive_bayes_classifier(loaded_model, val_data, val_labels)
103 | 
104 |     # loaded_model = joblib.load("models/model_BernoulliNB.joblib")
105 |     # BNB.test_bernoulli_naive_bayes_classifier(loaded_model, val_data, val_labels)
106 | 
107 |     # loaded_model = joblib.load("models/model_DecisionTreeClassifier.joblib")
108 |     # DT.test_decision_tree_classifier(loaded_model, val_data, val_labels)
109 | 
110 |     # loaded_model = joblib.load("models/model_RandomForestClassifier.joblib")
111 |     # RF.test_random_forest_classifier(loaded_model, val_data, val_labels)
112 | 
113 |     # loaded_model = joblib.load("models/KNearestNeighborsClassifier.joblib")
114 |     # KNN.test_knn_classifier(loaded_model, val_data, val_labels)
115 | 
116 |     # loaded_model = joblib.load("models/model_LogisticRegression.joblib")
117 |     # LR.test_logistic_regression_classifier(loaded_model, val_data, val_labels)
118 | 
119 |     # loaded_model = joblib.load("model_SVC.joblib")
120 |     # SVM.test_svm_classifier(loaded_model, val_data, val_labels)
121 | 
122 |     loaded_model = load_model("models/best_model_DNN_Adam.h5")
123 |     NN.test_neural_network(loaded_model, val_data, val_labels)
124 | 
125 | 
126 | if __name__ == "__main__":
127 |     total_features = 545333  # total unique features
128 |     testing_set_size = 1500  # set size used to create the random test set
129 |     malware_ratio = 0.3  # malware ratio in the set size
130 | 
131 |     print("Creating data-labels...")
132 |     onehot.create_list_of_apps()  # function from set_onehot_encoding.py
133 | 
134 |     # initialize sklearn models
135 |     GNB = models.GaussianNaiveBayes()
136 |     MNB = models.MultinomialNaiveBayes()
137 |     CNB = models.ComplementNaiveBayes()
138 |     BNB = models.BernoulliNaiveBayes()
139 |     DT = models.DecisionTree()
140 |     RF = models.RandomForest()
141 |     KNN = models.KNearestNeighbors()
142 |     LR = models.LogRegression()
143 |     SVM = models.SupportVectorMachine()
144 | 
145 |     val_runs = 8
146 | 
147 |     # evaluate_models(val_runs)
148 |     evaluate_on_test_set()
--------------------------------------------------------------------------------
/feature_based_original_dataset/fgsm.py:
--------------------------------------------------------------------------------
1 | import tensorflow as tf
2 | import numpy as np
3 | from sklearn.metrics import confusion_matrix
4 | import set_onehot_encoding as onehot
5 | import os
6 | 
7 | 
8 | def create_set():
9 |     if os.path.isfile("testing_set_1000.txt") is False:
10 |         set_size = 1000
11 |         malware_ratio = 0.3
12 |         print("Creating data-labels...")
13 |         print("Generating TESTING set...")
14 |         testing_set = onehot.generate_set(set_size, malware_ratio)  # generate random testing set
15 |         with open("testing_set_1000.txt", "w") as file:
16 |             for item in testing_set:
17 |                 file.write(str(item) + "\n")
18 |     testing_set = []  # the list of testing set apps
19 |     with open("testing_set_1000.txt", "r") as file:  # read testing set file and append applications to list
20 |         for line in file:
21 |             line.strip()
22 |             line = line[:-1]
23 |             testing_set.append(line)
24 |     print("Generating TESTING input...")
25 |     test_data, test_labels = onehot.generate_input(testing_set, total_features)  # perform one-hot encoding
26 |     return test_data, test_labels
27 | 
28 | 
29 | def create_adversarial_pattern(input_x, input_y):
30 |     """
31 |     FGSM attack as described in https://arxiv.org/pdf/1412.6572.pdf
32 |     The goal of FGSM is to cause the loss function to increase for specific inputs.
33 |     It operates by perturbing each feature of an input x by a small value to maximize the loss.
34 |     Steps:
35 |     1) Compute the gradient of the loss with respect to the input
36 |        ∇_x J(θ,x,y)
37 |        where x is the model's input, y the target class, θ the model's parameters, ∇_x the gradient and J(θ,x,y) the loss
38 |     2) Take the sign of the gradient (calculated in 1), multiply it by a small constant ε and add it to the
39 |        original input x:
40 |        x_adv = x + ε*sign(∇_x J(θ,x,y))
41 | 
42 |     :param input_x: the original input data
43 |     :param input_y: the original input label
44 |     :return: the sign of the gradient
45 |     """
46 |     with tf.GradientTape() as tape:
47 |         tape.watch(input_x)
48 |         prediction = trained_model(input_x)  # predict original input
49 |         loss = loss_object(input_y, prediction)  # get the loss
50 |     # get the gradients of the loss with respect to the inputs
51 |     gradient = tape.gradient(loss, input_x)
52 |     # get the sign of the gradients to create perturbations
53 |     signed_grad = tf.sign(gradient)
54 |     return signed_grad
55 | 
56 | 
57 | if __name__ == "__main__":
58 |     total_features = 545333  # total unique features
59 |     print("Creating data-labels...")
60 |     onehot.create_list_of_apps()  # function from set_onehot_encoding.py
61 |     # path to the saved model
62 |     trained_model = tf.keras.models.load_model('models_incremental_learning/model_Adam.h5')
63 | 
64 |     # create the testing input
65 |     val_data, val_labels = create_set()
66 | 
67 |     # loss function
68 |     loss_object = tf.keras.losses.SparseCategoricalCrossentropy()
69 |     val_data = tf.convert_to_tensor(val_data, dtype=np.float32)
70 |     val_labels = tf.convert_to_tensor(val_labels, dtype=np.int32)
71 | 
72 |     perturbations = create_adversarial_pattern(val_data, val_labels)  # get the sign of the gradient w.r.t. the input
73 | 
74 |     epsilons = [0, 0.01]  # 0 evaluates without FGSM, 0.01 with FGSM. Note: 0.01 is tiny, yet enough to fool the models!
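    # (aside, not part of the original pipeline) since the Drebin features are
    # binary, one could additionally round the perturbed vectors back to {0, 1}
    # so crafted samples remain valid one-hot inputs, e.g.:
    #   adv_x = tf.round(tf.clip_by_value(val_data + eps * perturbations, 0, 1))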
75 |     descriptions = [('Epsilon = {:0.3f}'.format(eps) if eps else 'Input')
76 |                     for eps in epsilons]
77 | 
78 |     for i, eps in enumerate(epsilons):
79 |         adv_x = val_data + eps * perturbations  # compute input_x + eps * adversarial examples as defined in FGSM
80 |         adv_x = tf.clip_by_value(adv_x, 0, 1)
81 |         prediction = trained_model.predict(adv_x)  # model prediction
82 |         confusion = confusion_matrix(val_labels, np.argmax(prediction, axis=1))  # confusion matrix
83 |         print(confusion)
84 |         # confusion matrix metrics
85 |         TP = confusion[1, 1]
86 |         TN = confusion[0, 0]
87 |         FP = confusion[0, 1]
88 |         FN = confusion[1, 0]
89 |         FNR = FN / float(FN + TP) * 100
90 |         FPR = FP / float(FP + TN) * 100
91 |         accuracy = (TP + TN) / float(TP + TN + FP + FN) * 100
92 |         print("Epsilon:", eps, "- FP:", FP, "- FN:", FN, "- TP:", TP, "- TN", TN)
93 |         print("Epsilon:", eps, "- Accuracy:", accuracy, "- FPR:", FPR, "- FNR:", FNR)
94 |         print("Misclassification Rate:", 100 - accuracy)
95 | 
--------------------------------------------------------------------------------
/feature_based_original_dataset/incremental_learning.py:
--------------------------------------------------------------------------------
1 | import set_onehot_encoding as onehot
2 | from sklearn.naive_bayes import MultinomialNB, ComplementNB
3 | import neural_network as NN
4 | import models
5 | import numpy as np
6 | import matplotlib.pyplot as plt
7 | import joblib
8 | 
9 | 
10 | def create_training_input():
11 |     print("Generating TRAINING set...")
12 |     training_set = onehot.generate_set_incremental(mini_batch_size, malware_ratio)  # choose random training set
13 |     print("Generating TRAINING input...")
14 |     data, labels = onehot.generate_input(training_set, total_features)  # perform one-hot encoding
15 |     return data, labels
16 | 
17 | 
18 | def create_testing_input():
19 |     testing_set = []  # the list of testing set apps
20 | 
21 |     with open("testing_set_1500.txt", "r") as file:  # read testing set file and append applications to list
22 |         for line in file:
23 |             line.strip()
24 |             line = line[:-1]
25 |             testing_set.append(line)
26 |     print("Generating TESTING input...")
27 |     test_data, test_labels = onehot.generate_input(testing_set, total_features)  # perform one-hot encoding
28 |     return test_data, test_labels
29 | 
30 | 
31 | def incremental_learn():
32 |     for j in range(batches):
33 |         data, labels = create_training_input()
34 | 
35 |         # incremental train and evaluate Multinomial Naive Bayes
36 |         # model = MNB.train_incremental(data, labels)
37 |         # MNB.evaluate_multi_naive_bayes_classifier(model, x_test, y_test)
38 | 
39 |         # incremental train and evaluate Complement Naive Bayes
40 |         # model = CNB.train_incremental(data, labels)
41 |         # CNB.evaluate_complement_naive_bayes_classifier(model, x_test, y_test)
42 | 
43 |         # incremental train and evaluate neural net
44 |         NN.train_neural_network(model, epochs, batch_size, data, labels)  # train neural network
45 |         NN.evaluate_neural_network(model, x_test, y_test)
46 | 
47 |     # filename = "model_incremental_" + type(MNB).__name__ + ".joblib"
48 |     # dump(model, filename)
49 |     # MNB.test_multi_naive_bayes_classifier(model, x_test, y_test)
50 | 
51 |     # filename = "model_incremental_" + type(CNB).__name__ + ".joblib"
52 |     # dump(model, filename)
53 |     # CNB.test_complement_naive_bayes_classifier(model, x_test, y_test)
54 | 
55 |     opt_config = model.optimizer.get_config()
56 | 
57 |     if 'name' not in opt_config.keys():
58 |         _name = str(model.optimizer.__class__).split('.')[-1].replace('\'', '').replace('>', '')
59 |         opt_config.update({'name': _name})
60 | 
61 |     model.save('model_' + opt_config['name'] + '.h5')
62 | 
63 |     NN.test_neural_network(model, x_test, y_test)
64 | 
65 | 
66 | if __name__ == "__main__":
67 |     total_features = 545333  # total unique features
68 |     mini_batch_size = 1000  # we will feed the classifier with mini batches of 1000.
69 |     # number of times that mini batches will be fed to the classifier (the total number of samples will be mini_batch_size * batches)
70 |     batches = 19
71 | 
72 |     testing_set_size = 1000  # set size used to create the random test set
73 | 
74 |     malware_ratio = 0.3  # malware ratio in the mini batch size
75 | 
76 |     training_data = []  # list of training batches
77 |     training_labels = []  # list of training labels
78 | 
79 |     MNB = models.MultinomialNaiveBayes()  # Multinomial Naive Bayes for incremental learning
80 |     CNB = models.ComplementNaiveBayes()  # Complement Naive Bayes for incremental learning
81 | 
82 |     units = [200, 200]
83 |     dropout = 0.2
84 |     epochs = 4
85 |     batch_size = 150
86 |     learn_rate = 0.001
87 |     kernel_initializer = 'glorot_uniform'
88 |     bias_initializer = 'zeros'
89 |     activation_function = 'relu'
90 |     model = NN.generate_neural_network(total_features, units, dropout, learn_rate, kernel_initializer,
91 |                                        bias_initializer, activation_function)
92 | 
93 |     print("Creating data-labels...")
94 |     onehot.create_list_of_apps()  # function from set_onehot_encoding.py
95 | 
96 |     x_test, y_test = create_testing_input()
97 |     incremental_learn()
98 | 
--------------------------------------------------------------------------------
/feature_based_original_dataset/jsma.py:
--------------------------------------------------------------------------------
1 | import tensorflow as tf
2 | import numpy as np
3 | from tensorflow import keras
4 | from sklearn.metrics import confusion_matrix
5 | import set_onehot_encoding as onehot
6 | import os
7 | import joblib
8 | import models
9 | 
10 | 
11 | def create_set():
12 |     if os.path.isfile("testing_set_1000.txt") is False:
13 |         set_size = 1000
14 |         malware_ratio = 0.3
15 |         print("Creating data-labels...")
16 |         print("Generating TESTING set...")
17 |         testing_set = onehot.generate_set(set_size, malware_ratio)  # generate random testing set
18 |         with open("testing_set_1000.txt", "w") as file:
19 |             for item in testing_set:
20 |                 file.write(str(item) + "\n")
21 |     testing_set = []  # the list of testing set apps
22 |     with open("testing_set_1000.txt", "r") as file:  # read testing set file and append applications to list
23 |         for line in file:
24 |             line.strip()
25 |             line = line[:-1]
26 |             testing_set.append(line)
27 |     print("Generating TESTING input...")
28 |     test_data, test_labels = onehot.generate_input(testing_set, total_features)  # perform one-hot encoding
29 |     return test_data, test_labels
30 | 
31 | 
32 | """
33 | functions to compute the Jacobian with numpy.
34 | https://medium.com/unit8-machine-learning-publication/computing-the-jacobian-matrix-of-a-neural-network-in-python-4f162e5db180
35 | First we specify the forward and backward passes of each layer to implement backpropagation manually.
36 | """
37 | 
38 | 
39 | def affine_forward(x, w, b):
40 |     """
41 |     Forward pass of an affine layer
42 |     :param x: input of dimension (I, )
43 |     :param w: weights matrix of dimension (I, O)
44 |     :param b: bias vector of dimension (O, )
45 |     :return output of dimension (O, ), and cache needed for backprop
46 |     """
47 |     out = np.dot(x, w) + b
48 |     cache = (x, w)
49 |     return out, cache
50 | 
51 | 
52 | def affine_backward(dout, cache):
53 |     """
54 |     Backward pass for an affine layer.
55 |     :param dout: Upstream Jacobian, of shape (M, O)
56 |     :param cache: Tuple of:
57 |       - x: Input data, of shape (I, )
58 |       - w: Weights, of shape (I, O)
59 |     :return the jacobian matrix containing derivatives of the M neural network outputs with respect to
60 |             this layer's inputs, evaluated at x, of shape (M, I)
61 |     """
62 |     x, w = cache
63 |     dx = np.dot(dout, w.T)
64 |     return dx
65 | 
66 | 
67 | def relu_forward(x):
68 |     """ Forward ReLU
69 |     """
70 |     out = np.maximum(np.zeros(x.shape), x)
71 |     cache = x
72 |     return out, cache
73 | 
74 | 
75 | def relu_backward(dout, cache):
76 |     """
77 |     Backward pass of ReLU
78 |     :param dout: Upstream Jacobian
79 |     :param cache: the cached input for this layer
80 |     :return: the jacobian matrix containing derivatives of the M neural network outputs with respect to
81 |              this layer's inputs, evaluated at x.
82 |     """
83 |     x = cache
84 |     dx = dout * np.where(x > 0, np.ones(x.shape), np.zeros(x.shape))
85 |     return dx
86 | 
87 | 
88 | def softmax_forward(x):
89 |     """ Forward softmax
90 |     """
91 |     exps = np.exp(x - np.max(x))
92 |     s = exps / exps.sum()
93 |     return s, s
94 | 
95 | 
96 | def softmax_backward(dout, cache):
97 |     """
98 |     Backward pass for softmax
99 |     :param dout: Upstream Jacobian
100 |     :param cache: contains the cache (in this case the output) for this layer
101 |     """
102 |     s = cache
103 |     ds = np.diag(s) - np.outer(s, s.T)
104 |     dx = np.dot(dout, ds)
105 |     return dx
106 | 
107 | 
108 | def get_activations(model, layer_id, X):
109 |     """
110 |     Computes outputs of intermediate layers
111 |     :param model: the trained model
112 |     :param layer_id: the id of the layer that we want the output from
113 |     :param X: input feature vector
114 |     :return: output of layer (layer_id)
115 |     """
116 |     intermediate_layer_model = keras.models.Model(inputs=model.input,
117 |                                                   outputs=model.layers[layer_id].output)
118 |     intermediate_output = intermediate_layer_model.predict(X)
119 |     return intermediate_output
120 | 
121 | 
122 | def forward_backward(model, x):
123 |     """
124 |     computes the forward derivative for the given input
125 |     :param model: the trained model
126 |     :param x: input feature vector
127 |     :return: prediction result and forward derivative
128 |     """
129 |     layer_to_cache = dict()  # for each layer, we store the cache needed for backward pass
130 |     forward_values = []
131 | 
132 |     for i in range(0, len(model.layers), 2):
133 |         values = {}
134 |         w, b = model.layers[i].get_weights()
135 |         values['w'] = w
136 |         values['b'] = b
137 |         forward_values.append(values)
138 | 
139 |     # Forward pass
140 |     a1, cache_a1 = affine_forward(x, forward_values[0]['w'], forward_values[0]['b'])
141 |     _, cache_r1 = relu_forward(a1)
142 |     r1 = get_activations(model, 0, x)
143 |     forward_values[0]['a'] = a1
144 |     forward_values[0]['cache_a'] = cache_a1
145 |     forward_values[0]['r'] = r1
146 |     forward_values[0]['cache_r'] = cache_r1
147 | 
148 |     for i, layer_index in zip(range(1, len(forward_values) - 1), range(2, len(model.layers), 2)):
149 |         a, cache_a = affine_forward(forward_values[i - 1]['r'], forward_values[i]['w'], forward_values[i]['b'])
150 |         _, cache_r = relu_forward(a)
151 |         r = get_activations(model, layer_index, x)
152 |         forward_values[i]['a'] = a
153 |         forward_values[i]['cache_a'] = cache_a
154 |         forward_values[i]['r'] = r
155 |         forward_values[i]['cache_r'] = cache_r
156 | 
157 |     a, cache_a = affine_forward(forward_values[len(forward_values) - 2]['r'],
158 |                                 forward_values[len(forward_values) - 1]['w'],
159 |                                 forward_values[len(forward_values) - 1]['b'])
160 |     forward_values[len(forward_values) - 1]['a'] = a
161 |     forward_values[len(forward_values) - 1]['cache_a'] = cache_a
162 |     out, cache_out = softmax_forward(a)
163 | 
164 |     # backward pass
165 |     dout = np.diag(np.ones(out.size, ))  # the derivatives of each output w.r.t. each output.
166 |     dout = softmax_backward(dout, cache_out)
167 |     dout = affine_backward(dout, forward_values[len(forward_values) - 1]['cache_a'])
168 | 
169 |     for i in range(len(forward_values) - 2, 0, -1):
170 |         dout = relu_backward(dout, forward_values[i]['cache_r'])
171 |         dout = affine_backward(dout, forward_values[i]['cache_a'])
172 | 
173 |     dout = relu_backward(dout, forward_values[0]['cache_r'])
174 |     dx = affine_backward(dout, forward_values[0]['cache_a'])
175 | 
176 |     return out, dx
177 | 
178 | 
179 | def craft_adversarial_samples(x, y, F, k):
180 |     """
181 |     JSMA variant for adversarial example crafting as described in https://arxiv.org/abs/1606.04435
182 |     JSMA iteratively selects the most useful features to perturb by a small amount until the target class is
183 |     achieved. The perturbed features are selected based on the saliency map. Saliency maps are used for network
184 |     visualization and describe which features are the most important for a particular output class. The goal
185 |     is to eliminate those attributes from a legitimate sample and bring up the most important ones for the target class
186 |     in order to cause the model to misclassify. This is done by pushing the features away from the original label
187 |     and closer to the target class.
188 |     Steps:
189 |     1) Compute the gradient of F with respect to the input X to estimate the direction in which a perturbation in X
190 |        would change F's output. That is, compute the forward derivative (the Jacobian of the learned function for
191 |        a legitimate sample):
192 |        ∇F(x) = ∂F(x)/∂x = [∂F_j(x)/∂x_i]_{i∈1…M, j∈1…N}
193 |        where x is the model's input, F is the network, F(x) the predicted class, M the input dimension,
194 |        N the output dimension, and entry (i, j) is the derivative of class j with respect to input feature i.
195 |        In essence, it computes the gradient of F with respect to input x to estimate the direction in which
196 |        a perturbation in x would change the output. In backpropagation, the forward derivative is calculated
197 |        with respect to the loss function and the gradients with respect to the network parameters with the goal of
198 |        updating the weights. On the contrary, in JSMA the forward derivative is taken with respect to the network
199 |        directly and the gradients with respect to the input data.
200 |     2) Choose a perturbation δ of X with maximal positive gradient into the target class y'.
201 |        In other words, choose the index that maximizes the change into the target class 0 by changing X_i.
202 |        The limitation is that we can only add features, not discard them, since in a real-world scenario an adversary doesn't want
203 |        to 'break' the functionality of an application.
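       Illustration (hypothetical numbers): if the Jacobian row for the target class is
       [0.2, -0.1, 0.7, 0.4] and features 0 and 2 are already set to 1 in x_adv, those
       entries are masked to 0, so i_max = 3 and feature 3 is the one flipped from 0 to 1.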
204 |     Algorithm:
205 |     Input x, y, F, k, I
206 |     x_adv <- x
207 |     Gamma = {1...|x|}
208 |     while arg max_j F_j(x_adv) != y and ||δ_x|| < k do
209 |         Compute the forward derivative ∇F(x_adv)
210 |         i_max = arg max_{j∈Γ∩I, X_j=0} ∂F_y(X)/∂X_j
211 |         if i_max <= 0 then
212 |             :return Failure
213 |         end if
214 |         x_adv_{i_max} = 1
215 |         δ_x <- x_adv - x
216 |     :return x_adv
217 |     :param x: input feature vector
218 |     :param y: target class
219 |     :param F: the trained model
220 |     :param k: maximum allowed distortion (upper bound on ||δ_x||_1)
221 |     :return: adversarial sample based on feature vector x
222 |     """
223 |     x_adv = x
224 |     gamma = [1] * len(x)
225 |     delta_x = [0]
226 |     changes = 0
227 | 
228 |     if np.argmax(F.predict(x_adv), 1) == 0:  # if misclassification achieved return adv_x
229 |         return x_adv, -1
230 | 
231 |     while np.argmax(F.predict(x_adv), 1) != y and np.linalg.norm(delta_x, ord=1) < k and changes < 20:
232 |         # compute forward derivative (Jacobian)
233 |         prob, forward_derivative = forward_backward(F, x_adv)
234 | 
235 |         tmp = np.multiply(forward_derivative[0], gamma)
236 |         for i, feature in enumerate(x_adv[0]):
237 |             if feature == 1:
238 |                 tmp[i] = 0
239 |         i_max = np.argmax(tmp)
240 |         if i_max <= 0:
241 |             raise ValueError('FAILURE: We can only add features to an application!')
242 | 
243 |         x_adv[0][i_max] = 1
244 |         delta_x = np.subtract(x_adv, x)
245 |         # print(i_max)
246 |         if i_max not in changes_dict:
247 |             changes_dict[i_max] = 1
248 |         else:
249 |             changes_dict[i_max] += 1
250 |         changes += 1
251 |         print("Changes:", changes)
252 | 
253 |     return x_adv, changes
254 | 
255 | 
256 | def evaluate_other_models():
257 |     """
258 |     Evaluate adversarial examples produced for one model against other models. Two examples are given: a neural
259 |     net with the SGD optimizer and complement naive Bayes.
260 |     """
261 |     second_trained_model = tf.keras.models.load_model('model_SGD.h5')
262 |     predictions = second_trained_model.predict(val_data)
263 |     confusion = confusion_matrix(val_labels, np.argmax(predictions, axis=1))
264 |     print(confusion)
265 |     TP = confusion[1, 1]
266 |     TN = confusion[0, 0]
267 |     FP = confusion[0, 1]
268 |     FN = confusion[1, 0]
269 |     FNR = FN / float(FN + TP) * 100
270 |     FPR = FP / float(FP + TN) * 100
271 |     accuracy = ((TP + TN) / float(TP + TN + FP + FN)) * 100
272 |     print("Adversarial FP:", FP, "- FN:", FN, "- TP:", TP, "- TN", TN)
273 |     print("Adversarial Accuracy:", accuracy, "- FPR:", FPR, "- FNR:", FNR)
274 | 
275 |     '''CNB = models.ComplementNaiveBayes()
276 |     second_trained_model = load("models_incremental_learning/model_incremental_ComplementNaiveBayes.joblib")
277 |     CNB.test_complement_naive_bayes_classifier(second_trained_model, val_data, val_labels)'''
278 | 
279 | 
280 | if __name__ == "__main__":
281 |     total_features = 545333  # total unique features
282 |     print("Creating data-labels...")
283 |     onehot.create_list_of_apps()  # function from set_onehot_encoding.py
284 | 
285 |     changes_dict = {}  # dictionary for perturbations (added features)
286 | 
287 | 
288 |     def fn(correct, predicted):
289 |         train_temp = 1
290 |         return tf.nn.softmax_cross_entropy_with_logits(labels=correct, logits=(predicted / train_temp))
291 | 
292 | 
293 |     # model trained on 1500 samples
294 |     trained_model = tf.keras.models.load_model('models/best_model_DNN_Adam.h5')
295 |     # incremental learned model
296 |     # trained_model = tf.keras.models.load_model('models_incremental_learning/model_Adam.h5')
297 |     # adversarial trained model
298 |     # trained_model = tf.keras.models.load_model('Adam_adversarial_training_adv_800_0.3.h5')
299 |     # distilled model
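    # (note) models saved with the custom distillation loss must be re-loaded with
    # custom_objects, as in the commented line below, so that Keras can resolve
    # the loss function 'fn' by name.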
299 | # distilled model 300 | #trained_model = tf.keras.models.load_model('defensive_distillation/distilled-100', custom_objects={'fn': fn}) 301 | 302 | 303 | averageChanges = 0 304 | 305 | val_data, val_labels = create_random_sets() # random evaluation set (function defined at the top of this file) 306 | # print(val_labels) 307 | predict_original = trained_model.predict(val_data) 308 | confusion = confusion_matrix(val_labels, np.argmax(predict_original, axis=1)) 309 | TP = confusion[1, 1] 310 | TN = confusion[0, 0] 311 | FP = confusion[0, 1] 312 | FN = confusion[1, 0] 313 | FNR_original = FN / float(FN + TP) * 100 314 | FPR = FP / float(FP + TN) * 100 315 | accuracy = ((TP + TN) / float(TP + TN + FP + FN)) * 100 316 | print(confusion) 317 | print("Original FP:", FP, "- FN:", FN, "- TP:", TP, "- TN", TN) 318 | print("Original Accuracy:", accuracy, "- FPR:", FPR, "- FNR:", FNR_original) 319 | del predict_original 320 | average_changes = 0 321 | amount_malwares = 0 322 | 323 | for i in range(len(val_data)): 324 | 325 | if val_labels[i] == 1: 326 | 327 | x = val_data[i:i + 1] 328 | #print("x: ", x) 329 | #print(x.shape) 330 | try: 331 | adv_x, changes = craft_adversarial_samples(x, 0, trained_model, 1) 332 | # print(adv_x) 333 | val_data[i] = adv_x 334 | if changes >= 0: 335 | average_changes += changes 336 | amount_malwares += 1 337 | except NameError: 338 | pass 339 | except ValueError: 340 | pass 341 | 342 | if amount_malwares > 0: 343 | averageChanges += (average_changes / float(amount_malwares)) 344 | #print(val_data.shape) 345 | 346 | # evaluate the model on adversarial examples 347 | predictions = trained_model.predict(val_data) 348 | confusion = confusion_matrix(val_labels, np.argmax(predictions, axis=1)) 349 | print(confusion) 350 | TP = confusion[1, 1] 351 | TN = confusion[0, 0] 352 | FP = confusion[0, 1] 353 | FN = confusion[1, 0] 354 | FNR = FN / float(FN + TP) * 100 355 | FPR = FP / float(FP + TN) * 100 356 | accuracy = ((TP + TN) / float(TP + TN + FP + FN)) * 100 357 | print("Adversarial FP:", FP, "- FN:", FN, "- TP:", TP, "- TN", TN) 358 | print("Adversarial Accuracy:", accuracy, "- FPR:", FPR, "- FNR:", FNR) 359 | print("Misclassification Rate:", FNR - FNR_original) 360 | print("Distortion:", averageChanges) 361 | print(changes_dict) 362 | 363 | '''adv_trained_model = tf.keras.models.load_model("Adam_adversarial_training_adv_100_0.7.h5") 364 | predictions = adv_trained_model.predict(val_data) 365 | confusion = confusion_matrix(val_labels, np.argmax(predictions, axis=1)) 366 | print(confusion) 367 | TP = confusion[1, 1] 368 | TN = confusion[0, 0] 369 | FP = confusion[0, 1] 370 | FN = confusion[1, 0] 371 | FNR = FN / float(FN + TP) * 100 372 | FPR = FP / float(FP + TN) * 100 373 | accuracy = ((TP + TN) / float(TP + TN + FP + FN)) * 100 374 | print("Adversarial FP:", FP, "- FN:", FN, "- TP:", TP, "- TN", TN) 375 | print("Adversarial Accuracy:", accuracy, "- FPR:", FPR, "- FNR:", FNR) 376 | print("Misclassification Rate:", 100 - accuracy) 377 | print("Distortion:", averageChanges) 378 | print(changes_dict)''' 379 | 380 | # evaluate adversarial examples produced by one model against other ML models 381 | #evaluate_other_models() 382 | 383 | 384 | 385 | -------------------------------------------------------------------------------- /feature_based_original_dataset/label_encoding.py: -------------------------------------------------------------------------------- 1 | """ 2 | This file maps features from an app to integer values, i.e. label encoding.
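Features whose type field is empty are skipped (the empty feature type was found via extract_feature_types.py).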
3 | We create a dictionary with all features present in the dataset as {feature:index}; 4 | for example, the feature android.hardware.touchscreen maps to 0. 5 | First we take every app from the feature_vectors directory and recreate every file 6 | of the dataset in another directory. Each new file contains the indexes in place of the features. 7 | For example, an app is represented as 6-0-2345-98776-1331110-1-45 8 | """ 9 | import os 10 | 11 | feature_vector = {} # dictionary with indexes mapped to features 12 | index = 0 # index value 13 | feature_vectors_dir = '../feature_vectors/' 14 | feature_indexes_dir = 'features_indexes/' 15 | if not os.path.exists(feature_indexes_dir): 16 | os.makedirs(feature_indexes_dir) 17 | 18 | not_assignable_feature_type = [''] # found from extract_feature_types.py 19 | 20 | print("Creating a dictionary that maps features to numeric values...") 21 | for filename in os.listdir(feature_vectors_dir): # read all apps 22 | with open(feature_vectors_dir + filename, "r") as file: # open an app 23 | for line in file: # read app line by line 24 | feature_type = line[:line.find('::')] # extract feature type 25 | feature = line.strip() # remove whitespace chars 26 | if feature_type not in not_assignable_feature_type: # check if feature type is '' 27 | # if a feature is not present in the feature vector, map feature to index and increment index 28 | if feature not in feature_vector: 29 | feature_vector[feature] = index 30 | index += 1 31 | 32 | print("Creating files with numeric values as features...") 33 | for filename in os.listdir(feature_vectors_dir): # recreate files with indexes 34 | with open(feature_vectors_dir + filename, "r") as file: # first open the original feature vectors 35 | f = open(feature_indexes_dir + filename, "a") # create a new file with the same SHA name in another dir 36 | for line in file: # read original feature vectors line by line 37 | feature_type = line[:line.find('::')] # extract feature type 38 | feature = line.strip() # remove whitespace chars 39 | if feature_type not in not_assignable_feature_type: # check if feature type is '' 40 | f.write(str(feature_vector[feature]) + '\n') # append the index of the feature to the new file 41 | f.close() 42 | print(str(feature_vector['feature::android.hardware.touchscreen'])) # 0 43 | print("Finished!") 44 | print("Total features in dataset: ", len(feature_vector)) # 545333 45 | 46 | ''' 47 | ffff64617c42e24fd1e572478279d547b834ef5e497f093ec59b3fb49ecec25f maps to 48 | 0-15597-15-16-17-15598-3297-18-15599-20-114-178-15600-21-458-23-87-10-25-15602-36-27-32 49 | -68-159-69-76-11-236-15603-216-782-43-3302-15604-66-2288-71-47-481598-79-415-289-13-15605 50 | -15606-74-28-15607-58-447-162-35-80-290-139-188-790-464-15608-419-15609-61-15610-293-2 51 | -37-38-39-78-40-84-26-3316-46-8-29-15611-44 52 | ''' 53 | -------------------------------------------------------------------------------- /feature_based_original_dataset/models.py: -------------------------------------------------------------------------------- 1 | from sklearn.naive_bayes import GaussianNB, MultinomialNB, ComplementNB, BernoulliNB 2 | from sklearn.tree import DecisionTreeClassifier 3 | from sklearn.ensemble import RandomForestClassifier 4 | from sklearn.neighbors import KNeighborsClassifier 5 | from sklearn.linear_model import LogisticRegression 6 | from sklearn import svm 7 | from sklearn.metrics import confusion_matrix 8 | from joblib import dump 9 | import timeit 10 | import numpy as np 11 | import os 12 | 13 | # init models 14 | GNB = GaussianNB() # 
Gaussian Naive Bayes 15 | MNB = MultinomialNB() # Multinomial Naive Bayes 16 | CNB = ComplementNB() # Complement Naive Bayes 17 | BNB = BernoulliNB() # Bernoulli Naive Bayes 18 | DT = DecisionTreeClassifier(criterion='gini', max_features=None, splitter='best') # Decision Tree 19 | RF = RandomForestClassifier(n_estimators=10, criterion='entropy', max_features='log2') # Random Forest 20 | KNN = KNeighborsClassifier(n_neighbors=3, weights='distance', metric='minkowski') # K Nearest Neighbors 21 | LR = LogisticRegression(solver='lbfgs', C=2.0, fit_intercept=True, max_iter=100) # Logistic Regression model 22 | SVM = svm.SVC(kernel='linear', C=0.5, gamma='scale', decision_function_shape='ovr') # Support Vector Machines 23 | # the parameters were found via the grid search procedure (models_grid_search.py). 24 | 25 | path = "models/" 26 | if not os.path.exists(path): 27 | os.mkdir(path) 28 | 29 | """ 30 | Each class defines a model for our classification task, containing four methods, namely 31 | train_, evaluate_, test_, get_average_metrics. 32 | In train_ methods we fit the models. 33 | :param features: train data 34 | :param labels: train labels 35 | :param save: save the model if True 36 | :return: 37 | In evaluate_ methods we get the accuracy on random test sets 38 | :param model: the classifier 39 | :param features: test data 40 | :param labels: test labels 41 | :return: 42 | Train and evaluate are used in train_random_subsampling.py and in train_models.py 43 | In test_ methods we evaluate our models 44 | :param test_features: validation data 45 | :param test_labels: validation labels 46 | :return: 47 | In get_average_metrics methods we get the average performance of each model, because we evaluate each model multiple times 48 | on unseen data. 49 | :param val_runs: times to evaluate a model 50 | Test and average metrics methods are used in evaluate_models.py 51 | """ 52 | 53 | 54 | class GaussianNaiveBayes: 55 | # metrics for the evaluation stage: FNR, FPR, accuracy 56 | average_FNR = 0 57 | average_FPR = 0 58 | average_accuracy = 0 59 | scores = [] # list of accuracy scores on the random testing sets 60 | 61 | @staticmethod 62 | def train_gaussian_naive_bayes_classifier(features, labels, save=False): 63 | print("\n\n--- Training", type(GNB).__name__, "---") 64 | start_time = timeit.default_timer() # timer 65 | model = GNB.fit(features, labels) # fit model on training set 66 | stop_time = timeit.default_timer() 67 | print(type(GNB).__name__, "training time: ", stop_time - start_time, "seconds\n\n") 68 | if save: # if defined, save the model 69 | print("Saving model...") 70 | filename = path + "model_" + type(GNB).__name__ + ".joblib" 71 | dump(model, filename) 72 | return model 73 | 74 | def evaluate_gaussian_naive_bayes_classifier(self, model, features, labels): 75 | print("\n\n--- Evaluating", type(GNB).__name__, "---") 76 | score = model.score(features, labels) # evaluate the model in the training stage 77 | print("Accuracy:", score * 100) 78 | self.scores.append(score) 79 | return self.scores 80 | 81 | def test_gaussian_naive_bayes_classifier(self, model, test_features, test_labels): 82 | print(type(GNB).__name__, "predicting...") 83 | start_time = timeit.default_timer() 84 | predicted = model.predict(test_features) # evaluate the model in the evaluation stage 85 | confusion = confusion_matrix(test_labels, predicted) # confusion matrix metrics 86 | print(confusion) 87 | TP = confusion[1, 1] 88 | TN = confusion[0, 0] 89 | FP = confusion[0, 1] 90 | FN = confusion[1, 0] 91 | FNR = FN / float(FN + TP) * 100 92 | FPR = FP / float(FP + TN) * 100 
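# FNR = FN / (FN + TP): fraction of malware misclassified as benign; FPR = FP / (FP + TN): fraction of benign apps flagged as malware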
93 | accuracy = (TP + TN) / float(TP + TN + FP + FN) * 100 94 | print("FP:", FP, "- FN:", FN, "- TP:", TP, "- TN", TN) 95 | print("Accuracy:", accuracy, "- FPR:", FPR, "- FNR:", FNR) 96 | 97 | stop_time = timeit.default_timer() 98 | print(type(GNB).__name__, "prediction time: ", stop_time - start_time, "seconds\n\n") 99 | self.average_FNR += FNR 100 | self.average_FPR += FPR 101 | self.average_accuracy += accuracy 102 | 103 | def get_average_metrics(self, val_runs): 104 | self.average_FNR = self.average_FNR/val_runs 105 | self.average_FPR = self.average_FPR/val_runs 106 | self.average_accuracy = self.average_accuracy/val_runs 107 | print("Average Accuracy:", self.average_accuracy, "- Average FPR:", self.average_FPR, 108 | "- Average FNR:", self.average_FNR) 109 | 110 | 111 | class MultinomialNaiveBayes: 112 | 113 | average_FNR = 0 114 | average_FPR = 0 115 | average_accuracy = 0 116 | scores = [] 117 | 118 | @staticmethod 119 | def train_multi_naive_bayes_classifier(features, labels, save=False): 120 | print("\n\n--- Training", type(MNB).__name__, "---") 121 | start_time = timeit.default_timer() 122 | model = MNB.fit(features, labels) 123 | stop_time = timeit.default_timer() 124 | print(type(MNB).__name__, "training time: ", stop_time - start_time, "seconds\n\n") 125 | if save: 126 | print("Saving model...") 127 | filename = path + "model_" + type(MNB).__name__ + ".joblib" 128 | dump(model, filename) 129 | return model 130 | 131 | def evaluate_multi_naive_bayes_classifier(self, model, features, labels): 132 | print("\n\n--- Evaluating", type(MNB).__name__, "---") 133 | score = model.score(features, labels) 134 | print("Accuracy:", score * 100) 135 | self.scores.append(score) 136 | return self.scores 137 | 138 | def test_multi_naive_bayes_classifier(self, model, test_features, test_labels): 139 | print(type(MNB).__name__, "predicting...") 140 | start_time = timeit.default_timer() 141 | predicted = model.predict(test_features) 142 | confusion = confusion_matrix(test_labels, predicted) 143 | print(confusion) 144 | TP = confusion[1, 1] 145 | TN = confusion[0, 0] 146 | FP = confusion[0, 1] 147 | FN = confusion[1, 0] 148 | FNR = FN / float(FN + TP) * 100 149 | FPR = FP / float(FP + TN) * 100 150 | accuracy = (TP + TN) / float(TP + TN + FP + FN) * 100 151 | print("FP:", FP, "- FN:", FN, "- TP:", TP, "- TN", TN) 152 | print("Accuracy:", accuracy, "- FPR:", FPR, "- FNR:", FNR) 153 | 154 | stop_time = timeit.default_timer() 155 | print(type(MNB).__name__, "prediction time: ", stop_time - start_time, "seconds\n\n") 156 | self.average_FNR += FNR 157 | self.average_FPR += FPR 158 | self.average_accuracy += accuracy 159 | 160 | def get_average_metrics(self, val_runs): 161 | self.average_FNR = self.average_FNR / val_runs 162 | self.average_FPR = self.average_FPR / val_runs 163 | self.average_accuracy = self.average_accuracy / val_runs 164 | print("Average Accuracy:", self.average_accuracy, "- Average FPR:", self.average_FPR, 165 | "- Average FNR:", self.average_FNR) 166 | 167 | @staticmethod 168 | def train_incremental(features, labels): 169 | print("\n\n--- Training", type(MNB).__name__, "---") 170 | start_time = timeit.default_timer() 171 | model = MNB.partial_fit(features, labels, classes=np.unique(labels)) 172 | stop_time = timeit.default_timer() 173 | print(type(MNB).__name__, "training time: ", stop_time - start_time, "seconds\n\n") 174 | return model 175 | 176 | 177 | class ComplementNaiveBayes: 178 | 179 | average_FNR = 0 180 | average_FPR = 0 181 | average_accuracy = 0 182 | scores = [] 183 | 
184 | @staticmethod 185 | def train_complement_naive_bayes_classifier(features, labels, save=False): 186 | print("\n\n--- Training", type(CNB).__name__, "---") 187 | start_time = timeit.default_timer() 188 | model = CNB.fit(features, labels) 189 | stop_time = timeit.default_timer() 190 | print(type(CNB).__name__, "training time: ", stop_time - start_time, "seconds\n\n") 191 | if save: 192 | print("Saving model...") 193 | filename = path + "model_" + type(CNB).__name__ + ".joblib" 194 | dump(model, filename) 195 | return model 196 | 197 | def evaluate_complement_naive_bayes_classifier(self, model, features, labels): 198 | print("\n\n--- Evaluating", type(CNB).__name__, "---") 199 | score = model.score(features, labels) 200 | print("Accuracy:", score * 100) 201 | self.scores.append(score) 202 | return self.scores 203 | 204 | def test_complement_naive_bayes_classifier(self, model, test_features, test_labels): 205 | print(type(CNB).__name__, "predicting...") 206 | start_time = timeit.default_timer() 207 | predicted = model.predict(test_features) 208 | confusion = confusion_matrix(test_labels, predicted) 209 | print(confusion) 210 | TP = confusion[1, 1] 211 | TN = confusion[0, 0] 212 | FP = confusion[0, 1] 213 | FN = confusion[1, 0] 214 | FNR = FN / float(FN + TP) * 100 215 | FPR = FP / float(FP + TN) * 100 216 | accuracy = (TP + TN) / float(TP + TN + FP + FN) * 100 217 | print("FP:", FP, "- FN:", FN, "- TP:", TP, "- TN", TN) 218 | print("Accuracy:", accuracy, "- FPR:", FPR, "- FNR:", FNR) 219 | 220 | stop_time = timeit.default_timer() 221 | print(type(CNB).__name__, "prediction time: ", stop_time - start_time, "seconds\n\n") 222 | self.average_FNR += FNR 223 | self.average_FPR += FPR 224 | self.average_accuracy += accuracy 225 | 226 | def get_average_metrics(self, val_runs): 227 | self.average_FNR = self.average_FNR / val_runs 228 | self.average_FPR = self.average_FPR / val_runs 229 | self.average_accuracy = self.average_accuracy / val_runs 230 | print("Average Accuracy:", self.average_accuracy, "- Average FPR:", self.average_FPR, 231 | "- Average FNR:", self.average_FNR) 232 | 233 | @staticmethod 234 | def train_incremental(features, labels): 235 | print("\n\n--- Training", type(CNB).__name__, "---") 236 | start_time = timeit.default_timer() 237 | model = CNB.partial_fit(features, labels, classes=np.unique(labels)) 238 | stop_time = timeit.default_timer() 239 | print(type(CNB).__name__, "training time: ", stop_time - start_time, "seconds\n\n") 240 | return model 241 | 242 | 243 | class BernoulliNaiveBayes: 244 | 245 | average_FNR = 0 246 | average_FPR = 0 247 | average_accuracy = 0 248 | scores = [] 249 | 250 | @staticmethod 251 | def train_bernoulli_naive_bayes_classifier(features, labels, save=False): 252 | print("\n\n--- Training", type(BNB).__name__, "---") 253 | start_time = timeit.default_timer() 254 | model = BNB.fit(features, labels) 255 | stop_time = timeit.default_timer() 256 | print(type(BNB).__name__, "training time: ", stop_time - start_time, "seconds\n\n") 257 | if save: 258 | print("Saving model...") 259 | filename = path + "model_" + type(BNB).__name__ + ".joblib" 260 | dump(model, filename) 261 | return model 262 | 263 | def evaluate_bernoulli_naive_bayes_classifier(self, model, features, labels): 264 | print("\n\n--- Evaluating", type(BNB).__name__, "---") 265 | score = model.score(features, labels) 266 | print("Accuracy:", score * 100) 267 | self.scores.append(score) 268 | return self.scores 269 | 270 | def test_bernoulli_naive_bayes_classifier(self, model, test_features, 
test_labels): 271 | print(type(BNB).__name__, "predicting...") 272 | start_time = timeit.default_timer() 273 | predicted = model.predict(test_features) 274 | confusion = confusion_matrix(test_labels, predicted) 275 | print(confusion) 276 | TP = confusion[1, 1] 277 | TN = confusion[0, 0] 278 | FP = confusion[0, 1] 279 | FN = confusion[1, 0] 280 | FNR = FN / float(FN + TP) * 100 281 | FPR = FP / float(FP + TN) * 100 282 | accuracy = (TP + TN) / float(TP + TN + FP + FN) * 100 283 | print("FP:", FP, "- FN:", FN, "- TP:", TP, "- TN", TN) 284 | print("Accuracy:", accuracy, "- FPR:", FPR, "- FNR:", FNR) 285 | 286 | stop_time = timeit.default_timer() 287 | print(type(BNB).__name__, "prediction time: ", stop_time - start_time, "seconds\n\n") 288 | self.average_FNR += FNR 289 | self.average_FPR += FPR 290 | self.average_accuracy += accuracy 291 | 292 | def get_average_metrics(self, val_runs): 293 | self.average_FNR = self.average_FNR / val_runs 294 | self.average_FPR = self.average_FPR / val_runs 295 | self.average_accuracy = self.average_accuracy / val_runs 296 | print("Average Accuracy:", self.average_accuracy, "- Average FPR:", self.average_FPR, 297 | "- Average FNR:", self.average_FNR) 298 | 299 | 300 | class DecisionTree: 301 | 302 | average_FNR = 0 303 | average_FPR = 0 304 | average_accuracy = 0 305 | scores = [] 306 | 307 | @staticmethod 308 | def train_decision_tree_classifier(features, labels, save=False): 309 | print("\n\n--- Training", type(DT).__name__, "---") 310 | start_time = timeit.default_timer() 311 | model = DT.fit(features, labels) 312 | stop_time = timeit.default_timer() 313 | print(type(DT).__name__, "training time: ", stop_time - start_time, "seconds\n\n") 314 | if save: 315 | print("Saving model...") 316 | filename = path + "model_" + type(DT).__name__ + ".joblib" 317 | dump(model, filename) 318 | return model 319 | 320 | def evaluate_decision_tree_classifier(self, model, features, labels): 321 | print("\n\n--- Evaluating", type(DT).__name__, "---") 322 | score = model.score(features, labels) 323 | print("Accuracy:", score * 100) 324 | self.scores.append(score) 325 | return self.scores 326 | 327 | def test_decision_tree_classifier(self, model, test_features, test_labels): 328 | print(type(DT).__name__, "predicting...") 329 | start_time = timeit.default_timer() 330 | predicted = model.predict(test_features) 331 | confusion = confusion_matrix(test_labels, predicted) 332 | print(confusion) 333 | TP = confusion[1, 1] 334 | TN = confusion[0, 0] 335 | FP = confusion[0, 1] 336 | FN = confusion[1, 0] 337 | FNR = FN / float(FN + TP) * 100 338 | FPR = FP / float(FP + TN) * 100 339 | accuracy = (TP + TN) / float(TP + TN + FP + FN) * 100 340 | print("FP:", FP, "- FN:", FN, "- TP:", TP, "- TN", TN) 341 | print("Accuracy:", accuracy, "- FPR:", FPR, "- FNR:", FNR) 342 | 343 | stop_time = timeit.default_timer() 344 | print(type(DT).__name__, "prediction time: ", stop_time - start_time, "seconds\n\n") 345 | self.average_FNR += FNR 346 | self.average_FPR += FPR 347 | self.average_accuracy += accuracy 348 | 349 | def get_average_metrics(self, val_runs): 350 | self.average_FNR = self.average_FNR / val_runs 351 | self.average_FPR = self.average_FPR / val_runs 352 | self.average_accuracy = self.average_accuracy / val_runs 353 | print("Average Accuracy:", self.average_accuracy, "- Average FPR:", self.average_FPR, 354 | "- Average FNR:", self.average_FNR) 355 | 356 | 357 | class RandomForest: 358 | 359 | average_FNR = 0 360 | average_FPR = 0 361 | average_accuracy = 0 362 | scores = [] 363 | 364 | 
@staticmethod 365 | def train_random_forest_classifier(features, labels, save=False): 366 | print("\n\n--- Training", type(RF).__name__, "---") 367 | start_time = timeit.default_timer() 368 | model = RF.fit(features, labels) 369 | stop_time = timeit.default_timer() 370 | print(type(RF).__name__, "training time: ", stop_time - start_time, "seconds\n\n") 371 | if save: 372 | print("Saving model...") 373 | filename = path + "model_" + type(RF).__name__ + ".joblib" 374 | dump(model, filename) 375 | return model 376 | 377 | def evaluate_random_forest_classifier(self, model, features, labels): 378 | print("\n\n--- Evaluating", type(RF).__name__, "---") 379 | score = model.score(features, labels) 380 | print("Accuracy:", score * 100) 381 | self.scores.append(score) 382 | return self.scores 383 | 384 | def test_random_forest_classifier(self, model, test_features, test_labels): 385 | print(type(RF).__name__, "predicting...") 386 | start_time = timeit.default_timer() 387 | predicted = model.predict(test_features) 388 | confusion = confusion_matrix(test_labels, predicted) 389 | print(confusion) 390 | TP = confusion[1, 1] 391 | TN = confusion[0, 0] 392 | FP = confusion[0, 1] 393 | FN = confusion[1, 0] 394 | FNR = FN / float(FN + TP) * 100 395 | FPR = FP / float(FP + TN) * 100 396 | accuracy = (TP + TN) / float(TP + TN + FP + FN) * 100 397 | print("FP:", FP, "- FN:", FN, "- TP:", TP, "- TN", TN) 398 | print("Accuracy:", accuracy, "- FPR:", FPR, "- FNR:", FNR) 399 | 400 | stop_time = timeit.default_timer() 401 | print(type(RF).__name__, "prediction time: ", stop_time - start_time, "seconds\n\n") 402 | self.average_FNR += FNR 403 | self.average_FPR += FPR 404 | self.average_accuracy += accuracy 405 | 406 | def get_average_metrics(self, val_runs): 407 | self.average_FNR = self.average_FNR / val_runs 408 | self.average_FPR = self.average_FPR / val_runs 409 | self.average_accuracy = self.average_accuracy / val_runs 410 | print("Average Accuracy:", self.average_accuracy, "- Average FPR:", self.average_FPR, 411 | "- Average FNR:", self.average_FNR) 412 | 413 | 414 | class KNearestNeighbors: 415 | 416 | average_FNR = 0 417 | average_FPR = 0 418 | average_accuracy = 0 419 | scores = [] 420 | 421 | @staticmethod 422 | def train_knn_classifier(features, labels, save=False): 423 | print("\n\n--- Training", type(KNN).__name__, "---") 424 | start_time = timeit.default_timer() 425 | model = KNN.fit(features, labels) 426 | stop_time = timeit.default_timer() 427 | print(type(KNN).__name__, "training time: ", stop_time - start_time, "seconds\n\n") 428 | if save: 429 | print("Saving model...") 430 | filename = path + "model_" + type(KNN).__name__ + ".joblib" 431 | dump(model, filename) 432 | return model 433 | 434 | def evaluate_knn_classifier(self, model, features, labels): 435 | print("\n\n--- Evaluating", type(KNN).__name__, "---") 436 | score = model.score(features, labels) 437 | print("Accuracy:", score * 100) 438 | self.scores.append(score) 439 | return self.scores 440 | 441 | def test_knn_classifier(self, model, test_features, test_labels): 442 | print(type(KNN).__name__, "predicting...") 443 | start_time = timeit.default_timer() 444 | predicted = model.predict(test_features) 445 | confusion = confusion_matrix(test_labels, predicted) 446 | print(confusion) 447 | TP = confusion[1, 1] 448 | TN = confusion[0, 0] 449 | FP = confusion[0, 1] 450 | FN = confusion[1, 0] 451 | FNR = FN / float(FN + TP) * 100 452 | FPR = FP / float(FP + TN) * 100 453 | accuracy = (TP + TN) / float(TP + TN + FP + FN) * 100 454 | print("FP:", 
FP, "- FN:", FN, "- TP:", TP, "- TN", TN) 455 | print("Accuracy:", accuracy, "- FPR:", FPR, "- FNR:", FNR) 456 | 457 | stop_time = timeit.default_timer() 458 | print(type(KNN).__name__, "prediction time: ", stop_time - start_time, "seconds\n\n") 459 | self.average_FNR += FNR 460 | self.average_FPR += FPR 461 | self.average_accuracy += accuracy 462 | 463 | def get_average_metrics(self, val_runs): 464 | self.average_FNR = self.average_FNR / val_runs 465 | self.average_FPR = self.average_FPR / val_runs 466 | self.average_accuracy = self.average_accuracy / val_runs 467 | print("Average Accuracy:", self.average_accuracy, "- Average FPR:", self.average_FPR, 468 | "- Average FNR:", self.average_FNR) 469 | 470 | 471 | class LogRegression: 472 | 473 | average_FNR = 0 474 | average_FPR = 0 475 | average_accuracy = 0 476 | scores = [] 477 | 478 | @staticmethod 479 | def train_logistic_regression_classifier(features, labels, save=False): 480 | print("\n\n--- Training", type(LR).__name__, "---") 481 | start_time = timeit.default_timer() 482 | model = LR.fit(features, labels) 483 | stop_time = timeit.default_timer() 484 | print(type(LR).__name__, "training time: ", stop_time - start_time, "seconds\n\n") 485 | if save: 486 | print("Saving model...") 487 | filename = path + "model_" + type(LR).__name__ + ".joblib" 488 | dump(model, filename) 489 | return model 490 | 491 | def evaluate_logistic_regression_classifier(self, model, features, labels): 492 | print("\n\n--- Evaluating", type(LR).__name__, "---") 493 | score = model.score(features, labels) 494 | print("Accuracy:", score * 100) 495 | self.scores.append(score) 496 | return self.scores 497 | 498 | def test_logistic_regression_classifier(self, model, test_features, test_labels): 499 | print(type(LR).__name__, "predicting...") 500 | start_time = timeit.default_timer() 501 | predicted = model.predict(test_features) 502 | confusion = confusion_matrix(test_labels, predicted) 503 | print(confusion) 504 | TP = confusion[1, 1] 505 | TN = confusion[0, 0] 506 | FP = confusion[0, 1] 507 | FN = confusion[1, 0] 508 | FNR = FN / float(FN + TP) * 100 509 | FPR = FP / float(FP + TN) * 100 510 | accuracy = (TP + TN) / float(TP + TN + FP + FN) * 100 511 | print("FP:", FP, "- FN:", FN, "- TP:", TP, "- TN", TN) 512 | print("Accuracy:", accuracy, "- FPR:", FPR, "- FNR:", FNR) 513 | 514 | stop_time = timeit.default_timer() 515 | print(type(LR).__name__, "prediction time: ", stop_time - start_time, "seconds\n\n") 516 | self.average_FNR += FNR 517 | self.average_FPR += FPR 518 | self.average_accuracy += accuracy 519 | 520 | def get_average_metrics(self, val_runs): 521 | self.average_FNR = self.average_FNR / val_runs 522 | self.average_FPR = self.average_FPR / val_runs 523 | self.average_accuracy = self.average_accuracy / val_runs 524 | print("Average Accuracy:", self.average_accuracy, "- Average FPR:", self.average_FPR, 525 | "- Average FNR:", self.average_FNR) 526 | 527 | 528 | class SupportVectorMachine: 529 | 530 | average_FNR = 0 531 | average_FPR = 0 532 | average_accuracy = 0 533 | scores = [] 534 | 535 | @staticmethod 536 | def train_svm_classifier(features, labels, save=False): 537 | print("\n\n--- Training", type(SVM).__name__, "---") 538 | start_time = timeit.default_timer() 539 | model = SVM.fit(features, labels) 540 | stop_time = timeit.default_timer() 541 | print(type(SVM).__name__, "training time: ", stop_time - start_time, "seconds\n\n") 542 | if save: 543 | print("Saving model...") 544 | filename = path + "model_" + type(SVM).__name__ + ".joblib" 545 | 
dump(model, filename) 546 | return model 547 | 548 | def evaluate_svm_classifier(self, model, features, labels): 549 | print("\n\n--- Evaluating", type(SVM).__name__, "---") 550 | score = model.score(features, labels) 551 | print("Accuracy:", score * 100) 552 | self.scores.append(score) 553 | return self.scores 554 | 555 | def test_svm_classifier(self, model, test_features, test_labels): 556 | print(type(SVM).__name__, "predicting...") 557 | start_time = timeit.default_timer() 558 | predicted = model.predict(test_features) 559 | confusion = confusion_matrix(test_labels, predicted) 560 | print(confusion) 561 | TP = confusion[1, 1] 562 | TN = confusion[0, 0] 563 | FP = confusion[0, 1] 564 | FN = confusion[1, 0] 565 | FNR = FN / float(FN + TP) * 100 566 | FPR = FP / float(FP + TN) * 100 567 | accuracy = (TP + TN) / float(TP + TN + FP + FN) * 100 568 | print("FP:", FP, "- FN:", FN, "- TP:", TP, "- TN", TN) 569 | print("Accuracy:", accuracy, "- FPR:", FPR, "- FNR:", FNR) 570 | 571 | stop_time = timeit.default_timer() 572 | print(type(SVM).__name__, "prediction time: ", stop_time - start_time, "seconds\n\n") 573 | self.average_FNR += FNR 574 | self.average_FPR += FPR 575 | self.average_accuracy += accuracy 576 | 577 | def get_average_metrics(self, val_runs): 578 | self.average_FNR = self.average_FNR / val_runs 579 | self.average_FPR = self.average_FPR / val_runs 580 | self.average_accuracy = self.average_accuracy / val_runs 581 | print("Average Accuracy:", self.average_accuracy, "- Average FPR:", self.average_FPR, 582 | "- Average FNR:", self.average_FNR) 583 | -------------------------------------------------------------------------------- /feature_based_original_dataset/models_grid_search.py: -------------------------------------------------------------------------------- 1 | """ 2 | This file performs grid search for 'classic' machine learning algorithms. 
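Each grid_* function below builds a param_grid dictionary for one classifier and evaluates it exhaustively with sklearn's GridSearchCV using 4-fold cross-validation, printing the best score and parameters.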
3 | """ 4 | import set_onehot_encoding as onehot 5 | import os 6 | from sklearn.linear_model import LogisticRegression 7 | from sklearn.ensemble import RandomForestClassifier 8 | from sklearn.neighbors import KNeighborsClassifier 9 | from sklearn.model_selection import GridSearchCV 10 | from sklearn import svm 11 | from sklearn.tree import DecisionTreeClassifier 12 | 13 | 14 | def grid_RF(): 15 | print("--- Random Forest ---") 16 | n_estimators = [10, 50, 100, 200] # number of trees 17 | criterion = ['gini', 'entropy'] # measurement for the quality of split 18 | max_features = ['sqrt', 'log2', None] # Number of features to consider at every split 19 | min_samples_split = [2, 5, 10] # Minimum number of samples required to split a node 20 | min_samples_leaf = [1, 2, 4] # Minimum number of samples required at each leaf node 21 | # Create the grid 22 | param_grid = dict(n_estimators=n_estimators, criterion=criterion, max_features=max_features, 23 | min_samples_split=min_samples_split, min_samples_leaf=min_samples_leaf) 24 | 25 | rf = RandomForestClassifier() # create the base model to tune 26 | # Use the grid search to search for best hyperparameters, using 3-fold cross validation 27 | rf_random = GridSearchCV(estimator=rf, param_grid=param_grid, cv=4, verbose=1, 28 | n_jobs=1) # Fit the grid search model 29 | grid_result = rf_random.fit(data, labels) 30 | print("Best: ", grid_result.best_score_, "using", grid_result.best_params_) # find the best hyperparameter 31 | 32 | 33 | def grid_KNN(): 34 | print("--- K Nearest Neighbors ---") 35 | n_neighbors = [3, 5, 10, 20, 50] # number of neighbors 36 | weights = ['uniform', 'distance'] # weight function to use in prediction 37 | metric = ['euclidean', 'manhattan', 'minkowski'] # distance metric to use 38 | 39 | param_grid = dict(n_neighbors=n_neighbors, weights=weights, metric=metric) 40 | 41 | knn = KNeighborsClassifier() 42 | 43 | knn_grid = GridSearchCV(estimator=knn, param_grid=param_grid, cv=4, n_jobs=-1) 44 | grid_result = knn_grid.fit(data, labels) 45 | print("Best: ", grid_result.best_score_, "using", grid_result.best_params_) 46 | 47 | 48 | def grid_LR(): 49 | print("--- Logistic Regression ---") 50 | C = [0.5, 1.0, 1.5, 2.0, 2.5] # regularization strength 51 | max_iter = [100, 110, 120, 130, 140] # maximum number of iterations 52 | fit_intercept = [True, False] # add a bias or not to the decision function 53 | 54 | param_grid = dict(max_iter=max_iter, C=C, fit_intercept=fit_intercept) 55 | 56 | lr = LogisticRegression(penalty="l2", solver="lbfgs") 57 | 58 | grid = GridSearchCV(estimator=lr, param_grid=param_grid, cv=4, n_jobs=-1, verbose=1) 59 | grid_result = grid.fit(data, labels) 60 | print("Best: ", grid_result.best_score_, "using", grid_result.best_params_) 61 | 62 | 63 | def grid_SVM(): 64 | print("--- Support Vector Machines ---") 65 | C = [0.25, 0.5, 1.0] # penalty parameter 66 | kernel = ['linear', 'rbf', 'poly'] # kernel type 67 | gamma = ['auto', 'scale'] # kernel coefficient 68 | decision_function_shape = ['ovo', 'ovr'] # one vs rest or one vs one 69 | 70 | param_grid = dict(C=C, kernel=kernel, gamma=gamma, decision_function_shape=decision_function_shape) 71 | 72 | SVM = svm.SVC() 73 | 74 | grid = GridSearchCV(estimator=SVM, param_grid=param_grid, cv=4, n_jobs=1, verbose=1) 75 | grid_result = grid.fit(data, labels) 76 | print("Best: ", grid_result.best_score_, "using", grid_result.best_params_) 77 | 78 | 79 | def grid_DT(): 80 | print("--- Decision Tree ---") 81 | criterion = ['gini', 'entropy'] # measurement for the quality of 
split 82 | splitter = ['best', 'random'] 83 | max_features = ['sqrt', 'log2', None] # Number of features to consider at every split 84 | min_samples_split = [2, 5, 10] # Minimum number of samples required to split a node 85 | min_samples_leaf = [1, 2, 4] # Minimum number of samples required at each leaf node 86 | 87 | param_grid = dict(criterion=criterion, splitter=splitter, max_features=max_features, 88 | min_samples_split=min_samples_split, min_samples_leaf=min_samples_leaf) 89 | 90 | DT = DecisionTreeClassifier() 91 | 92 | rf_random = GridSearchCV(estimator=DT, param_grid=param_grid, cv=4, n_jobs=1) 93 | grid_result = rf_random.fit(data, labels) 94 | print("Best: ", grid_result.best_score_, "using", grid_result.best_params_) 95 | 96 | 97 | if __name__ == "__main__": 98 | total_features = 545333 # total unique features 99 | set_size = 2000 # set size used to create the random training and testing sets 100 | malware_ratio = 0.3 # malware ratio in the set size 101 | 102 | onehot.create_list_of_apps() # function from set_onehot_encoding.py 103 | 104 | # check if a predefined training sample exists 105 | if os.path.isfile("training_set_2000.txt") is False: 106 | print("Creating data-labels...") 107 | print("Generating TRAINING set...") 108 | training_set = onehot.generate_set(set_size, malware_ratio) # generate random training set 109 | with open("training_set_2000.txt", "w") as file: 110 | for item in training_set: 111 | file.write(str(item) + "\n") 112 | 113 | training_set = [] # the list of training set 114 | 115 | with open("training_set_2000.txt", "r") as file: # read training set file and append applications to list 116 | for line in file: 117 | line = line.strip() # strip whitespace, including the trailing newline 118 | 119 | training_set.append(line) 120 | 121 | print("Generating TRAINING input...") 122 | data, labels = onehot.generate_input(training_set, total_features) # perform one-hot encoding 123 | print("Grid searching...") 124 | 125 | grid_RF() 126 | #grid_KNN() 127 | #grid_LR() 128 | #grid_SVM() 129 | #grid_DT() 130 | -------------------------------------------------------------------------------- /feature_based_original_dataset/neural_network.py: -------------------------------------------------------------------------------- 1 | from sklearn.metrics import confusion_matrix 2 | import timeit 3 | from keras import Sequential 4 | from keras.layers import Dense, Dropout 5 | import numpy as np 6 | import matplotlib.pyplot as plt 7 | from keras.callbacks import TensorBoard 8 | from keras.callbacks import EarlyStopping 9 | from keras.callbacks import ModelCheckpoint 10 | from keras.optimizers import Adam, SGD, RMSprop, Adagrad, Adadelta, Adamax, Nadam 11 | 12 | average_FNR = 0 13 | average_FPR = 0 14 | average_accuracy = 0 15 | 16 | 17 | def generate_neural_network(total_features, units, dropout, learn_rate, kernel, bias, activation_function): 18 | """ 19 | :param total_features: the total number of features (input_dim) used to train our network 20 | :param units: neurons in the hidden layers 21 | :param dropout: the dropout rate 22 | :param learn_rate: learning rate 23 | :param kernel: (kernel_initializer) weights initialization 24 | :param bias: (bias_initializer) bias initialization 25 | :param activation_function: activation function 26 | :return: 27 | """ 28 | model = Sequential() # neural net init 29 | """ 30 | add the input layer with 545333 input features, then 31 | hidden layers with the defined units, dropout rate, weight and bias initialization and 32 | relu activation, and a softmax output layer 33 | """ 
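# For example, with the defaults used in train_models.py (total_features=545333, units=[200, 200], dropout=0.2), this builds: Dense(200, relu) -> Dropout(0.2) -> Dense(200, relu) -> Dropout(0.2) -> Dense(2, softmax)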
34 | model.add(Dense(units=units[0], activation=activation_function, input_dim=total_features, kernel_initializer=kernel, 35 | bias_initializer=bias)) 36 | model.add(Dropout(dropout)) # add dropout rate 37 | 38 | for hidden_layer_units in units[1:]: # add hidden layers with the units defined in train_models.py 39 | model.add(Dense(units=hidden_layer_units, activation=activation_function, kernel_initializer=kernel, 40 | bias_initializer=bias)) 41 | model.add(Dropout(dropout)) 42 | 43 | model.add(Dense(2, activation="softmax")) # output layer, with softmax activation function and 2 neurons 44 | 45 | # loss: sparse categorical cross entropy, Optimizer: Adam 46 | model.compile(loss="sparse_categorical_crossentropy", 47 | optimizer=Adam(lr=learn_rate), 48 | metrics=["accuracy"]) 49 | 50 | """ 51 | information about the NN, such as the number of layers, the output shape, 52 | the number of weights in each layer and the total weights. 53 | """ 54 | #model.summary() 55 | 56 | # plot of the neural network graph 57 | #plot_model(model, to_file="figures/DNN_model_plot.png", show_shapes=True, show_layer_names=True) 58 | 59 | return model 60 | 61 | 62 | def train_neural_network(model, epochs, batch_size, features, labels, verbose=0, 63 | validation=False, val_data=None, val_labels=None, 64 | callbacks=False, plot_history=False, path="logs/fit/", model_name="DNN_200_200"): 65 | """ 66 | :param model: neural network model from generate_neural_network() 67 | :param epochs: number of epochs 68 | :param batch_size: batch size 69 | :param features: training data 70 | :param labels: training labels 71 | :param verbose: verbosity level 72 | :param validation: if True, evaluate on validation data during training 73 | :param val_data: validation data 74 | :param val_labels: validation labels 75 | :param callbacks: if True use the TensorBoard callback (plus EarlyStopping and ModelCheckpoint when validation=True) 76 | :param plot_history: if True plots accuracy and loss history per epoch 77 | :param path: directory for the TensorBoard logs 78 | :param model_name: prefix used for the log directory name 79 | :return: 80 | """ 81 | print("\n\n--- Training", type(model).__name__, "---") 82 | start_time = timeit.default_timer() 83 | 84 | # get the name of the optimizer in the defined model 85 | opt_config = model.optimizer.get_config() 86 | if 'name' not in opt_config.keys(): 87 | _name = str(model.optimizer.__class__).split('.')[-1].replace('\'', '').replace('>', '') 88 | opt_config.update({'name': _name}) 89 | 90 | if callbacks: 91 | # directory to save callbacks 92 | log_dir = path + model_name + opt_config['name'] 93 | # callbacks: TensorBoard, EarlyStopping, ModelCheckPoint 94 | # TensorBoard for storing visualizations of the neural net 95 | tensorboard_callback = TensorBoard(log_dir=log_dir, histogram_freq=1, write_graph=True, write_images=True) 96 | # EarlyStopping to monitor validation loss. If there is no improvement for 10 epochs, the training procedure stops 97 | early_stopping_callback = EarlyStopping(monitor='val_loss', mode='min', patience=10, verbose=verbose) 98 | # ModelCheckpoint to monitor validation accuracy. It stores the model with the highest validation accuracy 
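# Note: EarlyStopping and ModelCheckpoint monitor validation metrics, so they are only attached in the validation branch below; without validation data only the TensorBoard callback is used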
99 | model_checkpoint_callback = ModelCheckpoint('best_model_' + opt_config['name'] + '.h5', monitor='val_accuracy', mode='max', 100 | verbose=verbose, save_best_only=True) 101 | if not validation: 102 | # fit the model 103 | print("Note: Validation data is not included... Only the TensorBoard callback is used!") 104 | history = model.fit(features, labels, epochs=epochs, batch_size=batch_size, verbose=verbose, 105 | callbacks=[tensorboard_callback]) # train the neural network 106 | else: 107 | # fit the model 108 | history = model.fit(features, labels, epochs=epochs, batch_size=batch_size, verbose=verbose, 109 | validation_data=(val_data, val_labels), 110 | callbacks=[tensorboard_callback, early_stopping_callback, model_checkpoint_callback]) 111 | else: # train the model without the use of callbacks 112 | history = model.fit(features, labels, epochs=epochs, batch_size=batch_size, verbose=verbose) 113 | 114 | if plot_history: # plots the accuracy and loss per epoch 115 | if validation: 116 | # summarize history for training and validation accuracy 117 | plt.plot(history.history['accuracy']) 118 | plt.plot(history.history['val_accuracy']) 119 | plt.title('model accuracy') 120 | plt.ylabel('accuracy') 121 | plt.xlabel('epoch') 122 | plt.legend(['train', 'test'], loc='upper left') 123 | plt.show() 124 | # summarize history for training and validation loss 125 | plt.plot(history.history['loss']) 126 | plt.plot(history.history['val_loss']) 127 | plt.title('model loss') 128 | plt.ylabel('loss') 129 | plt.xlabel('epoch') 130 | plt.legend(['train', 'test'], loc='upper left') 131 | plt.show() 132 | else: 133 | # print(history.history.keys()) 134 | # summarize history for training accuracy 135 | plt.plot(history.history['accuracy']) 136 | plt.title('model accuracy') 137 | plt.ylabel('accuracy') 138 | plt.xlabel('epoch') 139 | plt.legend(['train'], loc='upper left') 140 | plt.show() 141 | # summarize history for training loss 142 | plt.plot(history.history['loss']) 143 | plt.title('model loss') 144 | plt.ylabel('loss') 145 | plt.xlabel('epoch') 146 | plt.legend(['train'], loc='upper left') 147 | plt.show() 148 | 149 | stop_time = timeit.default_timer() 150 | print(type(model).__name__, "training time: ", stop_time - start_time, "seconds\n\n") 151 | 152 | 153 | def evaluate_neural_network(model, features, labels): 154 | """ 155 | :param model: neural network model from generate_neural_network() 156 | :param features: test data 157 | :param labels: test labels 158 | :return: 159 | """ 160 | scores = model.evaluate(features, labels, verbose=0) 161 | print(model.metrics_names[1], "%.2f%%" % (scores[1] * 100)) 162 | return scores[1] * 100 163 | 164 | 165 | def test_neural_network(model, test_data, test_labels): 166 | """ 167 | :param model: neural network model from generate_neural_network() 168 | :param test_data: validation data 169 | :param test_labels: validation labels 170 | :return: 171 | """ 172 | global average_FNR, average_FPR, average_accuracy 173 | print(type(model).__name__, "predicting...") 174 | start_time = timeit.default_timer() 175 | predicted = model.predict(test_data) 176 | stop_time = timeit.default_timer() 177 | # print(predicted) 178 | # pick the class with the highest probability 179 | confusion = confusion_matrix(test_labels, np.argmax(predicted, axis=1)) # confusion matrix 180 | print(confusion) 181 | # confusion matrix metrics 182 | TP = confusion[1, 1] 183 | TN = confusion[0, 0] 184 | FP = confusion[0, 1] 185 | FN = confusion[1, 0] 186 | FNR = 
FN / float(FN + TP) * 100 187 | FPR = FP / float(FP + TN) * 100 188 | accuracy = (TP + TN) / float(TP + TN + FP + FN) * 100 189 | print("FP:", FP, "- FN:", FN, "- TP:", TP, "- TN", TN) 190 | print("Accuracy:", accuracy, "- FPR:", FPR, "- FNR:", FNR) 191 | print(type(model).__name__, "prediction time: ", stop_time - start_time, "seconds\n\n") 192 | average_FNR += FNR 193 | average_FPR += FPR 194 | average_accuracy += accuracy 195 | 196 | 197 | def get_average_metrics(val_runs): 198 | global average_FNR, average_FPR, average_accuracy 199 | average_FNR = average_FNR / val_runs 200 | average_FPR = average_FPR / val_runs 201 | average_accuracy = average_accuracy / val_runs 202 | print("Average Accuracy:", average_accuracy, "- Average FPR:", average_FPR, "- Average FNR:", average_FNR) 203 | -------------------------------------------------------------------------------- /feature_based_original_dataset/nn_grid_search.py: -------------------------------------------------------------------------------- 1 | import set_onehot_encoding as onehot 2 | import neural_network as NN 3 | from sklearn.model_selection import StratifiedKFold 4 | import numpy as np 5 | import os 6 | 7 | def tune_neural_network(): 8 | kfold = StratifiedKFold(n_splits=4, shuffle=True) # 4-fold cross-validation 9 | # neural net parameters 10 | units = [200, 200] # neurons in each hidden layer 11 | dropout = 0.2 # dropout rate 12 | epochs = 5 # epochs 13 | batch_size = 150 # batch size 14 | learn_rate = 0.001 15 | #momentum = 0.8 # to work with SGD 16 | kernel_initializer = 'normal' # weight init 17 | bias_initializer = 'normal' # bias init 18 | activation_function = 'relu' 19 | 20 | scores = [] 21 | 22 | for train, test in kfold.split(data, labels): # train on 3 folds, evaluate on 1 (4 runs in total) 23 | model = NN.generate_neural_network(total_features, units, dropout, learn_rate, kernel_initializer, 24 | bias_initializer, activation_function) 25 | 26 | NN.train_neural_network(model, epochs, batch_size, data[train], labels[train]) # train neural network 27 | 28 | score = NN.evaluate_neural_network(model, data[test], labels[test]) # evaluate neural net 29 | scores.append(score) 30 | 31 | print("Average accuracy: ", np.mean(scores), "Standard Deviation:", np.std(scores)) 32 | 33 | 34 | if __name__ == "__main__": 35 | total_features = 545333 # total unique features 36 | set_size = 2000 # set size used to create the random training set 37 | testing_set_size = 2000 # set size used to create the random test set 38 | malware_ratio = 0.3 # malware ratio in the set size 39 | 40 | onehot.create_list_of_apps() # function from set_onehot_encoding.py 41 | 42 | # check if predefined training and testing sets exist 43 | if os.path.isfile("training_set_2000.txt") is False and os.path.isfile("testing_set_2000.txt") is False: 44 | print("Creating data-labels...") 45 | print("Generating TRAINING set...") 46 | training_set = onehot.generate_set(set_size, malware_ratio) # generate random training set 47 | with open("training_set_2000.txt", "w") as file: 48 | for item in training_set: 49 | file.write(str(item) + "\n") 50 | 51 | print("Generating TESTING set...") 52 | testing_set = onehot.generate_set(testing_set_size, malware_ratio) # generate random testing set 53 | with open("testing_set_2000.txt", "w") as file: 54 | for item in testing_set: 55 | file.write(str(item) + "\n") 56 | 57 | training_set = [] # the list of training set 58 | testing_set = [] # the list of testing set 59 | 60 | with open("training_set_2000.txt", "r") as file: # read training set 
file and append applications to list 61 | for line in file: 62 | line = line.strip() # strip whitespace, including the trailing newline 63 | 64 | training_set.append(line) # add item to list 65 | with open("testing_set_2000.txt", "r") as file: # read testing set file and append applications to list 66 | for line in file: 67 | line = line.strip() 68 | 69 | testing_set.append(line) 70 | 71 | print("Generating TRAINING input...") 72 | data, labels = onehot.generate_input(training_set, total_features) # perform one-hot encoding 73 | print("Generating TESTING input...") 74 | test_data, test_labels = onehot.generate_input(testing_set, total_features) # perform one-hot encoding 75 | tune_neural_network() 76 | 77 | 78 | """ 79 | # use the code below for a full grid search if you have enough RAM: modify the tune_batch_epochs() method and comment out everything above 80 | import set_onehot_encoding as onehot 81 | from sklearn.model_selection import GridSearchCV 82 | from keras.wrappers.scikit_learn import KerasClassifier 83 | from keras.models import Sequential 84 | from keras.layers import Dense, Dropout 85 | import os 86 | total_features = 545333 # total unique features 87 | set_size = 2000 # set size used to create the random training and testing sets 88 | malware_ratio = 0.3 # malware ratio in the set size 89 | 90 | onehot.create_list_of_apps() # function from set_onehot_encoding.py 91 | 92 | if os.path.isfile("training_set.txt") is False and os.path.isfile("testing_set.txt") is False: 93 | print("Creating data-labels...") 94 | print("Generating TRAINING set...") 95 | training_set = onehot.generate_set(set_size, malware_ratio) # generate random training set 96 | with open("training_set.txt", "w") as file: 97 | for item in training_set: 98 | file.write(str(item) + "\n") 99 | 100 | print("Generating TESTING set...") 101 | testing_set = onehot.generate_set(set_size, malware_ratio) # generate random testing set 102 | with open("testing_set.txt", "w") as file: 103 | for item in testing_set: 104 | file.write(str(item) + "\n") 105 | 106 | training_set = [] 107 | 108 | with open("training_set.txt", "r") as file: 109 | for line in file: 110 | line = line.strip() 111 | 112 | training_set.append(line) 113 | 114 | print("Generating TRAINING input...") 115 | data, labels = onehot.generate_input(training_set, total_features) # perform one-hot encoding 116 | 117 | 118 | def create_model(): 119 | model = Sequential() 120 | 121 | model.add(Dense(units=200, activation="relu", input_dim=total_features)) 122 | model.add(Dropout(0.5)) # add dropout 123 | 124 | model.add(Dense(units=200, activation="relu")) 125 | model.add(Dropout(0.5)) 126 | 127 | model.add(Dense(2, activation="softmax")) # output layer, with softmax activation function and 2 neurons 128 | 129 | model.compile(loss="sparse_categorical_crossentropy", 130 | optimizer='adam', 131 | metrics=["accuracy"]) 132 | # loss: sparse categorical cross entropy, Adam optimizer 133 | model.summary() 134 | return model 135 | 136 | 137 | def tune_batch_epochs(): 138 | 139 | model = KerasClassifier(build_fn=create_model, verbose=1) 140 | 141 | epochs = [5, 10, 15, 20] 142 | batch_size = [50, 100, 128, 200] 143 | optimizer = ['SGD', 'RMSprop', 'Adagrad', 'Adadelta', 'Adam', 'Adamax', 'Nadam'] 144 | learn_rate = [0.001, 0.01, 0.1, 0.2, 0.3] 145 | # momentum = [0.0, 0.2, 0.4, 0.6, 0.8, 0.9] # to work with SGD 146 | kernel_initializer = ['uniform', 'lecun_uniform', 'normal', 'zero', 'glorot_normal', 'glorot_uniform', 'he_normal', 'he_uniform'] # weight init 147 | param_grid = 
dict(batch_size=batch_size, epochs=epochs, optimizer=optimizer, learn_rate=learn_rate, kernel_initializer=kernel_initializer) 148 | 149 | grid = GridSearchCV(estimator=model, param_grid=param_grid, n_jobs=1, cv=3) 150 | grid_result = grid.fit(data, labels) 151 | 152 | print("Best: ", grid_result.best_score_, "using", grid_result.best_params_) 153 | means = grid_result.cv_results_['mean_test_score'] 154 | stds = grid_result.cv_results_['std_test_score'] 155 | params = grid_result.cv_results_['params'] 156 | for mean, stdev, param in zip(means, stds, params): 157 | print(mean, stdev, "with", param) 158 | 159 | 160 | tune_batch_epochs() 161 | """ 162 | -------------------------------------------------------------------------------- /feature_based_original_dataset/set_onehot_encoding.py: -------------------------------------------------------------------------------- 1 | import csv 2 | from random import randint, shuffle 3 | import os 4 | import numpy as np 5 | 6 | csv_malware = "../sha256_family.csv" # csv file with malware apps 7 | feature_index_dir = 'features_indexes/' # directory with indexed features for all apps 8 | 9 | malware = [] 10 | benign = [] 11 | 12 | 13 | def create_list_of_apps(): 14 | print("Creating list of malicious apps...") 15 | with open(csv_malware, 'r') as file: # open malware csv file 16 | next(file) # skip the header line 17 | reader = csv.reader(file, delimiter=',') # read the csv malware families 18 | for row in reader: 19 | malware.append(row[0]) # append every row from the csv file into a list 20 | print("Malware apps found: ", len(malware)) # 5560 21 | print("Malware sample: ", malware[randint(0, len(malware) - 1)]) # print a random malware sample 22 | 23 | print("Creating list of benign apps...") 24 | for filename in os.listdir(feature_index_dir): # read all apps 25 | if filename not in malware: # if a SHA name is not in the malware list, append it to the benign list 26 | benign.append(filename) 27 | print("Benign apps found: ", len(benign)) # 123453 28 | print("Benign app sample: ", benign[randint(0, len(benign) - 1)], ) # print a random benign app 29 | 30 | print("Total apps (Benign & Malicious) found: ", len(malware) + len(benign)) # 129013 31 | 32 | 33 | malware_incremental_counter = 0 34 | benign_incremental_counter = 0 35 | 36 | 37 | def generate_set_incremental(set_size, malware_ratio): 38 | global malware_incremental_counter, benign_incremental_counter 39 | set = [] # list that will be filled with the app set 40 | 41 | print("Creating set with", set_size, "samples...") 42 | print("Malware ratio:", int(malware_ratio * 100), "%, totaling", int(set_size * malware_ratio), "apps in", set_size) 43 | print("Creating malware set...") 44 | 45 | while len(set) < (set_size * malware_ratio): 46 | app = malware[malware_incremental_counter] # take the next malware sample in order 47 | malware_incremental_counter += 1 48 | if malware_incremental_counter >= 5560: # 5560: total number of malware apps 49 | break 50 | if app not in set: 51 | set.append(app) # append malware to set list 52 | 53 | print("Total malware apps in set: ", len(set)) 54 | print("Malware sample in set: ", set[0]) 55 | 56 | print("Creating benign set...") 57 | 58 | while len(set) < set_size: 59 | app = benign[benign_incremental_counter] # take the next benign sample in order 60 | benign_incremental_counter += 1 61 | if benign_incremental_counter >= 123453: # 123453: total number of benign apps 62 | break 63 | if app not in set: 64 | set.append(app) # append benign to set list 65 | print(malware_incremental_counter) 66 | print("Total apps (malicious and benign) 
in set: ", len(set)) 67 | return set 68 | 69 | 70 | def generate_set(set_size, malware_ratio): 71 | set = [] # list that will fill with app set 72 | 73 | print("Creating set with", set_size, "samples...") 74 | print("Malware ratio:", int(malware_ratio * 100), "%, totaling", int(set_size * malware_ratio), "apps in", set_size) 75 | print("Creating malware set...") 76 | 77 | while len(set) < (set_size * malware_ratio): 78 | index = randint(0, len(malware) - 1) # choose a random index between (0,5559) 79 | app = malware[index] # locate malware based on random index in malware list 80 | if app not in set: 81 | set.append(app) # append malware to set list 82 | 83 | print("Total malware apps in set: ", len(set)) 84 | print("Malware sample in set: ", set[0]) 85 | 86 | print("Creating benign set...") 87 | while len(set) < set_size: 88 | index = randint(0, len(benign) - 1) # choose a random index between (0,129012) 89 | app = benign[index] # locate benign based on random index in benign list 90 | if app not in set: 91 | set.append(app) # append benign to set list 92 | 93 | print("Total apps (malicious and benign) in set: ", len(set)) 94 | return set 95 | 96 | 97 | def generate_input(set, total_features): 98 | print("performing one hot encoding...") 99 | # return a 2D array filled with zeros that will be used for the features of each app 100 | data = np.zeros((len(set), total_features), dtype=float) 101 | # return an array filled with zeros that will be used for the label of each app {0-benign 1-malicious} 102 | labels = np.zeros((len(set),), dtype=int) 103 | 104 | shuffle(set) # shuffle the set 105 | for id_app, app in enumerate(set): # iterate through set with a counter 106 | with open(feature_index_dir + app, 'r') as file: # open apps in set 107 | for index in file: # read line by line 108 | data[id_app][int(index)] = 1.0 # update corresponding element of the array with 1.0 109 | 110 | if app in malware: 111 | labels[id_app] = 1 # update corresponding label to 1 if it is malware 112 | else: 113 | labels[id_app] = 0 114 | 115 | #print(data) 116 | #print(labels) 117 | #print(data.shape) 118 | #print(labels.shape) 119 | return data, labels 120 | -------------------------------------------------------------------------------- /feature_based_original_dataset/train_models.py: -------------------------------------------------------------------------------- 1 | import set_onehot_encoding as onehot 2 | import models 3 | import neural_network as NN 4 | 5 | 6 | def create_sets(): 7 | training_set = [] # the list of training set 8 | testing_set = [] # the list of testing set 9 | 10 | with open("training_set_1500.txt", "r") as file: # read training set file and append applications to list 11 | for line in file: 12 | line.strip() # remove whitespace 13 | line = line[:-1] # remove \n 14 | training_set.append(line) # add item to list 15 | with open("testing_set_1500.txt", "r") as file: # read testing set file and append applications to list 16 | for line in file: 17 | line.strip() 18 | line = line[:-1] 19 | testing_set.append(line) 20 | print("Generating TRAINING input...") 21 | data, labels = onehot.generate_input(training_set, total_features) # perform one-hot encoding 22 | print("Generating TESTING input...") 23 | test_data, test_labels = onehot.generate_input(testing_set, total_features) # perform one-hot encoding 24 | return data, labels, test_data, test_labels 25 | 26 | 27 | def train_models(): 28 | data, labels, test_data, test_labels = create_sets() 29 | """ 30 | for classic machine learning model, e.g., 
Naive Bayes, Decision Tree, etc., we first fit the classifier and 31 | then evaluate it on the test set. The best hyperparameters found through the grid search procedure are defined 32 | in the models.py helper script. 33 | """ 34 | #model = GNB.train_gaussian_naive_bayes_classifier(data, labels, save=True) # train Naive Bayes 35 | #GNB.evaluate_gaussian_naive_bayes_classifier(model, test_data, test_labels) # test performance 36 | 37 | #model = MNB.train_multi_naive_bayes_classifier(data, labels, save=True) 38 | #MNB.evaluate_multi_naive_bayes_classifier(model, test_data, test_labels) 39 | 40 | #model = CNB.train_complement_naive_bayes_classifier(data, labels, save=True) 41 | #CNB.evaluate_complement_naive_bayes_classifier(model, test_data, test_labels) 42 | 43 | #model = BNB.train_bernoulli_naive_bayes_classifier(data, labels, save=True) 44 | #BNB.evaluate_bernoulli_naive_bayes_classifier(model, test_data, test_labels) 45 | 46 | #model = DT.train_decision_tree_classifier(data, labels, save=True) 47 | #DT.evaluate_decision_tree_classifier(model, test_data, test_labels) 48 | 49 | #model = RF.train_random_forest_classifier(data, labels, save=True) 50 | #RF.evaluate_random_forest_classifier(model, test_data, test_labels) 51 | 52 | #model = KNN.train_knn_classifier(data, labels, save=True) 53 | #KNN.evaluate_knn_classifier(model, test_data, test_labels) 54 | 55 | #model = LR.train_logistic_regression_classifier(data, labels, save=True) 56 | #LR.evaluate_logistic_regression_classifier(model, test_data, test_labels) 57 | 58 | #model = SVM.train_svm_classifier(data, labels, save=True) 59 | #SVM.evaluate_svm_classifier(model, test_data, test_labels) 60 | 61 | # init the neural net 62 | model = NN.generate_neural_network(total_features, units, dropout, learn_rate, kernel_initializer, 63 | bias_initializer, activation_function) 64 | """ 65 | train the neural network with the given model, epochs, batch size, and train data-labels. 66 | Specify the verbosity level, validation data, callbacks and plots (if needed). 67 | Default parameters: 68 | verbose=0, validation=False, val_data=None, val_labels=None, callbacks=False, plot_history=False 69 | example: 70 | NN.train_neural_network(model, epochs, batch_size, data, labels, verbose=0, 71 | validation=True, val_data=test_data, val_labels=test_labels, 72 | callbacks=True, plot_history=True) 73 | This is the main training stage, and thus we want to save the best models at the 'right time'. This is done by 74 | setting callbacks to True. Keras then monitors the validation loss for early stopping and saves the model with 75 | the highest validation accuracy.
76 | """ 77 | NN.train_neural_network(model, epochs, batch_size, data, labels, verbose=2, 78 | validation=True, val_data=test_data, val_labels=test_labels, 79 | callbacks=True) 80 | 81 | 82 | if __name__ == "__main__": 83 | total_features = 545333 # total unique features 84 | set_size = 1500 # set site that will be used to create random training set 85 | testing_set_size = 1500 # set site that will be used to create random test set 86 | malware_ratio = 0.3 # malware ratio in the set size 87 | 88 | print("Creating data-labels...") 89 | onehot.create_list_of_apps() # function from set_one_encoding.py 90 | 91 | # initialize sklearn models (classic machine learning) 92 | GNB = models.GaussianNaiveBayes() 93 | MNB = models.MultinomialNaiveBayes() 94 | CNB = models.ComplementNaiveBayes() 95 | BNB = models.BernoulliNaiveBayes() 96 | DT = models.DecisionTree() 97 | RF = models.RandomForest() 98 | KNN = models.KNearestNeighbors() 99 | LR = models.LogRegression() 100 | SVM = models.SupportVectorMachine() 101 | 102 | # neural net parameters 103 | units = [200, 200] 104 | dropout = 0.2 # dropout rate to avoid over fitting (Note that dropout alone is not efficient) 105 | epochs = 4 # set maximum epochs to 20. If callbacks are specified Keras will automatically stop the procedure 106 | batch_size = 150 # we found that the batch size of 150 fits better in our task 107 | learn_rate = 0.001 # specify the learning rate according to the optimizer used 108 | kernel_initializer = 'glorot_uniform' # weight initialization 109 | bias_initializer = 'zeros' # bias initialization 110 | activation_function = 'relu' # activation function 111 | 112 | train_models() 113 | -------------------------------------------------------------------------------- /feature_based_original_dataset/train_random_subsampling.py: -------------------------------------------------------------------------------- 1 | import set_onehot_encoding as onehot 2 | import models 3 | import neural_network as NN 4 | import numpy as np 5 | 6 | 7 | def create_random_sets(): 8 | print("Generating TRAINING set...") 9 | training_set = onehot.generate_set(set_size, malware_ratio) # generate random training set 10 | print("Generating TRAINING input...") 11 | data, labels = onehot.generate_input(training_set, total_features) # perform one-hot encoding 12 | print("Generating TESTING set...") 13 | testing_set = onehot.generate_set(testing_set_size, malware_ratio) # generate random testing set 14 | print("Generating TESTING input...") 15 | test_data, test_labels = onehot.generate_input(testing_set, total_features) # perform one-hot encoding 16 | return data, labels, test_data, test_labels # return train data - labels and test data - labels 17 | 18 | 19 | def random_sub_sampling(runs): 20 | score_gnb = [] # score for gaussian naive bayes 21 | score_mnb = [] # scores for multinomial naive bayes 22 | score_cnb = [] # scores for complement naive bayes 23 | score_bnb = [] # scores for bernoulli naive bayes 24 | score_dt = [] # scores for decision trees 25 | score_rf = [] # scores for random forest 26 | score_knn = [] # scores for k nearest neighbors 27 | score_lr = [] # scores for logistic regression 28 | score_svm = [] # scores for support vector machines 29 | score_nn = [] # scores for neural network 30 | 31 | for i in range(runs): 32 | 33 | data, labels, test_data, test_labels = create_random_sets() # choose random training and testing sets 34 | 35 | #model = GNB.train_gaussian_naive_bayes_classifier(data, labels) # train Gaussian Naive Bayes 36 | #score_gnb = 
GNB.evaluate_gaussian_naive_bayes_classifier(model, test_data, test_labels)) # evaluate performance 37 | 38 | #model = MNB.train_multi_naive_bayes_classifier(data, labels) # train Multinomial Naive Bayes 39 | #score_mnb.append(MNB.evaluate_multi_naive_bayes_classifier(model, test_data, test_labels)) 40 | 41 | #model = CNB.train_complement_naive_bayes_classifier(data, labels) # train Complement Naive Bayes 42 | #score_cnb.append(CNB.evaluate_complement_naive_bayes_classifier(model, test_data, test_labels)) 43 | 44 | #model = BNB.train_bernoulli_naive_bayes_classifier(data, labels) # train Bernoulli Naive Bayes 45 | #score_bnb.append(BNB.evaluate_bernoulli_naive_bayes_classifier(model, test_data, test_labels)) 46 | 47 | #model = DT.train_decision_tree_classifier(data, labels) # train Decision Tree Classifier 48 | #score_dt.append(DT.evaluate_decision_tree_classifier(model, test_data, test_labels)) 49 | 50 | #model = RF.train_random_forest_classifier(data, labels) # train Random Forest 51 | #score_rf.append(RF.evaluate_random_forest_classifier(model, test_data, test_labels)) 52 | 53 | #model = KNN.train_knn_classifier(data, labels) # train k-Nearest Neighbors Classifier 54 | #score_knn.append(KNN.evaluate_knn_classifier(model, test_data, test_labels)) 55 | 56 | #model = LR.train_logistic_regression_classifier(data, labels) # train Logistic Regression 57 | #score_lr.append(LR.evaluate_logistic_regression_classifier(model, test_data, test_labels)) 58 | 59 | #model = SVM.train_svm_classifier(data, labels) # train Support Vector Machines 60 | #score_svm.append(SVM.evaluate_svm_classifier(model, test_data, test_labels)) 61 | 62 | # init neural net 63 | model = NN.generate_neural_network(total_features, units, dropout, learn_rate, kernel_initializer, 64 | bias_initializer, activation_function) 65 | """ 66 | this is not the actual training procedure, so we don't want to save the models. To save models and use 67 | the early-stopping technique, refer to train_models.py. 68 | The goal of this operation is only to determine how the models behave on random training and random 69 | testing sets! 70 | So, we only train and evaluate the models.
71 | """ 72 | NN.train_neural_network(model, epochs, batch_size, data, labels, verbose=2) 73 | score = NN.evaluate_neural_network(model, test_data, test_labels) 74 | score_nn.append(score) 75 | 76 | # get average accuracy and standard deviation for each model for each model 77 | #print("NB Average accuracy: ", np.mean(score_gnb), "Standard Deviation:", np.std(score_gnb)) 78 | #print("MNB Average accuracy: ", np.mean(score_mnb), "Standard Deviation:", np.std(score_mnb)) 79 | #print("CNB Average accuracy: ", np.mean(score_cnb), "Standard Deviation:", np.std(score_cnb)) 80 | #print("BNB Average accuracy: ", np.mean(score_bnb), "Standard Deviation:", np.std(score_bnb)) 81 | #print("DT Average accuracy: ", np.mean(score_dt), "Standard Deviation:", np.std(score_dt)) 82 | #print("RF Average accuracy: ", np.mean(score_rf), "Standard Deviation:", np.std(score_rf)) 83 | #print("kNN Average accuracy: ", np.mean(score_knn), "Standard Deviation:", np.std(score_knn)) 84 | #print("LR Average accuracy: ", np.mean(score_lr), "Standard Deviation:", np.std(score_lr)) 85 | #print("SVM Average accuracy: ", np.mean(score_svm), "Standard Deviation:", np.std(score_svm)) 86 | print("NN Average accuracy: ", np.mean(score_nn), "Standard Deviation:", np.std(score_nn)) 87 | 88 | 89 | if __name__ == "__main__": 90 | total_features = 545333 # total unique features 91 | set_size = 1500 # set site that will be used to create random training set 92 | testing_set_size = 1500 # set site that will be used to create random test set 93 | malware_ratio = 0.3 # malware ratio in the set size 94 | 95 | print("Creating data-labels...") 96 | onehot.create_list_of_apps() # function from set_one_encoding.py 97 | 98 | # initialize sklearn models 99 | GNB = models.GaussianNaiveBayes() 100 | MNB = models.MultinomialNaiveBayes() 101 | CNB = models.ComplementNaiveBayes() 102 | BNB = models.BernoulliNaiveBayes() 103 | DT = models.DecisionTree() 104 | RF = models.RandomForest() 105 | KNN = models.KNearestNeighbors() 106 | LR = models.LogRegression() 107 | SVM = models.SupportVectorMachine() 108 | 109 | val_runs = 8 # number of times to train and test a model 110 | 111 | # neural net parameters 112 | units = [200, 200] # number of neurons in each layer (2 hidden layers) 113 | dropout = 0.001 # dropout rate 114 | epochs = 4 # epochs per iteration 115 | batch_size = 150 # batch size 116 | learn_rate = 0.001 # learning rate of the specified optimizer 117 | kernel_initializer = 'glorot_uniform' # weight initialization 118 | bias_initializer = 'zeros' # bias initialization 119 | activation_function = 'relu' # activation function in hidden layers (We use Softmax in the output layer) 120 | 121 | random_sub_sampling(val_runs) -------------------------------------------------------------------------------- /feature_based_reduced_dataset/README.md: -------------------------------------------------------------------------------- 1 | The process is similar to the feature_based_original_dataset. In the reduced feature space we introduce an adversarial sample detector. 2 | 3 | ### 1) Feature Reduction 4 | 5 | The huge variety of features in the dataset leads to a high dimensionality and as a result the data become sparse. The sparsity leads to vast computation cost making the whole dataset unavailable for models to process it in a single run. Moreover, this means that in the dataset present features which have a minimal importance for the final decision of a classifier. 
In the literature there exist algorithmic approaches to feature elimination and dimensionality reduction, but we stick with a manual/"regular" reduction due to the high computation cost. The largest feature class consists of URLs, and an adversary can change a network address without much effort. With the elimination of this class, the features decrease to 234,845, less than half of the original feature space. Furthermore, each Android application has activities, which are in essence its user interface. The user interface does not matter much for the classification task, as the names can be random and easily changed. Activities are contained in the Components class, with a total of 185,729 features; removing them reduces the feature space to 49,116. Moreover, some features are found to a large extent in both benign and malware applications, and as such they are not important for classification. These are only 6. Consequently, the feature space is reduced to 49,110 from the original 545,333 features. 6 | 7 | ``` 8 | python3 eliminate_low_high_support_features.py 9 | ``` 10 | ``` 11 | python3 eliminate_features.py 12 | ``` 13 | 14 | As for the applications themselves, we observed that there exist duplicates, 37,077 in particular. The duplicates we are referring to are applications that have exactly the same features. This does not mean that the applications are identical, but that they have similar functionality. Specifically for malicious applications, duplication may mean that these applications are variants of the original malware. By removing duplicate applications, the number of applications can be significantly reduced, and thus the algorithms can fit more applications into one run. Finally, we were able to reduce the dataset from 129,013 applications (5,560 malicious) to 91,936 applications (2,591 malicious), and from 545,333 unique features to 3,880. To find and remove the duplicate applications, we adjusted [this](http://www.davespace.co.uk/python/remove-duplicates.html) python script; a sketch of the idea is shown below. 15 |
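The adapted script is linked above rather than included in this folder. As a minimal sketch of the idea (the `feature_vectors/` path below is an assumption, not a path used by the scripts in this repository): two applications count as duplicates when their feature sets are identical, so we hash each app's sorted feature list and keep one representative per hash.

```
import os
import hashlib

feature_vectors_dir = 'feature_vectors/'  # assumed location of the per-app feature files

seen = {}        # maps feature-set digest -> first app seen with that feature set
duplicates = []  # apps whose feature set was already seen before

for filename in os.listdir(feature_vectors_dir):
    with open(feature_vectors_dir + filename, 'r') as file:
        # sort so that the order of features inside the file does not affect the digest
        features = sorted(line.strip() for line in file if line.strip())
    digest = hashlib.sha256('\n'.join(features).encode()).hexdigest()
    if digest in seen:
        duplicates.append(filename)  # same feature set as a previously seen app
    else:
        seen[digest] = filename

print("Unique apps:", len(seen), "- duplicates:", len(duplicates))
```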
16 | ### 2) Training, Evaluating, Crafting adversarial examples 17 | 18 | Follow the README file in the feature_based_original_dataset folder. 19 | 20 | ### 3) Defense by [detecting adversarial examples](https://arxiv.org/pdf/1702.06280.pdf) 21 | 22 | Another defensive approach, similar to adversarial training, is to detect adversarial examples before samples are fed into the main classification model. Our approach produces an external classifier that is able to classify samples as legitimate or adversarial. 23 | 24 | Steps for our model (a code sketch of steps 1-3 follows the evaluation paragraphs below): 25 | 1) Train a classifier F on the original data D={X,0}, labeling the whole training set as 0. 26 | 2) Craft adversarial examples A for F using the JSMA crafting method. 27 | 3) Train a new model F' on the augmented dataset X⋃A, labeling each adversarial example as 1. 28 | 4) Before a new sample is fed to the classifier F, it passes through the detector for classification. If the sample is recognized as adversarial, the process stops. 29 | 30 | We evaluate the classification accuracy of the newly generated classifier in the reduced feature space as an attempt to defend against the adversarial examples that deceive the model trained with the Adam optimizer. The original performance of the learned model on the reduced feature space is 98.42% (1.58% FNR). With the JSMA variant the model is completely destroyed, as the attack makes it unable to recognize any malicious application, increasing the FNR to 100%. Similar to adversarial training, there is no specific methodology to follow for mixing adversarial examples with legitimate samples at a particular ratio. 31 | 32 | ``` 33 | python3 detector.py 34 | ``` 35 | 36 | We begin by training a model only on the malware space, without the presence of any benign applications. Note that we use every malware sample both to craft adversarial examples and to train the adversarial detector. We expect the detector to be highly efficient at distinguishing adversarial examples from legitimate samples, but without the ability to accurately classify samples when benign and malware samples are mixed. Indeed, our detector reaches a training accuracy of 99.21% at epoch 15. This means that it may be able to classify adversarial samples with high probability. However, the detector is only trained on the malware space. To get a good estimate, we mix benign and malicious applications, craft adversarial examples for the original model, and evaluate the detector. As expected, the classification performance is not as high, achieving 83.7% with 16.3% FPR without the presence of an adversary. When adversarial examples are crafted, the accuracy decreases slightly to 83.5%, with 23.14% FPR and only 1% FNR. This means that only a few adversarial examples bypassed the detector. 37 | 38 | The FPR in our first evaluation of the detector trained only with malicious applications is quite high. Therefore, we hope that mixing benign with malicious applications, while crafting adversarial examples only for malware, will increase the overall accuracy. We draw a sample totaling 2000 applications, 600 of them malicious. This means that the detector is trained on 2000 legitimate samples and 600 adversarial ones. We re-implement the ModelCheckpoint callback to store only the best model in terms of accuracy. The model is stored at epoch 24, where the training accuracy is 98.58%. Next, we craft adversarial examples for the original model in a testing set totaling 8500 applications, where almost every malicious application is present (2,550 in the testing set), and we evaluate the performance of the trained detector. Surprisingly, the performance is extremely high both with and without the presence of adversarial examples. On legitimate applications the model achieves 99.72% accuracy (with 0.28% FPR), and in the presence of an adversary it achieves 98.1% accuracy with 5.46% FNR and 0.39% FPR. 39 | 40 | Training on a sample of the dataset results in extremely high performance: only 5% of the adversarial examples can bypass our security mechanism. However, this means that those 5% of adversarial malware will be mistakenly identified as benign by the main detector. We also evaluate whether training on a larger sample, with almost every malicious application present, can create a more efficient model. This can be described as an incremental procedure for the detector. As mentioned, incremental learning may give a false sense of performance, since the malicious applications in the wild are by no means covered. We train the detector with a set of 8500 applications, 2550 of them malicious. The detector achieves 99.27% accuracy in the training stage (with 0.45% FPR). In the testing stage it achieves an extremely high performance of 99.98% (only 2 legitimate applications are recognized as adversarial), and in the presence of adversarial examples it achieves 99.54% accuracy with only 1.49% FNR. Therefore, the efficiency of the detector is dramatically improved. The higher the performance of the adversarial detector, the better for our main model, as the detector can eliminate adversarial examples.
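As a rough illustration of steps 1-3 above, the sketch below wires together pieces that already exist in this folder (`set_onehot_encoding.py`, `neural_network.py`, `jsma.py`). It is only a sketch under assumptions: `detector.py` is the authoritative implementation, `best_model_Adam.h5` is the checkpoint name used by the training scripts here, and the set size mirrors the 2000-app experiment above.

```
import numpy as np
import tensorflow as tf
import set_onehot_encoding as onehot
import neural_network as NN
import jsma

total_features = 3880
onehot.create_list_of_apps()
jsma.changes_dict = {}  # craft_adversarial_samples records perturbed feature indexes here

# step 1: a mixed sample in which every app is labeled 0 ("legitimate") for the detector
train_set = onehot.generate_set(2000, 0.3)
data, orig_labels = onehot.generate_input(train_set, total_features)
det_labels = np.zeros(len(data), dtype=int)

# step 2: craft adversarial examples against the main classifier F
F = tf.keras.models.load_model('best_model_Adam.h5')
adv_rows, adv_labels = [], []
for i in range(len(data)):
    if orig_labels[i] == 1:  # craft only for malicious samples, as in jsma.py
        try:
            adv_x, _ = jsma.craft_adversarial_samples(data[i:i + 1].copy(), 0, F, 1)
            adv_rows.append(adv_x[0])
            adv_labels.append(1)  # adversarial examples are labeled 1
        except ValueError:
            pass  # crafting failed: no feature could be added for this sample

# step 3: train the detector F' on the augmented dataset X U A
aug_data = np.vstack([data, np.array(adv_rows)])
aug_labels = np.concatenate([det_labels, np.array(adv_labels)])
detector = NN.generate_neural_network(total_features, [200, 200], 0.2, 0.001,
                                      'glorot_uniform', 'zeros', 'relu')
NN.train_neural_network(detector, 20, 150, aug_data, aug_labels, verbose=2)
detector.save('external_detector.h5')  # the name loaded (commented out) in jsma.py
```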
41 | 42 | We showed that training a second classifier to distinguish original samples from adversarial ones can be used as a defensive mechanism. Only the applications classified as legitimate are passed to the main detector for classification. However, since an intelligent attacker can deceive the main classifier, it will also be easy to deceive the adversarial detector. If the adversary is aware of the external classifier, its goal is to produce adversarial examples that are classified as legitimate. The procedure is similar to crafting adversarial examples for the main model: the goal is to take an adversarial sample that is correctly classified as adversarial and have it classified as legitimate. As such, we craft adversarial examples for the adversarial detector on a test set of 2000 applications. Its original performance is 100% without the presence of an adversary and 99.5% (1.67% FNR) on adversarial examples produced for the main classifier. As expected, when crafting adversarial examples for the detector itself, the classifier is almost completely fooled. Its accuracy drops to 70.2%, with 99.33% FNR and an average of 9 perturbations in the feature space. 43 | -------------------------------------------------------------------------------- /feature_based_reduced_dataset/count_feature_variance.py: -------------------------------------------------------------------------------- 1 | import os 2 | import csv 3 | import pandas as pd 4 | feature_vector = {} # dictionary with indexes mapped to features 5 | index = 0 # index value 6 | 7 | feature_vectors_dir = '../new_feature_vectors/' 8 | new_feature_vectors_dir = 'more_new_feature_vectors/' 9 | csv_file = "../feature_variances.csv" 10 | 11 | if not os.path.exists(new_feature_vectors_dir): 12 | os.makedirs(new_feature_vectors_dir) 13 | 14 | 15 | def export_to_csv(): 16 | not_assignable_feature_type = [''] # found from extract_feature_types.py 17 | features_variance = {} 18 | 19 | for filename in os.listdir(feature_vectors_dir): # read all apps 20 | with open(feature_vectors_dir + filename, "r") as file: # open an app 21 | for line in file: # read the app line by line 22 | 23 | line = line.strip() # remove whitespace chars 24 | if line not in not_assignable_feature_type: # skip the empty '' feature type 25 | if line not in features_variance: 26 | features_variance[line] = 1 27 | else: 28 | features_variance[line] += 1 # count in how many apps each feature occurs 29 | 30 | print(len(features_variance)) 31 | 32 | with open(csv_file, 'w', newline="") as csvfile: 33 | writer = csv.writer(csvfile) 34 | writer.writerow(["Feature", "Variance"]) 35 | for key, value in features_variance.items(): 36 | writer.writerow([key, value]) 37 | 38 | def export_eliminated_csv(): 39 | df = pd.read_csv(csv_file) 40 | #print(df.head(3)) 41 | #print(df.sort_values('Variance')) 42 | keep = df[df.Variance >= 5] # keep the features that occur in at least 5 apps 43 | print(len(keep)) 44 | keep.to_csv("eliminated_variance.csv", index=None, header=True) # the features that survive the elimination 45 | 46 | 47 | 48 | if __name__ == "__main__": 49 | #export_to_csv() 50 | export_eliminated_csv() 51 | 52 | 53 | 54 | 55 | 56 | 57 | 58 | 59 | 60 | 61 | -------------------------------------------------------------------------------- /feature_based_reduced_dataset/eliminate_features.py: -------------------------------------------------------------------------------- 1 | import os 2 | import pandas as pd 3 | feature_vector = {} # dictionary with indexes mapped to features 4 | index = 0 #
index value 5 | feature_vectors_dir = '../eliminated_apps_feature_vectors/' 6 | new_feature_vectors_dir = 'new_feature_vectors/' 7 | if not os.path.exists(new_feature_vectors_dir): 8 | os.makedirs(new_feature_vectors_dir) 9 | 10 | print("Eliminating features with low variance...") 11 | 12 | df = pd.read_csv('eliminated_variance.csv') 13 | column = df[df.columns[0]] 14 | features = set(column.tolist()) # use a set for fast membership tests 15 | print(features) 16 | 17 | 18 | for filename in os.listdir(feature_vectors_dir): # read all apps 19 | with open(feature_vectors_dir + filename, "r") as file, \ 20 | open(new_feature_vectors_dir + filename, "a") as f: # create a new file with the same SHA name in another dir 21 | for line in file: # read the app line by line 22 | feature = line.strip() 23 | if feature in features: # keep only the features that survived the variance elimination 24 | f.write(feature + "\n") 25 | print("finished!") 26 | -------------------------------------------------------------------------------- /feature_based_reduced_dataset/eliminate_low_high_support_features.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | feature_vector = {} # dictionary with indexes mapped to features 4 | index = 0 # index value 5 | feature_vectors_dir = 'eliminated_apps_feature_vectors/' 6 | new_feature_vectors_dir = 'new_feature_vectors/' 7 | if not os.path.exists(new_feature_vectors_dir): 8 | os.makedirs(new_feature_vectors_dir) 9 | 10 | not_assignable_feature_type = [''] # found from extract_feature_types.py 11 | 12 | print("Eliminating urls & activities...") 13 | for filename in os.listdir(feature_vectors_dir): # read all apps 14 | with open(feature_vectors_dir + filename, "r") as file, \ 15 | open(new_feature_vectors_dir + filename, "a") as f: # create a new file with the same SHA name in another dir 16 | for line in file: # read the app line by line 17 | feature_type = line[:line.find('::')] # extract the feature type 18 | feature = line.strip() # remove whitespace chars 19 | if feature_type not in not_assignable_feature_type: # skip the empty '' feature type 20 | if feature_type != "url" and feature_type != "activity" and \ 21 | feature != "feature::android.hardware.touchscreen" and \ 22 | feature != "intent::android.intent.action.MAIN" and \ 23 | feature != "intent::android.intent.category.LAUNCHER" and \ 24 | feature != "call::getSystemService" and \ 25 | feature != "real_permission::android.permission.INTERNET" and \ 26 | feature != "permission::android.permission.INTERNET": 27 | 28 | f.write(feature + "\n") 29 | 30 | print("finished!") 31 | -------------------------------------------------------------------------------- /feature_based_reduced_dataset/jsma.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | import numpy as np 3 | from tensorflow import keras 4 | from sklearn.metrics import confusion_matrix 5 | import set_onehot_encoding as onehot 6 | import os 7 | import joblib 8 | import models 9 | 10 | 11 | def create_set(): 12 | if not os.path.isfile("training_set_8500.txt"): 13 | set_size = 8500 14 | malware_ratio = 0.3 15 | print("Creating data-labels...") 16 | print("Generating TESTING set...") 17 | testing_set = onehot.generate_set(set_size, malware_ratio) # generate a random testing set 18 | with open("training_set_8500.txt", "w") as file: 19 | for item in testing_set: 20 | file.write(str(item) + "\n") 21 | testing_set = [] # the list for the testing set 22 | with open("training_set_8500.txt", "r") as file: # read the saved set file and append applications to the
list 23 | for line in file: 24 | line = line[:-1] # remove the trailing \n 25 | line = line.strip() # remove any remaining whitespace 26 | testing_set.append(line) 27 | print("Generating TESTING input...") 28 | test_data, test_labels = onehot.generate_input(testing_set, total_features) # perform one-hot encoding 29 | return test_data, test_labels 30 | 31 | 32 | """ 33 | functions to compute the Jacobian with numpy. 34 | https://medium.com/unit8-machine-learning-publication/computing-the-jacobian-matrix-of-a-neural-network-in-python-4f162e5db180 35 | First we specify the forward and backward passes of each layer to implement backpropagation manually. 36 | """ 37 | 38 | 39 | def affine_forward(x, w, b): 40 | """ 41 | Forward pass of an affine layer 42 | :param x: input of dimension (I, ) 43 | :param w: weights matrix of dimension (I, O) 44 | :param b: bias vector of dimension (O, ) 45 | :return output of dimension (O, ), and the cache needed for backprop 46 | """ 47 | out = np.dot(x, w) + b 48 | cache = (x, w) 49 | return out, cache 50 | 51 | 52 | def affine_backward(dout, cache): 53 | """ 54 | Backward pass for an affine layer. 55 | :param dout: Upstream Jacobian, of shape (M, O) 56 | :param cache: Tuple of: 57 | - x: Input data, of shape (I, ) 58 | - w: Weights, of shape (I, O) 59 | :return the jacobian matrix containing derivatives of the M neural network outputs with respect to 60 | this layer's inputs, evaluated at x, of shape (M, I) 61 | """ 62 | x, w = cache 63 | dx = np.dot(dout, w.T) 64 | return dx 65 | 66 | 67 | def relu_forward(x): 68 | """ Forward ReLU 69 | """ 70 | out = np.maximum(np.zeros(x.shape), x) 71 | cache = x 72 | return out, cache 73 | 74 | 75 | def relu_backward(dout, cache): 76 | """ 77 | Backward pass of ReLU 78 | :param dout: Upstream Jacobian 79 | :param cache: the cached input for this layer 80 | :return: the jacobian matrix containing derivatives of the M neural network outputs with respect to 81 | this layer's inputs, evaluated at x.
82 | """ 83 | x = cache 84 | dx = dout * np.where(x > 0, np.ones(x.shape), np.zeros(x.shape)) 85 | return dx 86 | 87 | 88 | def softmax_forward(x): 89 | """ Forward softmax 90 | """ 91 | exps = np.exp(x - np.max(x)) 92 | s = exps / exps.sum() 93 | return s, s 94 | 95 | 96 | def softmax_backward(dout, cache): 97 | """ 98 | Backward pass for softmax 99 | :param dout: Upstream Jacobian 100 | :param cache: contains the cache (in this case the output) for this layer 101 | """ 102 | s = cache 103 | ds = np.diag(s) - np.outer(s, s.T) 104 | dx = np.dot(dout, ds) 105 | return dx 106 | 107 | 108 | def get_activations(model, layer_id, X): 109 | """ 110 | Computes outputs of intermediate layers 111 | :param model: the trained model 112 | :param layer_id: the id of the layer that we want the output from 113 | :param X: input feature vector 114 | :return: output of layer (layer_id) 115 | """ 116 | intermediate_layer_model = keras.models.Model(inputs=model.input, 117 | outputs=model.layers[layer_id].output) 118 | intermediate_output = intermediate_layer_model.predict(X) 119 | return intermediate_output 120 | 121 | 122 | def forward_backward(model, x): 123 | """ 124 | computes the forward derivative for the given input 125 | :param model: the trained model 126 | :param x: input feature vector 127 | :return: prediction result and forward derivative 128 | """ 129 | layer_to_cache = dict() # for each layer, we store the cache needed for backward pass 130 | forward_values = [] 131 | 132 | for i in range(0, len(model.layers), 2): 133 | values = {} 134 | w, b = model.layers[i].get_weights() 135 | values['w'] = w 136 | values['b'] = b 137 | forward_values.append(values) 138 | 139 | # Forward pass 140 | a1, cache_a1 = affine_forward(x, forward_values[0]['w'], forward_values[0]['b']) 141 | _, cache_r1 = relu_forward(a1) 142 | r1 = get_activations(model, 0, x) 143 | forward_values[0]['a'] = a1 144 | forward_values[0]['cache_a'] = cache_a1 145 | forward_values[0]['r'] = r1 146 | forward_values[0]['cache_r'] = cache_r1 147 | 148 | for i, layer_index in zip(range(1, len(forward_values) - 1), range(2, len(model.layers), 2)): 149 | a, cache_a = affine_forward(forward_values[i - 1]['r'], forward_values[i]['w'], forward_values[i]['b']) 150 | _, cache_r = relu_forward(a) 151 | r = get_activations(model, layer_index, x) 152 | forward_values[i]['a'] = a 153 | forward_values[i]['cache_a'] = cache_a 154 | forward_values[i]['r'] = r 155 | forward_values[i]['cache_r'] = cache_r 156 | 157 | a, cache_a = affine_forward(forward_values[len(forward_values) - 2]['r'], 158 | forward_values[len(forward_values) - 1]['w'], 159 | forward_values[len(forward_values) - 1]['b']) 160 | forward_values[len(forward_values) - 1]['a'] = a 161 | forward_values[len(forward_values) - 1]['cache_a'] = cache_a 162 | out, cache_out = softmax_forward(a) 163 | 164 | # backward pass 165 | dout = np.diag(np.ones(out.size, )) # the derivatives of each output w.r.t. each output. 
166 | dout = softmax_backward(dout, cache_out) 167 | dout = affine_backward(dout, forward_values[len(forward_values) - 1]['cache_a']) 168 | 169 | for i in range(len(forward_values) - 2, 0, -1): 170 | dout = relu_backward(dout, forward_values[i]['cache_r']) 171 | dout = affine_backward(dout, forward_values[i]['cache_a']) 172 | 173 | dout = relu_backward(dout, forward_values[0]['cache_r']) 174 | dx = affine_backward(dout, forward_values[0]['cache_a']) 175 | 176 | return out, dx 177 | 178 | 179 | def craft_adversarial_samples(x, y, F, k): 180 | """ 181 | JSMA variant of the adversarial example crafting algorithm, as described in https://arxiv.org/abs/1606.04435 182 | JSMA iteratively selects the most useful features to perturb by a small magnitude until the target class is 183 | achieved. The perturbed features are selected based on the saliency map. Saliency maps are used for a network's 184 | visualization and describe which features are the most important for a particular output class. The goal 185 | is to eliminate those attributes from a legitimate sample and bring up the most important ones for the target class 186 | in order to cause the model to misclassify. This is done by pushing the features away from the original label 187 | and closer to the target class. 188 | Steps: 189 | 1) Compute the gradient of F with respect to the input x to estimate the direction in which a perturbation in x 190 | would change F's output. That is, compute the forward derivative (the Jacobian of the learned function for 191 | a legitimate sample): 192 | ∇F(x) = ∂F(x)/∂x = [∂F_j(x)/∂x_i], i ∈ 1…M, j ∈ 1…N 193 | where x is the model's input, F is the network, F(x) the predicted class, M the input dimension, 194 | N the output dimension, and entry (j, i) is the derivative of output class j with respect to input feature i. 195 | In essence, this computes the gradient of F with respect to the input x to estimate the direction in which 196 | a perturbation in x would change the output. In backpropagation, the derivative is taken 197 | with respect to the loss function and the gradients with respect to the network parameters, with the goal of 198 | updating the weights. On the contrary, in JSMA the forward derivative is taken with respect to the network 199 | directly and the gradients with respect to the input data. 200 | 2) Choose a perturbation δ of x with maximal positive gradient into the target class y'. 201 | In other words, choose the index that maximizes the change into the target class 0 by changing x_i. 202 | The limitation is that we can only add features and not discard them, since in a real-world scenario an adversary doesn't want 203 | to 'break' the functionality of an application.
204 | Algorithm: 205 | Input: x, y, F, k, I 206 | x_adv <- x 207 | Gamma = {1...|x|} 208 | while arg max_j F_j(x_adv) != y and ||δ_x||_1 < k do 209 | compute the forward derivative ∇F(x_adv) 210 | i_max = arg max_{i ∈ Gamma ∩ I, x_adv_i = 0} ∂F_y(x_adv)/∂x_i 211 | if i_max <= 0 then 212 | return Failure 213 | end if 214 | x_adv[i_max] = 1 215 | δ_x <- x_adv - x 216 | end while; return x_adv 217 | :param x: input feature vector 218 | :param y: target class 219 | :param F: the trained model 220 | :param k: maximum allowed distortion (upper bound on the L1 norm of the perturbation δ_x) 221 | :return: adversarial sample based on feature vector x 222 | """ 223 | x_adv = x # note: this aliases x, so the caller's array is modified in place 224 | gamma = [1] * len(x) 225 | delta_x = [0] 226 | changes = 0 227 | 228 | if np.argmax(F.predict(x_adv), 1) == 0: # already classified as benign (class 0): nothing to craft 229 | return x_adv, -1 230 | 231 | while np.argmax(F.predict(x_adv), 1) != y and np.linalg.norm(delta_x, ord=1) < k and changes < 20: 232 | # compute the forward derivative (Jacobian) 233 | prob, forward_derivative = forward_backward(F, x_adv) 234 | 235 | tmp = np.multiply(forward_derivative[0], gamma) 236 | for i, feature in enumerate(x_adv[0]): 237 | if feature == 1: 238 | tmp[i] = 0 # never select features that are already present 239 | i_max = np.argmax(tmp) 240 | if i_max <= 0: 241 | raise ValueError('FAILURE: We can only add features to an application!') 242 | 243 | x_adv[0][i_max] = 1 244 | delta_x = np.subtract(x_adv, x) 245 | # print(i_max) 246 | if i_max not in changes_dict: 247 | changes_dict[i_max] = 1 248 | else: 249 | changes_dict[i_max] += 1 250 | changes += 1 251 | print("Changes:", changes) 252 | 253 | return x_adv, changes 254 | 255 | 256 | def evaluate_detector_on_adversarial_examples(): 257 | average_changes = 0 258 | amount_malwares = 0 259 | averageChanges = 0 260 | # attack the detector 261 | for i in range(len(val_data)): 262 | 263 | if val_labels[i] == 1: 264 | 265 | x = val_data[i:i + 1] 266 | # print("x: ", x) 267 | # print(x.shape) 268 | try: 269 | adv_x, changes = craft_adversarial_samples(x, 0, detector, 1) 270 | # print(adv_x) 271 | val_data[i] = adv_x 272 | if changes >= 0: 273 | average_changes += changes 274 | amount_malwares += 1 275 | except NameError: 276 | pass 277 | except ValueError: 278 | pass 279 | if amount_malwares > 0: 280 | averageChanges += (average_changes / float(amount_malwares)) 281 | 282 | # evaluate the detector 283 | predictions = detector.predict(val_data) 284 | confusion = confusion_matrix(val_labels, np.argmax(predictions, axis=1)) 285 | print(confusion) 286 | TP = confusion[1, 1] 287 | TN = confusion[0, 0] 288 | FP = confusion[0, 1] 289 | FN = confusion[1, 0] 290 | FNR = FN / float(FN + TP) * 100 291 | FPR = FP / float(FP + TN) * 100 292 | accuracy = ((TP + TN) / float(TP + TN + FP + FN)) * 100 293 | print("Adversarial FP:", FP, "- FN:", FN, "- TP:", TP, "- TN", TN) 294 | print("Adversarial Accuracy:", accuracy, "- FPR:", FPR, "- FNR:", FNR) 295 | print("Misclassification Rate:", FNR - FNR_original) 296 | print("Distortion:", averageChanges) 297 | print(changes_dict) 298 | 299 | 300 | if __name__ == "__main__": 301 | total_features = 3880 # total unique features 302 | print("Creating data-labels...") 303 | onehot.create_list_of_apps() # function from set_onehot_encoding.py 304 | 305 | changes_dict = {} # dictionary for perturbations (added features) 306 | 307 | trained_model = tf.keras.models.load_model('best_model_Adam.h5') 308 | #detector = tf.keras.models.load_model('external_detector.h5') 309 | 310 | averageChanges = 0 311 | 312 | val_data, val_labels = create_set() 313 | predict_original = trained_model.predict(val_data) 314 | confusion =
confusion_matrix(val_labels, np.argmax(predict_original, axis=1)) 315 | 316 | TP = confusion[1, 1] 317 | TN = confusion[0, 0] 318 | FP = confusion[0, 1] 319 | FN = confusion[1, 0] 320 | FNR_original = FN / float(FN + TP) * 100 321 | FPR = FP / float(FP + TN) * 100 322 | accuracy = ((TP + TN) / float(TP + TN + FP + FN)) * 100 323 | print(confusion) 324 | print("Original FP:", FP, "- FN:", FN, "- TP:", TP, "- TN", TN) 325 | print("Original Accuracy:", accuracy, "- FPR:", FPR, "- FNR:", FNR_original) 326 | del predict_original 327 | average_changes = 0 328 | amount_malwares = 0 329 | val_data, val_labels = create_set() 330 | 331 | for i in range(len(val_data)): 332 | 333 | if val_labels[i] == 1: 334 | 335 | x = val_data[i:i + 1] 336 | #print("x: ", x) 337 | #print(x.shape) 338 | try: 339 | adv_x, changes = craft_adversarial_samples(x, 0, trained_model, 1) 340 | # print(adv_x) 341 | val_data[i] = adv_x 342 | if changes >= 0: 343 | average_changes += changes 344 | amount_malwares += 1 345 | except NameError: 346 | pass 347 | except ValueError: 348 | pass 349 | 350 | if amount_malwares > 0: 351 | averageChanges += (average_changes / float(amount_malwares)) 352 | #print(val_data.shape) 353 | 354 | # evaluate the model on adversarial examples 355 | predictions = trained_model.predict(val_data) 356 | confusion = confusion_matrix(val_labels, np.argmax(predictions, axis=1)) 357 | print(confusion) 358 | TP = confusion[1, 1] 359 | TN = confusion[0, 0] 360 | FP = confusion[0, 1] 361 | FN = confusion[1, 0] 362 | FNR = FN / float(FN + TP) * 100 363 | FPR = FP / float(FP + TN) * 100 364 | accuracy = ((TP + TN) / float(TP + TN + FP + FN)) * 100 365 | print("Adversarial FP:", FP, "- FN:", FN, "- TP:", TP, "- TN", TN) 366 | print("Adversarial Accuracy:", accuracy, "- FPR:", FPR, "- FNR:", FNR) 367 | print("Misclassification Rate:", FNR - FNR_original) 368 | print("Distortion:", averageChanges) 369 | print(changes_dict) 370 | 371 | #evaluate_detector_on_adversarial_examples() 372 | -------------------------------------------------------------------------------- /feature_based_reduced_dataset/label_encoding.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | feature_vector = {} 4 | index = 0 5 | feature_vectors_dir = 'new_feature_vectors/' 6 | feature_indexes_dir = 'features_indexes/' 7 | 8 | if not os.path.exists(feature_indexes_dir): 9 | os.makedirs(feature_indexes_dir) 10 | 11 | print("Creating a dictionary that maps features to numeric values...") 12 | for filename in os.listdir(feature_vectors_dir): # read all apps 13 | with open(feature_vectors_dir + filename, "r") as file: # open an app 14 | for line in file: # read the app line by line 15 | feature_type = line[:line.find('::')] # extract the feature type 16 | feature = line.strip() # remove whitespace chars 17 | # if a feature is not present in the feature vector, map the feature to the index and increment the index 18 | if feature not in feature_vector: 19 | feature_vector[feature] = index 20 | index = index + 1 21 | print("Finished!") 22 | 23 | print("Creating files with numeric values as features...") 24 | for filename in os.listdir(feature_vectors_dir): # recreate the files with indexes 25 | with open(feature_vectors_dir + filename, "r") as file: # first open the original feature vectors 26 | f = open(feature_indexes_dir + filename, "a") # create a new file with the same SHA name in another dir 27 | for line in file: # read the original feature vectors line by line 28 | feature_type = line[:line.find('::')] # extract
feature type 29 | feature = line.strip() # remove whitespace chars 30 | f.write(str(feature_vector[feature]) + '\n') # append the index of the feature to the new file 31 | f.close() 32 | 33 | print("Total features in dataset: ", len(feature_vector)) # 34 | -------------------------------------------------------------------------------- /feature_based_reduced_dataset/neural_network.py: -------------------------------------------------------------------------------- 1 | from sklearn.metrics import confusion_matrix 2 | import timeit 3 | from keras import Sequential 4 | from keras.layers import Dense, Dropout 5 | import numpy as np 6 | import matplotlib.pyplot as plt 7 | from keras.callbacks import TensorBoard 8 | from keras.callbacks import EarlyStopping 9 | from keras.callbacks import ModelCheckpoint 10 | from keras.optimizers import Adam, SGD, RMSprop, Adagrad, Adadelta, Adamax, Nadam 11 | 12 | average_FNR = 0 13 | average_FPR = 0 14 | average_accuracy = 0 15 | 16 | 17 | def generate_neural_network(total_features, units, dropout, learn_rate, kernel, bias, activation_function): 18 | """ 19 | :param total_features: the total number of features (input_dim) used to train our network 20 | :param units: neurons in the hidden layers 21 | :param dropout: the dropout rate 22 | :param learn_rate: learning rate 23 | :param kernel: (kernel_initializer) weight initialization 24 | :param bias: (bias_initializer) bias initialization 25 | :param activation_function: activation function 26 | :return: the compiled Keras model 27 | """ 28 | model = Sequential() # neural net init 29 | """ 30 | add the input layer with dimension total_features, 31 | hidden layers with the defined units, dropout rate, weight and bias initialization, 32 | the relu activation function, and softmax in the output layer 33 | """ 34 | model.add(Dense(units=units[0], activation=activation_function, input_dim=total_features, kernel_initializer=kernel, 35 | bias_initializer=bias)) 36 | model.add(Dropout(dropout)) # add the dropout rate 37 | 38 | for hidden_layer_units in units[1:]: # add the hidden layers with the units defined in train_models.py 39 | model.add(Dense(units=hidden_layer_units, activation=activation_function, kernel_initializer=kernel, 40 | bias_initializer=bias)) 41 | model.add(Dropout(dropout)) 42 | 43 | model.add(Dense(2, activation="softmax")) # output layer with the softmax activation function and 2 neurons 44 | 45 | # loss: sparse categorical cross entropy, Optimizer: Adam 46 | model.compile(loss="sparse_categorical_crossentropy", 47 | optimizer=Adam(lr=learn_rate), 48 | metrics=["accuracy"]) 49 | 50 | """ 51 | information about the NN, such as the number of layers, the output shape, 52 | the number of weights in each layer and the total weights.
53 | """ 54 | #model.summary() 55 | 56 | # plot of the neural network graph 57 | #plot_model(model, to_file="figures/DNN_model_plot.png", show_shapes=True, show_layer_names=True) 58 | 59 | return model 60 | 61 | 62 | def train_neural_network(model, epochs, batch_size, features, labels, verbose=0, 63 | validation=False, val_data=None, val_labels=None, 64 | callbacks=False, plot_history=False): 65 | """ 66 | :param modelh5: neural network model from generate_neural_network() 67 | :param epochs: number of epochs 68 | :param batch_size: batch size 69 | :param features: training data 70 | :param labels: training labels 71 | :param verbose: verbosity level 72 | :param validation: if True validate data 73 | :param val_data: validation data 74 | :param val_labels: validation labels 75 | :param callbacks: if True use Tensorboard callback 76 | :param plot_history: if True plots accuracy and loss history per epoch 77 | :return: 78 | """ 79 | print("\n\n--- Training", type(model).__name__, "---") 80 | start_time = timeit.default_timer() 81 | 82 | # get the name of the optimizer in the defined model 83 | opt_config = model.optimizer.get_config() 84 | if 'name' not in opt_config.keys(): 85 | _name = str(model.optimizer.__class__).split('.')[-1].replace('\'', '').replace('>', '') 86 | opt_config.update({'name': _name}) 87 | 88 | if callbacks: 89 | # directory to save callbacks 90 | log_dir = "logs/fit/" + "DNN_200_200_" + opt_config['name'] 91 | # callbacks: TensorBoard, EarlyStopping, ModelCheckPoint 92 | # TensorBoard for storing visualizations of the neural net 93 | tensorboard_callback = TensorBoard(log_dir=log_dir, histogram_freq=1, write_graph=True, write_images=True) 94 | # EarlyStopping to monitor validation loss. If there is any improve after 10 epochs,the training procedure stops 95 | early_stopping_callback = EarlyStopping(monitor='val_loss', mode='min', patience=10, verbose=verbose) 96 | # ModelCheckpoint to monitor validation accuracy. 
It stores the model with the highest accuracy 97 | model_checkpoint_callback = ModelCheckpoint('best_model_' + opt_config['name'] + '.h5', monitor='val_accuracy', mode='max', 98 | verbose=verbose, save_best_only=True) 99 | if not validation: 100 | # fit the model 101 | print("Note: Validation data is not included... Only the TensorBoard callback is used!") 102 | history = model.fit(features, labels, epochs=epochs, batch_size=batch_size, verbose=verbose, 103 | callbacks=[tensorboard_callback]) # train the neural network 104 | else: 105 | # fit the model 106 | history = model.fit(features, labels, epochs=epochs, batch_size=batch_size, verbose=verbose, 107 | validation_data=(val_data, val_labels), 108 | callbacks=[tensorboard_callback, early_stopping_callback, model_checkpoint_callback]) 109 | else: # train the model without the use of callbacks 110 | history = model.fit(features, labels, epochs=epochs, batch_size=batch_size, verbose=verbose) 111 | 112 | if plot_history: # plot the accuracy and loss per epoch 113 | if not validation: 114 | # print(history.history.keys()) 115 | # summarize history for training accuracy 116 | plt.plot(history.history['accuracy']) 117 | plt.title('model accuracy') 118 | plt.ylabel('accuracy') 119 | plt.xlabel('epoch') 120 | plt.legend(['train'], loc='upper left') 121 | plt.show() 122 | # summarize history for training loss 123 | plt.plot(history.history['loss']) 124 | plt.title('model loss') 125 | plt.ylabel('loss') 126 | plt.xlabel('epoch') 127 | plt.legend(['train'], loc='upper left') 128 | plt.show() 129 | else: 130 | # summarize history for training and validation accuracy 131 | plt.plot(history.history['accuracy']) 132 | plt.plot(history.history['val_accuracy']) 133 | plt.title('model accuracy') 134 | plt.ylabel('accuracy') 135 | plt.xlabel('epoch') 136 | plt.legend(['train', 'test'], loc='upper left') 137 | plt.show() 138 | # summarize history for training and validation loss 139 | plt.plot(history.history['loss']) 140 | plt.plot(history.history['val_loss']) 141 | plt.title('model loss') 142 | plt.ylabel('loss') 143 | plt.xlabel('epoch') 144 | plt.legend(['train', 'test'], loc='upper left') 145 | plt.show() 146 | stop_time = timeit.default_timer() 147 | print(type(model).__name__, "training time: ", stop_time - start_time, "seconds\n\n") 148 | 149 | 150 | def evaluate_neural_network(model, features, labels): 151 | """ 152 | :param model: neural network model from generate_neural_network() 153 | :param features: test data 154 | :param labels: test labels 155 | :return: the accuracy (in percent) on the given data 156 | """ 157 | scores = model.evaluate(features, labels, verbose=0) 158 | print(model.metrics_names[1], "%.2f%%" % (scores[1] * 100)) 159 | return scores[1] * 100 160 | 161 | 162 | def test_neural_network(model, test_data, test_labels): 163 | """ 164 | :param model: neural network model from generate_neural_network() 165 | :param test_data: validation data 166 | :param test_labels: validation labels 167 | :return: 168 | """ 169 | global average_FNR, average_FPR, average_accuracy 170 | print(type(model).__name__, "predicting...") 171 | start_time = timeit.default_timer() 172 | predicted = model.predict(test_data) 173 | stop_time = timeit.default_timer() 174 | # print(predicted) 175 | # pick the class with the highest probability 176 | confusion = confusion_matrix(test_labels, np.argmax(predicted, axis=1)) # confusion matrix 177 | print(confusion) 178 | # confusion matrix metrics 179 | TP = confusion[1, 1] 180 | TN = confusion[0, 0] 181 | FP = confusion[0, 1] 182 | FN = confusion[1, 0] 183 | FNR = FN
/ float(FN + TP) * 100 184 | FPR = FP / float(FP + TN) * 100 185 | accuracy = (TP + TN) / float(TP + TN + FP + FN) * 100 186 | print("FP:", FP, "- FN:", FN, "- TP:", TP, "- TN", TN) 187 | print("Accuracy:", accuracy, "- FPR:", FPR, "- FNR:", FNR) 188 | print(type(model).__name__, "prediction time: ", stop_time - start_time, "seconds\n\n") 189 | average_FNR += FNR 190 | average_FPR += FPR 191 | average_accuracy += accuracy 192 | 193 | 194 | def get_average_metrics(val_runs): 195 | global average_FNR, average_FPR, average_accuracy 196 | average_FNR = average_FNR / val_runs 197 | average_FPR = average_FPR / val_runs 198 | average_accuracy = average_accuracy / val_runs 199 | print("Average Accuracy:", average_accuracy, "- Average FPR:", average_FPR, "- Average FNR:", average_FNR) 200 | -------------------------------------------------------------------------------- /feature_based_reduced_dataset/set_onehot_encoding.py: -------------------------------------------------------------------------------- 1 | import csv 2 | from random import randint, shuffle 3 | import os 4 | import numpy as np 5 | 6 | csv_malware = "../../sha256_family.csv" # csv file with the malware apps 7 | feature_index_dir = 'features_indexes/' # directory with the indexed features of all apps 8 | 9 | original_malware = [] 10 | eliminated_malware = [] 11 | benign = [] 12 | 13 | 14 | def create_list_of_apps(): 15 | print("Creating list of malicious apps...") 16 | with open(csv_malware, 'r') as file: # open the malware csv file 17 | next(file) # skip the header line 18 | reader = csv.reader(file, delimiter=',') # read the csv malware families 19 | for row in reader: 20 | original_malware.append(row[0]) # append every row from the csv file into a list 21 | for filename in os.listdir(feature_index_dir): 22 | if filename in original_malware: 23 | eliminated_malware.append(filename) 24 | print("Malware apps found: ", len(eliminated_malware)) # 2591 after duplicate elimination 25 | print("Malware sample: ", eliminated_malware[randint(0, len(eliminated_malware) - 1)]) # print a random malware sample 26 | 27 | print("Creating list of benign apps...") 28 | for filename in os.listdir(feature_index_dir): # read all apps 29 | if filename not in original_malware: # if a SHA name is not in the malware list, append it to the benign list 30 | benign.append(filename) 31 | print("Benign apps found: ", len(benign)) # 89345 after duplicate elimination 32 | print("Benign app sample: ", benign[randint(0, len(benign) - 1)], ) # print a random benign app 33 | 34 | print("Total apps (Benign & Malicious) found: ", len(eliminated_malware) + len(benign)) # 91936 35 | 36 | malware_incremental_counter = 0 37 | benign_incremental_counter = 0 38 | 39 | 40 | def generate_set_incremental(set_size, malware_ratio): 41 | global malware_incremental_counter, benign_incremental_counter 42 | set = [] # list that will be filled with the sampled apps 43 | 44 | print("Creating set with", set_size, "samples...") 45 | print("Malware ratio:", int(malware_ratio * 100), "%, totaling", int(set_size * malware_ratio), "apps in", set_size) 46 | print("Creating malware set...") 47 | 48 | while len(set) < (set_size * malware_ratio): 49 | app = eliminated_malware[malware_incremental_counter] # take the next malware app in order (incremental, not random) 50 | malware_incremental_counter += 1 51 | if malware_incremental_counter >= 2591: 52 | break 53 | if app not in set: 54 | set.append(app) # append the malware app to the set 55 | 56 | print("Total malware apps in set: ", len(set)) 57 | print("Malware sample in set: ", set[0]) 58 | 59 | print("Creating benign set...") 60 | 61 | while
len(set) < set_size: 62 | app = benign[benign_incremental_counter] # take the next benign app in order (incremental, not random) 63 | benign_incremental_counter += 1 64 | if benign_incremental_counter >= 89345: 65 | break 66 | if app not in set: 67 | set.append(app) # append the benign app to the set 68 | print(malware_incremental_counter) 69 | print("Total apps (malicious and benign) in set: ", len(set)) 70 | return set 71 | 72 | 73 | def generate_set(set_size, malware_ratio): 74 | set = [] # list that will be filled with the sampled apps 75 | 76 | print("Creating set with", set_size, "samples...") 77 | print("Malware ratio:", int(malware_ratio * 100), "%, totaling", int(set_size * malware_ratio), "apps in", set_size) 78 | print("Creating malware set...") 79 | 80 | while len(set) < (set_size * malware_ratio): 81 | index = randint(0, len(eliminated_malware) - 1) # choose a random index in [0, 2590] 82 | app = eliminated_malware[index] # locate the malware app at the random index in the malware list 83 | if app not in set: 84 | set.append(app) # append the malware app to the set 85 | 86 | print("Total malware apps in set: ", len(set)) 87 | print("Malware sample in set: ", set[0]) 88 | 89 | print("Creating benign set...") 90 | while len(set) < set_size: 91 | index = randint(0, len(benign) - 1) # choose a random index in [0, 89344] 92 | app = benign[index] # locate the benign app at the random index in the benign list 93 | if app not in set: 94 | set.append(app) # append the benign app to the set 95 | 96 | print("Total apps (malicious and benign) in set: ", len(set)) 97 | return set 98 | 99 | 100 | def generate_input(set, total_features): 101 | print("performing one hot encoding...") 102 | # a 2D array filled with zeros that will hold the feature vector of each app 103 | data = np.zeros((len(set), total_features), dtype=float) 104 | # an array filled with zeros that will hold the label of each app {0 - benign, 1 - malicious} 105 | labels = np.zeros((len(set),), dtype=int) 106 | shuffle(set) # shuffle the set; comment out to work with predefined training and test sets 107 | for id_app, app in enumerate(set): # iterate through the set with a counter 108 | with open(feature_index_dir + app, 'r') as file: # open each app in the set 109 | for index in file: # read line by line 110 | data[id_app][int(index)] = 1.0 # set the corresponding element of the array to 1.0 111 | 112 | if app in eliminated_malware: 113 | labels[id_app] = 1 # set the corresponding label to 1 if the app is malware 114 | else: 115 | labels[id_app] = 0 116 | 117 | #print(data) 118 | #print(labels) 119 | #print(data.shape) 120 | #print(labels.shape) 121 | return data, labels -------------------------------------------------------------------------------- /feature_based_reduced_dataset/train_models.py: -------------------------------------------------------------------------------- 1 | import set_onehot_encoding as onehot 2 | import feature_based_original_dataset.models as models 3 | import neural_network as NN 4 | import numpy as np 5 | import os 6 | 7 | 8 | def create_sets(): 9 | 10 | if not os.path.isfile("training_set_8500.txt"): 11 | set_size = 8500 12 | malware_ratio = 0.3 13 | print("Creating data-labels...") 14 | print("Generating TRAINING set...") 15 | training_set = onehot.generate_set(set_size, malware_ratio) # generate a random training set 16 | with open("training_set_8500.txt", "w") as file: 17 | for item in training_set: 18 | file.write(str(item) + "\n") 19 | 20 | if not os.path.isfile("testing_set_8500.txt"): 21 | set_size = 8500 22 | malware_ratio = 0.3 23 |
print("Creating data-labels...") 24 | print("Generating TESTING set...") 25 | testing_set = onehot.generate_set(set_size, malware_ratio) # generate random testing set 26 | with open("testing_set_1500.txt", "w") as file: 27 | for item in testing_set: 28 | file.write(str(item) + "\n") 29 | 30 | training_set = [] 31 | testing_set = [] 32 | 33 | with open("training_set_8500.txt", "r") as file: # read training set file and append applications to list 34 | for line in file: 35 | line.strip() # remove whitespace 36 | line = line[:-1] # remove \n 37 | training_set.append(line) # add item to list 38 | with open("testing_set_8500.txt", "r") as file: # read testing set file and append applications to list 39 | for line in file: 40 | line.strip() 41 | line = line[:-1] 42 | testing_set.append(line) 43 | print("Generating TRAINING input...") 44 | data, labels = onehot.generate_input(training_set, total_features) # perform one-hot encoding 45 | print("Generating TESTING input...") 46 | test_data, test_labels = onehot.generate_input(testing_set, total_features) # perform one-hot encoding 47 | return data, labels, test_data, test_labels 48 | 49 | 50 | def train_models(): 51 | data, labels, test_data, test_labels = create_sets() 52 | 53 | model = NB.train_multi_naive_bayes_classifier(data,labels, save=False) 54 | NB.test_multi_naive_bayes_classifier(model, test_data, test_labels) 55 | 56 | model = DT.train_decision_tree_classifier(data, labels, save=False) 57 | DT.test_decision_tree_classifier(model, test_data, test_labels) 58 | 59 | model = RF.train_random_forest_classifier(data, labels, save=False) 60 | RF.test_random_forest_classifier(model, test_data, test_labels) 61 | 62 | model = KNN.train_knn_classifier(data, labels, save=False) 63 | KNN.test_knn_classifier(model, test_data, test_labels) 64 | 65 | model = LR.train_logistic_regression_classifier(data, labels, save=False) 66 | LR.test_logistic_regression_classifier(model, test_data, test_labels) 67 | 68 | model = SVM.train_svm_classifier(data, labels, save=False) 69 | SVM.test_svm_classifier(model, test_data, test_labels) 70 | 71 | # init the neural net 72 | model = NN.generate_neural_network(total_features, units, dropout, learn_rate, kernel_initializer, 73 | bias_initializer, activation_function) 74 | """ 75 | train the neural network with the given model, epochs, batch size, train data-labels. 76 | Specify verbosity level, validation data, callbacks and plots if needed. 77 | Default parameters: 78 | verbose=0, validation=False, val_data=None, val_labels=None, callbacks=False, plot_history=False 79 | example: 80 | NN.train_neural_network(model, epochs, batch_size, data, labels, verbose=0, 81 | validation=True, val_data=test_data, val_labels=test_labels, 82 | callbacks=True, plot_history=True) 83 | This is the main training stage and thus we want to save the best models at the right times. This is done 84 | setting the callback to True. Keras will seek for the minimum validation loss and it saves the model with 85 | the highest validation accuracy. 
86 | """ 87 | NN.train_neural_network(model, epochs, batch_size, data, labels, verbose=2, 88 | validation=True, val_data=test_data, val_labels=test_labels, 89 | callbacks=True) 90 | NN.test_neural_network(model, test_data, test_labels) 91 | 92 | 93 | if __name__ == "__main__": 94 | total_features = 3880 # total unique features 95 | set_size = 8500 # set site that will be used to create random training set 96 | testing_set_size = 8500 # set site that will be used to create random test set 97 | malware_ratio = 0.3 # malware ratio in the set size 98 | 99 | print("Creating data-labels...") 100 | onehot.create_list_of_apps() # function from set_one_encoding.py 101 | 102 | # initialize sklearn models 103 | NB = models.MultinomialNaiveBayes() 104 | DT = models.DecisionTree() 105 | RF = models.RandomForest() 106 | KNN = models.KNearestNeighbors() 107 | LR = models.LogRegression() 108 | SVM = models.SupportVectorMachine() 109 | 110 | val_runs = 8 111 | # neural net parameters 112 | units = [200, 200] 113 | dropout = 0.2 114 | epochs = 20 115 | batch_size = 150 116 | learn_rate = 0.001 117 | # momentum = 0.0 # to work with SGD 118 | kernel_initializer = 'glorot_uniform' 119 | bias_initializer = 'zeros' 120 | activation_function = 'relu' 121 | 122 | train_models() 123 | -------------------------------------------------------------------------------- /feature_based_reduced_dataset/train_random_subsampl.py: -------------------------------------------------------------------------------- 1 | import set_onehot_encoding as onehot 2 | import models 3 | import neural_network as NN 4 | import numpy as np 5 | 6 | def create_random_sets(): 7 | print("Generating TRAINING set...") 8 | training_set = onehot.generate_set(set_size, malware_ratio) # generate random training set 9 | print("Generating TRAINING input...") 10 | data, labels = onehot.generate_input(training_set, total_features) # perform one-hot encoding 11 | print("Generating TESTING set...") 12 | testing_set = onehot.generate_set(testing_set_size, malware_ratio) # generate random testing set 13 | print("Generating TESTING input...") 14 | test_data, test_labels = onehot.generate_input(testing_set, total_features) # perform one-hot encoding 15 | return data, labels, test_data, test_labels # return train data - labels and test data - labels 16 | 17 | 18 | def random_sub_sampling(runs): 19 | 20 | score_nn = [] 21 | score_rf = [] 22 | score_lr = [] 23 | score_dt = [] 24 | score_svm = [] 25 | score_knn = [] 26 | 27 | for i in range(runs): 28 | 29 | data, labels, test_data, test_labels = create_random_sets() # choose random training and testing sets 30 | 31 | """# init neural net 32 | model = NN.generate_neural_network(total_features, units, dropout, learn_rate, kernel_initializer, 33 | bias_initializer, activation_function) 34 | ''' 35 | this is not the actual training procedure and we don't want to save the models. To save models and implement 36 | the early stopping technique refer to train_models.py 37 | The goal of this operation is only to determine the behavior of models to random training sets and random 38 | testing sets! 39 | So, only train and evaluate models. 
40 |         '''
41 |         NN.train_neural_network(model, epochs, batch_size, data, labels, verbose=2)
42 |         score = NN.evaluate_neural_network(model, test_data, test_labels)
43 |         score_nn.append(score)"""
44 | 
45 |         #model = DT.train_decision_tree_classifier(data, labels)  # train Decision Tree Classifier
46 |         #score_dt.append(DT.evaluate_decision_tree_classifier(model, test_data, test_labels))
47 | 
48 |         #model = RF.train_random_forest_classifier(data, labels)  # train Random Forest
49 |         #score_rf.append(RF.evaluate_random_forest_classifier(model, test_data, test_labels))
50 | 
51 |         model = KNN.train_knn_classifier(data, labels)  # train k-Nearest Neighbors Classifier
52 |         score_knn.append(KNN.evaluate_knn_classifier(model, test_data, test_labels))  # accumulate scores across runs
53 | 
54 |         #model = LR.train_logistic_regression_classifier(data, labels)  # train Logistic Regression
55 |         #score_lr.append(LR.evaluate_logistic_regression_classifier(model, test_data, test_labels))
56 | 
57 |         #model = SVM.train_svm_classifier(data, labels)  # train Support Vector Machines
58 |         #score_svm.append(SVM.evaluate_svm_classifier(model, test_data, test_labels))
59 | 
60 |     #print("NN Average accuracy: ", np.mean(score_nn), "Standard Deviation:", np.std(score_nn))
61 |     #print("DT Average accuracy: ", np.mean(score_dt), "Standard Deviation:", np.std(score_dt))
62 |     #print("RF Average accuracy: ", np.mean(score_rf), "Standard Deviation:", np.std(score_rf))
63 |     print("kNN Average accuracy: ", np.mean(score_knn), "Standard Deviation:", np.std(score_knn))
64 |     #print("LR Average accuracy: ", np.mean(score_lr), "Standard Deviation:", np.std(score_lr))
65 |     #print("SVM Average accuracy: ", np.mean(score_svm), "Standard Deviation:", np.std(score_svm))
66 | 
67 | 
68 | 
69 | if __name__ == "__main__":
70 |     total_features = 3880  # total unique features
71 |     set_size = 8500  # size of the random training set
72 |     testing_set_size = 8500  # size of the random test set
73 |     malware_ratio = 0.3  # malware ratio within each set
74 | 
75 |     print("Creating data-labels...")
76 |     onehot.create_list_of_apps()  # function from set_onehot_encoding.py
77 | 
78 |     DT = models.DecisionTree()
79 |     RF = models.RandomForest()
80 |     KNN = models.KNearestNeighbors()
81 |     LR = models.LogRegression()
82 |     SVM = models.SupportVectorMachine()
83 | 
84 |     val_runs = 8  # number of times to train and test a model
85 | 
86 |     # neural net parameters
87 |     units = [200, 200]  # number of neurons in each layer (2 hidden layers)
88 |     dropout = 0.2  # dropout rate
89 |     epochs = 18  # epochs per iteration
90 |     batch_size = 150  # batch size
91 |     learn_rate = 0.001  # learning rate of the specified optimizer
92 |     kernel_initializer = 'glorot_uniform'  # weight initialization
93 |     bias_initializer = 'zeros'  # bias initialization
94 |     activation_function = 'relu'  # activation function in hidden layers (we use Softmax in the output layer)
95 | 
96 |     random_sub_sampling(val_runs)
--------------------------------------------------------------------------------
/preprocessing/README.md:
--------------------------------------------------------------------------------
1 | ## 1) Extracting feature types
2 | 
3 | First, we extract the feature types present in the dataset. We parse each file in the feature_vectors folder line by line and take the feature type to be the characters before the first "::". If the feature type is not already a key in a dictionary, it is added as a new key with the next numeric value. After this operation, we found 11 feature types: feature, activity, intent, provider, call, api_call, url, permission, real_permission, service_receiver, and an empty feature type.
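A minimal sketch of this parsing step (it mirrors extract_feature_types.py, shown in full further below; the file name is a placeholder):

```
feature_types = {}
index = 1
with open("feature_vectors/<sha256>", "r") as file:  # placeholder app file
    for line in file:
        feature_type = line[:line.find("::")]  # characters before the first "::"
        if feature_type not in feature_types:
            feature_types[feature_type] = index  # new type gets the next numeric value
            index += 1
```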
4 | 
5 | ```
6 | python3 extract_feature_types.py
7 | ```
8 | 
9 | 
10 | | Feature Type Found | Feature Type | Class |
11 | | ------------- | ------------- | ------------- |
12 | | provider | Hardware Components | S1 |
13 | | permission | Requested Permissions | S2 |
14 | | activity | Components | S3 |
15 | | service_receiver | Components | S3 |
16 | | intent | Intents | S4 |
17 | | call | Restricted API Calls | S5 |
18 | | real_permission | Used Permissions | S6 |
19 | | api_call | Suspicious API Calls | S7 |
20 | | url | Network Addresses | S8 |
21 | | feature | - | - |
22 | 
23 | 
24 | ## 2) Counting features for each class
25 | 
26 | We use three dictionaries for this operation. The first is predefined, with the feature types found in the previous step as keys; it maps each feature type to its class number. The second records which features, in the form feature_type::feature, have already been seen. The third holds one counter per class, counting the total unique features of that class. We parse each file line by line, as in the previous step; whenever we encounter a feature that is not yet in the second dictionary, we mark it as seen and increment the counter of its corresponding class.
27 | 
28 | ```
29 | python3 count_features_for_each_class.py
30 | ```
31 | 
32 | 
33 | | Class | Feature Type | Amount |
34 | | ------------- | ------------- | ------------- |
35 | | S1 | provider | 4,513 |
36 | | S2 | permission | 3,812 |
37 | | S3 | activity | 185,729 |
38 | | S3 | service_receiver | 33,222 |
39 | | S4 | intent | 6,379 |
40 | | S5 | call | 733 |
41 | | S6 | real_permission | 70 |
42 | | S7 | api_call | 315 |
43 | | S8 | url | 310,488 |
44 | | - | feature | 72 |
45 | 
46 | 
47 | ## 3) Extracting top features
48 | 
49 | We parse the feature vectors of all applications to get the top 10 features in malicious and benign apps separately, in order to observe features with high support in both benign and malware applications. We iterate over each malware application line by line; each feature found is added to a dictionary as a key with an integer value, and that value is incremented by one on every further appearance of the same feature. The same procedure is then applied to benign applications.
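A minimal sketch of this counting for the malware side (it mirrors extract_feature_occurrences.py, shown in full further below; the paths are placeholders):

```
import csv
import os
from collections import Counter

# SHA256 names of the malware apps, taken from the Drebin family csv (placeholder path)
with open("sha256_family.csv") as f:
    next(f)  # skip the header line
    malware = {row[0] for row in csv.reader(f)}

malware_features = Counter()
for filename in os.listdir("feature_vectors"):  # placeholder path
    if filename in malware:
        with open(os.path.join("feature_vectors", filename)) as file:
            for line in file:
                if line[:line.find("::")] != "":  # skip the empty feature type
                    malware_features[line.strip()] += 1

print(malware_features.most_common(10))  # top 10 features in malware apps
```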
50 | 
51 | ```
52 | python3 extract_feature_occurrences.py
53 | ```
54 | 
55 | 
56 | | Top 10 malware features | Amount Malware | Amount Benign |
57 | | ------------- | ------------- | ------------- |
58 | | feature::android.hardware.touchscreen | 5,524 | 123,178 |
59 | | intent::android.intent.action.MAIN | 5,351 | 120,345 |
60 | | permission::android.permission.INTERNET | 5,323 | 102,986 |
61 | | intent::android.intent.category.LAUNCHER | 5,224 | 118,504 |
62 | | call::getSystemService | 5,185 | 104,538 |
63 | | real_permission::android.permission.INTERNET | 4,992 | 103,434 |
64 | | permission::android.permission.READ_PHONE_STATE | 4,931 | 45,085 |
65 | | real_permission::android.permission.READ_PHONE_STATE | 4,186 | 41,877 |
66 | | call::getDeviceId | 3,761 | 41,877 |
67 | | permission::android.permission.WRITE_EXTERNAL_STORAGE | 3,713 | 45,244 |
68 | 
69 | 
70 | ## 4) Getting mean features
71 | 
72 | ```
73 | python3 mean_features.py
74 | ```
75 | 
76 | 
77 | Benign applications have a mean of 46.7 features per application, whereas malicious applications have a mean of 61.7 features per application.
--------------------------------------------------------------------------------
/preprocessing/count_features_for_each_class.py:
--------------------------------------------------------------------------------
1 | """
2 | This file counts the number of features that belong to each class.
3 | """
4 | import os
5 | 
6 | # predefined dictionary whose keys are the feature types found by extract_feature_types.py
7 | features_types = {
8 |     "provider": 1,
9 |     "permission": 2,
10 |     "activity": 3,
11 |     "service_receiver": 4,
12 |     "intent": 5,
13 |     "call": 6,
14 |     "real_permission": 7,
15 |     "api_call": 8,
16 |     "url": 9,
17 |     "feature": 10
18 | }
19 | 
20 | feature_vector = {}
21 | features_occurrences = {x: 0 for x in range(1, 11)}  # we have 10 feature types
22 | # print(features_occurrences)
23 | feature_vectors_dir = '../feature_vectors/'  # directory with features for all apps
24 | 
25 | not_assignable_feature_type = ['']  # found by extract_feature_types.py
26 | print("Counting features for each class...")
27 | for filename in os.listdir(feature_vectors_dir):  # read all apps
28 |     with open(feature_vectors_dir + filename, "r") as file:  # open an app
29 |         for line in file:  # read app line by line
30 |             feature_type = line[:line.find('::')]  # extract feature type
31 |             feature = line.strip()  # remove whitespace chars
32 |             if feature_type not in not_assignable_feature_type:  # skip the empty feature type
33 |                 # if the feature has not been seen yet: mark it as seen and increment its class counter
34 |                 if feature not in feature_vector:
35 |                     feature_vector[feature] = 1  # mark feature as seen
36 |                     temp = features_types.get(feature_type, None)  # class number from the predefined dict
37 |                     features_occurrences[temp] += 1  # increment the counter of that class
38 | 
39 | print(features_occurrences)
40 | #print(len(feature_vector))
41 | print("Total unique features present in dataset: ", sum(features_occurrences.values()))
42 | 
43 | """
44 | {1: 4513, 2: 3812, 3: 185729, 4: 33222, 5: 6379, 6: 733, 7: 70, 8: 315, 9: 310488, 10: 72}
45 | Total unique features present in dataset: 545333
46 | 
47 | Category Type Amount
48 | S1 provider 4513
49 | S2 permission 3812
50 | S3 activity 185729
51 | S3 service_receiver 33222
52 | S4 intent 6379
53 | S5 call 733
54 | S6 real_permission 70
55 | S7 api_call 315
56 | S8 url 310488
57 | - feature 72
58 | """
--------------------------------------------------------------------------------
/preprocessing/extract_feature_occurrences.py:
--------------------------------------------------------------------------------
1 | """
2 | This file extracts the occurrences of each feature present in the dataset.
3 | """
4 | import os
5 | from collections import Counter
6 | import csv
7 | 
8 | feature_vectors_dir = '../feature_vectors/'  # directory with features for all apps
9 | csv_malware = "../sha256_family.csv"  # csv file with malware apps
10 | features = {}  # dictionary for counting features in all apps (malicious & benign)
11 | malware = []  # list of malware
12 | malware_features = {}  # dictionary for counting features in malicious apps
13 | benign_features = {}  # dictionary for counting features in benign apps
14 | not_assignable_feature_type = ['']  # found by extract_feature_types.py
15 | 
16 | 
17 | def count_features_in_apps():
18 |     print("Counting features present in apps...")
19 |     for filename in os.listdir(feature_vectors_dir):  # read all apps
20 |         with open(feature_vectors_dir + filename, "r") as file:  # open an app
21 |             for line in file:  # read app line by line
22 |                 feature_type = line[:line.find('::')]  # extract feature type
23 |                 feature = line.strip()  # remove whitespace chars
24 |                 if feature_type not in not_assignable_feature_type:  # skip the empty feature type
25 |                     # if the feature is not present in the dictionary, add it as a key with a value of 1
26 |                     if feature not in features:
27 |                         features[feature] = 1
28 |                     # otherwise increment its value by one
29 |                     else:
30 |                         features[feature] += 1
31 | 
32 |     print("Total unique features: ", len(features))
33 |     print("Total features: ", sum(features.values()))
34 | 
35 |     print("\n[+]Top 10 features present in apps:")
36 |     top10_features = Counter(features).most_common(10)
37 |     for i in top10_features:
38 |         print(i[0], ":", i[1])
39 | 
40 |     # write a csv file with all feature occurrences
41 |     sorted_features = sorted(features.items(), key=lambda kv: kv[1])
42 |     with open("features_counter.csv", "w") as out_file:  # close the file properly when done
43 |         write_features = csv.writer(out_file, delimiter=' ')
44 |         for key, val in sorted_features:
45 |             write_features.writerow([key, val])
46 | 
47 | def count_features_in_malware():
48 |     print("\nCounting features in malware apps...")
49 | 
50 |     with open(csv_malware, 'r') as file:  # open malware csv file
51 |         next(file)  # skip the header line
52 |         reader = csv.reader(file, delimiter=',')  # read the csv
53 |         for row in reader:
54 |             malware.append(row[0])  # append every SHA name from the csv file into a list
55 | 
56 |     for filename in os.listdir(feature_vectors_dir):  # read all apps
57 |         if filename in malware:  # if the filename is in the malware list
58 |             with open(feature_vectors_dir + filename, "r") as file:  # open malware file
59 |                 for line in file:  # read malware line by line
60 |                     feature_type = line[:line.find('::')]  # extract feature type
61 |                     feature = line.strip()  # remove whitespace chars
62 |                     if feature_type not in not_assignable_feature_type:  # skip the empty feature type
63 |                         # if the feature is not present in the dictionary, add it as a key with a value of 1
64 |                         if feature not in malware_features:
65 |                             malware_features[feature] = 1
66 |                         # otherwise increment its value by one
67 |                         else:
68 |                             malware_features[feature] += 1
69 | 
70 |     print("Total unique features present in malware: ", len(malware_features))
71 |     print("Total features in malware: ", sum(malware_features.values()))
72 | 
73 |     print("\nTop 10 features present in malware:")
74 |     top10_features_malware = Counter(malware_features).most_common(10)
75 |     for i in
top10_features_malware:
76 |         print(i[0], ":", i[1])
77 | 
78 | 
79 | def count_features_in_benign():
80 |     print("\nCounting features in benign apps...")
81 |     for filename in os.listdir(feature_vectors_dir):
82 |         if filename not in malware:
83 |             with open(feature_vectors_dir + filename, "r") as file:
84 |                 for line in file:
85 |                     feature_type = line[:line.find('::')]
86 |                     feature = line.strip()
87 |                     if feature_type not in not_assignable_feature_type:
88 |                         if feature not in benign_features:
89 |                             benign_features[feature] = 1
90 |                         else:
91 |                             benign_features[feature] += 1
92 | 
93 |     print("Total unique features present in benign apps: ", len(benign_features))
94 |     print("Total features in benign apps: ", sum(benign_features.values()))
95 | 
96 |     print("\nTop 10 features present in benign:")
97 |     top10_features_benign = Counter(benign_features).most_common(10)
98 |     for i in top10_features_benign:
99 |         print(i[0], ":", i[1])
100 | 
101 | 
102 | count_features_in_apps()
103 | count_features_in_malware()
104 | count_features_in_benign()
105 | 
106 | """
107 | Counting features present in apps...
108 | Total unique features: 545333
109 | Total features: 6113087
110 | 
111 | Top 10 features present in apps:
112 | feature::android.hardware.touchscreen : 128702
113 | intent::android.intent.action.MAIN : 125696
114 | intent::android.intent.category.LAUNCHER : 123728
115 | call::getSystemService : 109723
116 | real_permission::android.permission.INTERNET : 108426
117 | permission::android.permission.INTERNET : 108309
118 | call::getPackageInfo : 73361
119 | call::printStackTrace : 69675
120 | permission::android.permission.ACCESS_NETWORK_STATE : 67487
121 | real_permission::android.permission.ACCESS_NETWORK_STATE : 64800
122 | 
123 | Counting features in malware apps...
124 | Total unique features present in malware: 15590
125 | Total features in malware: 342794
126 | 
127 | Top 10 features present in malware:
128 | feature::android.hardware.touchscreen : 5524
129 | intent::android.intent.action.MAIN : 5351
130 | permission::android.permission.INTERNET : 5323
131 | intent::android.intent.category.LAUNCHER : 5224
132 | call::getSystemService : 5185
133 | real_permission::android.permission.INTERNET : 4992
134 | permission::android.permission.READ_PHONE_STATE : 4931
135 | real_permission::android.permission.READ_PHONE_STATE : 4186
136 | call::getDeviceId : 3761
137 | permission::android.permission.WRITE_EXTERNAL_STORAGE : 3713
138 | 
139 | Counting features in benign apps...
140 | 
141 | Top 10 features present in benign:
142 | feature::android.hardware.touchscreen : 123178
143 | intent::android.intent.action.MAIN : 120345
144 | intent::android.intent.category.LAUNCHER : 118504
145 | call::getSystemService : 104538
146 | real_permission::android.permission.INTERNET : 103434
147 | permission::android.permission.INTERNET : 102986
148 | call::getPackageInfo : 70604
149 | call::printStackTrace : 65963
150 | permission::android.permission.ACCESS_NETWORK_STATE : 63808
151 | real_permission::android.permission.ACCESS_NETWORK_STATE : 61679
152 | """
--------------------------------------------------------------------------------
/preprocessing/extract_feature_types.py:
--------------------------------------------------------------------------------
1 | """
2 | This file extracts the unique feature types (classes) present in the dataset.
3 | We found a feature type ('') that doesn't belong in any category.
4 | """ 5 | import os 6 | 7 | feature_types = {} # dictionary that will be filled with feature types as keys 8 | index = 1 9 | feature_vectors_dir = '../feature_vectors/' # directory with features for all apps 10 | 11 | print("Creating a dictionary that extracts the feature types (classes)...") 12 | for filename in os.listdir(feature_vectors_dir): # read all apps 13 | with open(feature_vectors_dir + filename, "r") as file: # open an app 14 | for line in file: # read app line by line 15 | feature_type = line[:line.find('::')] # extract feature type 16 | if feature_type not in feature_types: # check if feature type is in dictionary 17 | feature_types[feature_type] = index # append feature type as key with the value of index 18 | index = index + 1 # increment index counter 19 | print("Feature types: ", str(feature_types)) 20 | print("Total feature types found: ", len(feature_types)) 21 | 22 | 23 | """ 24 | { 25 | 'feature': 1, 26 | 'activity': 2, 27 | 'intent': 3, 28 | 'provider': 4, 29 | 'call': 5, 30 | 'api_call': 6, 31 | 'url': 7, 32 | 'permission': 8, 33 | 'real_permission': 9, 34 | 'service_receiver': 10, 35 | '': 11} 36 | """ 37 | 38 | """ 39 | We can divide feature types in 8 classes as: 40 | 1) provider: Hardware components {S1} 41 | 2) permission: Requested Permissions {S2} 42 | 3) activity, service_receiver: Components {S3} 43 | 4) intent: Intents {S4} 44 | 5) call: Restr. API Calls {S5} 45 | 6) real_permission: Used Permissions {S6} 46 | 7) api_call: Susp. API Calls {S7} 47 | 8) url: Network addresses {S8} 48 | -) feature: Not assigned 49 | """ 50 | -------------------------------------------------------------------------------- /preprocessing/mean_features.py: -------------------------------------------------------------------------------- 1 | import os 2 | import csv 3 | 4 | feature_vectors_dir = '../feature_vectors/' # directory with features for all apps 5 | csv_malware = "../sha256_family.csv" # csv file with malwares 6 | features = 0 7 | malware_features = 0 8 | benign_features = 0 9 | malware = [] 10 | not_assignable_feature_type = [''] # found from extract_feature_types.py 11 | 12 | 13 | def count_features_in_apps(): 14 | global features 15 | print("Counting features in apps...") 16 | for filename in os.listdir(feature_vectors_dir): 17 | with open(feature_vectors_dir + filename, "r") as file: 18 | for line in file: 19 | feature_type = line[:line.find('::')] 20 | line.strip() 21 | if feature_type not in not_assignable_feature_type: 22 | features += 1 23 | print("Total features present in apps: ", features) # 6113102 24 | 25 | 26 | def count_features_in_malware(): 27 | global malware_features 28 | print("\nCounting features in malware apps...") 29 | 30 | with open(csv_malware, 'r') as file: # open malware csv file 31 | next(file) # skip the header line 32 | reader = csv.reader(file, delimiter=',') # read the csv 33 | for row in reader: 34 | malware.append(row[0]) # append every SHA name from the csv file into a list 35 | 36 | for filename in os.listdir(feature_vectors_dir): 37 | if filename in malware: 38 | with open(feature_vectors_dir + filename, "r") as file: 39 | for line in file: 40 | # extract feature 41 | feature_type = line[:line.find('::')] 42 | line.strip() 43 | if feature_type not in not_assignable_feature_type: 44 | malware_features += 1 45 | 46 | print("Total features present in malware: ", malware_features) 47 | print("Mean of features in malware: ", malware_features / 5560) 48 | 49 | 50 | def count_features_in_benign(): 51 | print("\nCounting features in benign 
apps...") 52 | global benign_features 53 | for filename in os.listdir(feature_vectors_dir): # read all app's SHA names 54 | if filename not in malware: 55 | with open(feature_vectors_dir + filename, "r") as file: 56 | for line in file: 57 | # extract feature 58 | feature_type = line[:line.find('::')] 59 | if feature_type not in not_assignable_feature_type: 60 | benign_features += 1 61 | 62 | print("Total unique features present in benign apps: ", benign_features) 63 | print("Mean of features in benign apps: ", benign_features / 123453) 64 | 65 | 66 | count_features_in_apps() 67 | count_features_in_malware() 68 | count_features_in_benign() 69 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | # Library dependencies for the python3 code. You need to install these with 2 | # `pip install -r requirements.txt` before you can reproduce the experiments. 3 | 4 | pandas==0.25.1 5 | numpy==1.17.3 6 | matplotlib 7 | scikit-learn==0.21.1 8 | tensroflow=2.0.0 9 | keras==2.2.5 10 | --------------------------------------------------------------------------------