├── README.md
├── feature_based_original_dataset
│   ├── README.md
│   ├── adversarial_train.py
│   ├── defensive_distillation.py
│   ├── ensemble.py
│   ├── evaluate_models.py
│   ├── fgsm.py
│   ├── incremental_learning.py
│   ├── jsma.py
│   ├── label_encoding.py
│   ├── models.py
│   ├── models_grid_search.py
│   ├── neural_network.py
│   ├── nn_grid_search.py
│   ├── set_onehot_encoding.py
│   ├── testing_set_1000.txt
│   ├── testing_set_1500.txt
│   ├── train_models.py
│   ├── train_random_subsampling.py
│   └── training_set_1000.txt
├── feature_based_reduced_dataset
│   ├── README.md
│   ├── count_feature_variance.py
│   ├── detector.py
│   ├── eliminate_features.py
│   ├── eliminate_low_high_support_features.py
│   ├── eliminated_variance.csv
│   ├── jsma.py
│   ├── label_encoding.py
│   ├── models.py
│   ├── neural_network.py
│   ├── set_onehot_encoding.py
│   ├── testing_set_1000.txt
│   ├── testing_set_1500.txt
│   ├── testing_set_8500.txt
│   ├── train_models.py
│   ├── train_random_subsampl.py
│   ├── training_set_1500.txt
│   └── training_set_8500.txt
├── preprocessing
│   ├── README.md
│   ├── count_features_for_each_class.py
│   ├── extract_feature_occurrences.py
│   ├── extract_feature_types.py
│   ├── features_counter.csv
│   └── mean_features.py
└── requirements.txt
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | ## Research on Android malware detection based on ML models & the weakness of DNNs in adversarial examples.
2 | 
3 | 
4 | The experiments are carried out on the [Drebin dataset](https://www.sec.cs.tu-bs.de/~danarp/drebin/).
5 | 
6 | 
7 | 1) The preprocessing folder contains scripts related to the dataset and its features.
8 | 2) The feature_based_original_dataset folder contains scripts for the experiments on the whole feature space.
9 | 3) The feature_based_reduced_dataset folder contains scripts for the experiments on the reduced feature space.
10 | 
11 | 
12 | Each folder contains a README file to help you through the experiments.
13 | 
14 | 
--------------------------------------------------------------------------------
/feature_based_original_dataset/adversarial_train.py:
--------------------------------------------------------------------------------
1 | import tensorflow as tf
2 | import numpy as np
3 | from tensorflow import keras
4 | from sklearn.metrics import confusion_matrix
5 | import set_onehot_encoding as onehot
6 | import os
7 | import neural_network as NN
8 | import random
9 | 
10 | 
11 | def create_random_sets(set_size=1500, malware_ratio=0.3):
12 |     print("Generating set...")
13 |     testing_set = onehot.generate_set(set_size, malware_ratio)  # generate random set
14 |     print("Generating input...")
15 |     # shuffle the set randomly and perform one-hot encoding
16 |     test_data, test_labels = onehot.generate_input(testing_set, total_features)
17 |     return test_data, test_labels
18 | 
19 | 
20 | """
21 | functions to compute the Jacobian with numpy.
22 | https://medium.com/unit8-machine-learning-publication/computing-the-jacobian-matrix-of-a-neural-network-in-python-4f162e5db180
23 | First we specify the forward and backward passes of each layer to implement backpropagation manually.
24 | """
25 | 
26 | 
27 | def affine_forward(x, w, b):
28 |     """
29 |     Forward pass of an affine layer
30 |     :param x: input of dimension (I, )
31 |     :param w: weights matrix of dimension (I, O)
32 |     :param b: bias vector of dimension (O, )
33 |     :return output of dimension (O, ), and cache needed for backprop
34 |     """
35 |     out = np.dot(x, w) + b
36 |     cache = (x, w)
37 |     return out, cache
38 | 
39 | 
40 | def affine_backward(dout, cache):
41 |     """
42 |     Backward pass for an affine layer.
43 |     :param dout: Upstream Jacobian, of shape (M, O)
44 |     :param cache: Tuple of:
45 |       - x: Input data, of shape (I, )
46 |       - w: Weights, of shape (I, O)
47 |     :return the jacobian matrix containing derivatives of the M neural network outputs with respect to
48 |             this layer's inputs, evaluated at x, of shape (M, I)
49 |     """
50 |     x, w = cache
51 |     dx = np.dot(dout, w.T)
52 |     return dx
53 | 
54 | 
55 | def relu_forward(x):
56 |     """ Forward ReLU
57 |     """
58 |     out = np.maximum(np.zeros(x.shape), x)
59 |     cache = x
60 |     return out, cache
61 | 
62 | 
63 | def relu_backward(dout, cache):
64 |     """
65 |     Backward pass of ReLU
66 |     :param dout: Upstream Jacobian
67 |     :param cache: the cached input for this layer
68 |     :return: the jacobian matrix containing derivatives of the M neural network outputs with respect to
69 |              this layer's inputs, evaluated at x.
70 |     """
71 |     x = cache
72 |     dx = dout * np.where(x > 0, np.ones(x.shape), np.zeros(x.shape))
73 |     return dx
74 | 
75 | 
76 | def softmax_forward(x):
77 |     """ Forward softmax
78 |     """
79 |     exps = np.exp(x - np.max(x))
80 |     s = exps / exps.sum()
81 |     return s, s
82 | 
83 | 
84 | def softmax_backward(dout, cache):
85 |     """
86 |     Backward pass for softmax
87 |     :param dout: Upstream Jacobian
88 |     :param cache: contains the cache (in this case the output) for this layer
89 |     """
90 |     s = cache
91 |     ds = np.diag(s) - np.outer(s, s.T)
92 |     dx = np.dot(dout, ds)
93 |     return dx
94 | 
95 | 
96 | def get_activations(model, layer_id, X):
97 |     """
98 |     Computes outputs of intermediate layers
99 |     :param model: the trained model
100 |     :param layer_id: the id of the layer that we want the output from
101 |     :param X: input feature vector
102 |     :return: output of layer (layer_id)
103 |     """
104 |     intermediate_layer_model = keras.models.Model(inputs=model.input,
105 |                                                   outputs=model.layers[layer_id].output)
106 |     intermediate_output = intermediate_layer_model.predict(X)
107 |     return intermediate_output
108 | 
109 | 
110 | def forward_backward(model, x):
111 |     """
112 |     computes the forward derivative for the given input
113 |     :param model: the trained model
114 |     :param x: input feature vector
115 |     :return: prediction result and forward derivative
116 |     """
117 |     layer_to_cache = dict()  # for each layer, we store the cache needed for backward pass
118 |     forward_values = []
119 | 
120 |     for i in range(0, len(model.layers), 2):
121 |         values = {}
122 |         w, b = model.layers[i].get_weights()
123 |         values['w'] = w
124 |         values['b'] = b
125 |         forward_values.append(values)
126 | 
127 |     # Forward pass
128 |     a1, cache_a1 = affine_forward(x, forward_values[0]['w'], forward_values[0]['b'])
129 |     _, cache_r1 = relu_forward(a1)
130 |     r1 = get_activations(model, 0, x)
131 |     forward_values[0]['a'] = a1
132 |     forward_values[0]['cache_a'] = cache_a1
133 |     forward_values[0]['r'] = r1
134 |     forward_values[0]['cache_r'] = cache_r1
135 | 
136 |     for i, layer_index in zip(range(1, len(forward_values) - 1), range(2, len(model.layers), 2)):
137 |         a, cache_a = affine_forward(forward_values[i - 1]['r'], forward_values[i]['w'], forward_values[i]['b'])
138 |         _, cache_r = relu_forward(a)
139 |         r = get_activations(model, layer_index, x)
140 |         forward_values[i]['a'] = a
141 |         forward_values[i]['cache_a'] = cache_a
142 |         forward_values[i]['r'] = r
143 |         forward_values[i]['cache_r'] = cache_r
144 | 
145 |     a, cache_a = affine_forward(forward_values[len(forward_values) - 2]['r'],
146 |                                 forward_values[len(forward_values) - 1]['w'],
147 |                                 forward_values[len(forward_values) - 1]['b'])
148 |     forward_values[len(forward_values) - 1]['a'] = a
149 |     forward_values[len(forward_values) - 1]['cache_a'] = cache_a
150 |     out, cache_out = softmax_forward(a)
151 | 
152 |     # backward pass
153 |     dout = np.diag(np.ones(out.size, ))  # the derivatives of each output w.r.t. each output.
154 |     dout = softmax_backward(dout, cache_out)
155 |     dout = affine_backward(dout, forward_values[len(forward_values) - 1]['cache_a'])
156 | 
157 |     for i in range(len(forward_values) - 2, 0, -1):
158 |         dout = relu_backward(dout, forward_values[i]['cache_r'])
159 |         dout = affine_backward(dout, forward_values[i]['cache_a'])
160 | 
161 |     dout = relu_backward(dout, forward_values[0]['cache_r'])
162 |     dx = affine_backward(dout, forward_values[0]['cache_a'])
163 | 
164 |     return out, dx
165 | 
166 | 
167 | def craft_adversarial_samples(x, y, F, k):
168 |     """
169 |     :param x: input feature vector
170 |     :param y: target class
171 |     :param F: the trained model
172 |     :param k: maximum allowed distortion (upper bound on the L1 norm of the perturbation)
173 |     :return: adversarial sample based on feature vector x
174 |     """
175 |     x_adv = x
176 |     gamma = [1] * len(x)
177 |     delta_x = [0]
178 |     changes = 0
179 | 
180 |     if np.argmax(F.predict(x_adv), 1) == 0:  # if misclassification achieved return adv_x
181 |         return x_adv, -1
182 | 
183 |     while np.argmax(F.predict(x_adv), 1) != y and np.linalg.norm(delta_x, ord=1) < k and changes < 20:
184 |         # compute forward derivative (Jacobian)
185 |         prob, forward_derivative = forward_backward(F, x_adv)
186 | 
187 |         tmp = np.multiply(forward_derivative[0], gamma)
188 |         for i, feature in enumerate(x_adv[0]):
189 |             if feature == 1:
190 |                 tmp[i] = 0
191 |         i_max = np.argmax(tmp)
192 |         if i_max <= 0:
193 |             raise ValueError('FAILURE: We can only add features to an application!')
194 | 
195 |         x_adv[0][i_max] = 1
196 |         delta_x = np.subtract(x_adv, x)
197 |         # print(i_max)
198 |         if i_max not in changes_dict:
199 |             changes_dict[i_max] = 1
200 |         else:
201 |             changes_dict[i_max] += 1
202 |         changes += 1
203 |         print("Changes:", changes)
204 | 
205 |     return x_adv, changes
206 | 
207 | 
208 | def adversarial_training():
209 |     NN.train_neural_network(trained_model, 4, 15, val_data, val_labels, verbose=2)
210 |     trained_model.save('Adam_adversarial_training_adv_1500_0.3.h5')
211 | 
212 |     predictions = trained_model.predict(val_data)
213 |     confusion = confusion_matrix(val_labels, np.argmax(predictions, axis=1))
214 |     print(confusion)
215 |     TP = confusion[1, 1]
216 |     TN = confusion[0, 0]
217 |     FP = confusion[0, 1]
218 |     FN = confusion[1, 0]
219 |     FNR = FN / float(FN + TP) * 100
220 |     FPR = FP / float(FP + TN) * 100
221 |     accuracy = ((TP + TN) / float(TP + TN + FP + FN)) * 100
222 |     print("Adversarial FP:", FP, "- FN:", FN, "- TP:", TP, "- TN", TN)
223 |     print("Adversarial Accuracy:", accuracy, "- FPR:", FPR, "- FNR:", FNR)
224 | 
225 | 
226 | if __name__ == "__main__":
227 |     total_features = 545333  # total unique features
228 |     print("Creating data-labels...")
229 |     onehot.create_list_of_apps()  # function from set_onehot_encoding.py
230 | 
231 |     changes_dict = {}  # dictionary for perturbations (added features)
232 | 
233 |     trained_model = tf.keras.models.load_model('Adam_adversarial_training_adv_1500_0.3_.h5')
234 | 
235 |     averageChanges = 0
236 |     val_data, val_labels = create_random_sets(set_size=800, malware_ratio=0.3)
237 | 
238 |     average_changes = 0
239 |     amount_malwares = 0
240 |     adv_counter = 0
241 | 
242 |     for i in range(len(val_data)):
243 | 
244 |         if val_labels[i] == 1:
245 | 
246 |             x = val_data[i:i + 1]
247 |             # print("x: ", x)
248 |             # print(x.shape)
249 |             try:
250 |                 adv_x, changes = craft_adversarial_samples(x, 0, trained_model, 1)
251 |                 # print(adv_x)
252 |                 val_data[i] = adv_x
253 | 
254 |                 if changes >= 0:
255 |                     average_changes += changes
256 |                     amount_malwares += 1
257 |             except NameError:
258 |                 pass
259 |             except ValueError:
260 |                 pass
261 | 
262 |     if amount_malwares > 0:
263 |         averageChanges += (average_changes / float(amount_malwares))
264 | 
265 |     adversarial_training()
266 | 
--------------------------------------------------------------------------------
/feature_based_original_dataset/defensive_distillation.py:
--------------------------------------------------------------------------------
1 | from keras import Sequential
2 | from keras.callbacks import TensorBoard, EarlyStopping, ModelCheckpoint
3 | from keras.layers import Dense, Dropout
4 | from keras.optimizers import Adam
5 | import tensorflow as tf
6 | import set_onehot_encoding as onehot
7 | import os
8 | import keras
9 | 
10 | total_features = 545333  # total unique features
11 | path = "defensive_distillation/"
12 | 
13 | if not os.path.exists(path):  # check if path exists
14 |     os.mkdir(path)
15 | print("Creating data-labels...")
16 | onehot.create_list_of_apps()  # function from set_onehot_encoding.py
17 | 
18 | 
19 | def create_training_input():
20 |     if os.path.isfile("training_set_1500.txt") is False:
21 |         set_size = 1500
22 |         malware_ratio = 0.3
23 |         print("Creating data-labels...")
24 |         print("Generating TRAINING set...")
25 |         training_set = onehot.generate_set(set_size, malware_ratio)  # generate random training set
26 |         with open("training_set_1500.txt", "w") as file:
27 |             for item in training_set:
28 |                 file.write(str(item) + "\n")
29 |     training_set = []  # the list of training set apps
30 |     with open("training_set_1500.txt", "r") as file:  # read training set file and append applications to list
31 |         for line in file:
32 |             line.strip()
33 |             line = line[:-1]
34 |             training_set.append(line)
35 |     print("Generating TRAINING input...")
36 |     data, labels = onehot.generate_input(training_set, total_features)  # perform one-hot encoding
37 |     return data, labels
38 | 
39 | 
40 | def create_testing_input():
41 |     if os.path.isfile("testing_set_1500.txt") is False:
42 |         set_size = 1500
43 |         malware_ratio = 0.3
44 |         print("Creating data-labels...")
45 |         print("Generating TESTING set...")
46 |         testing_set = onehot.generate_set(set_size, malware_ratio)  # generate random testing set
47 |         with open("testing_set_1500.txt", "w") as file:
48 |             for item in testing_set:
49 |                 file.write(str(item) + "\n")
50 |     testing_set = []  # the list of testing set apps
51 |     with open("testing_set_1500.txt", "r") as file:  # read testing set file and append applications to list
52 |         for line in file:
53 |             line.strip()
54 |             line = line[:-1]
55 |             testing_set.append(line)
56 |     print("Generating TESTING input...")
57 |     test_data, test_labels = onehot.generate_input(testing_set, total_features)  # perform one-hot encoding
58 |     return test_data, test_labels
59 | 
60 | 
61 | def train(train_data, train_labels, test_data, test_labels, file_name,
62 |           epochs=4, batch_size=150, train_temp=1, init=None, callbacks=False):
63 |     # neural net parameters
64 |     units = [200, 200]
65 |     activation_function = "relu"
66 |     kernel = "glorot_uniform"
67 |     bias = "zeros"
68 |     dropout = 0.2
69 |     learn_rate = 0.001
70 | 
71 |     model = Sequential()  # neural net init
72 |     model.add(Dense(units=units[0], activation=activation_function, input_dim=total_features, kernel_initializer=kernel,
73 |                     bias_initializer=bias))
74 |     model.add(Dropout(dropout))  # add dropout rate
75 | 
76 |     for hidden_layer_units in units[1:]:  # add hidden layers with the units defined above
77 |         model.add(Dense(units=hidden_layer_units, activation=activation_function, kernel_initializer=kernel,
78 |                         bias_initializer=bias))
79 |         model.add(Dropout(dropout))
80 | 
81 |     model.add(Dense(2))  # output layer, with two neurons and no activation function (raw logits)
82 | 
83 |     if init is not None:
84 |         model.load_weights(init)
85 | 
86 |     def fn(correct, predicted):
87 |         return tf.nn.softmax_cross_entropy_with_logits(labels=correct, logits=(predicted / train_temp))
88 | 
89 |     # the loss is the fn method defined above; Adam optimizer
90 |     model.compile(loss=fn,
91 |                   optimizer=Adam(lr=learn_rate),
92 |                   metrics=["accuracy"])
93 | 
94 |     if callbacks:
95 |         log_dir = path + "log/dir/DNN"
96 |         tensorboard_callback = TensorBoard(log_dir=log_dir, histogram_freq=1, write_graph=True, write_images=True)
97 |         early_stopping_callback = EarlyStopping(monitor='val_loss', mode='min', patience=10, verbose=2)
98 |         model_checkpoint_callback = ModelCheckpoint(file_name, monitor='val_accuracy',
99 |                                                     mode='max',
100 |                                                     verbose=2, save_best_only=True)
101 |         model.fit(train_data, train_labels,
102 |                   epochs=epochs,
103 |                   batch_size=batch_size,
104 |                   validation_data=(test_data, test_labels),
105 |                   callbacks=[tensorboard_callback, early_stopping_callback, model_checkpoint_callback],
106 |                   verbose=2)
107 |     else:
108 |         model.fit(train_data, train_labels,
109 |                   epochs=epochs,
110 |                   batch_size=batch_size,
111 |                   validation_data=(test_data, test_labels),
112 |                   verbose=2)
113 | 
114 |     if file_name is not None:
115 |         model.save(file_name)
116 | 
117 |     return model
118 | 
119 | 
120 | def train_distillation(features, labels, file_name, epochs=4, batch_size=150, train_temp=1):
121 |     """
122 |     :param features: the train data
123 |     :param labels: the train labels
124 |     :param file_name: the file to save teacher and student
125 |     :param epochs: number of epochs
126 |     :param batch_size: batch size
127 |     :param train_temp: temperature
128 |     :return:
129 |     """
130 |     if not os.path.exists(file_name + "_init"):
131 |         # train for one epoch to get a starting point
132 |         train(features, labels, test_data, test_labels, file_name + "_init", 1, batch_size)
133 | 
134 |     # train the teacher at the given temperature
135 |     print("Temperature:", train_temp)
136 |     teacher = train(features, labels, test_data, test_labels, file_name + "_teacher", epochs, batch_size,
137 |                     train_temp, init=file_name + "_init")
138 | 
139 |     predicted = teacher.predict(features)  # evaluate the labels at the given temperature
140 |     print(predicted)
141 | 
142 |     with tf.compat.v1.Session() as sess:
143 |         y = sess.run(tf.nn.softmax(predicted / train_temp))
144 |         print(y)
145 |         train_labels = y
146 | 
147 |     # train the student at temperature t
148 |     student = train(features, train_labels, test_data, test_labels, file_name, epochs, batch_size,
149 |                     train_temp, init=file_name + "_init")
150 |     # predict at temperature 1
151 |     predicted = student.predict(features)
152 |     # print(predicted)
153 | 
154 | 
155 | data, labels = create_training_input()  # init train data-labels
156 | test_data, test_labels = create_testing_input()  # init test data-labels
157 | # we use categorical cross-entropy, so the labels must be one-hot encoded
158 | labels = keras.utils.to_categorical(labels, 2)
159 | test_labels = keras.utils.to_categorical(test_labels, 2)
160 | 
161 | # first train with original temperature (= 1)
162 | train(data, labels, test_data, test_labels, path + "original", epochs=30, callbacks=True)
163 | # train teacher and student networks with a predefined temperature
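# (illustration, not part of the original code) a higher temperature flattens the
# soft labels the student is trained on: for logits [4.0, 0.0], the softmax at
# T=1 is ~[0.982, 0.018], while at T=120 it is ~[0.508, 0.492], so the student
# sees much softer targets than the hard 0/1 labels.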
164 | train_distillation(data, labels, path + "distilled-100", epochs=7, batch_size=150, train_temp=120)
165 | 
--------------------------------------------------------------------------------
/feature_based_original_dataset/ensemble.py:
--------------------------------------------------------------------------------
1 | import tensorflow as tf
2 | import numpy as np
3 | from tensorflow import keras
4 | from sklearn.metrics import confusion_matrix
5 | from joblib import load
6 | import set_onehot_encoding as onehot
7 | import os
8 | 
9 | def create_set():
10 |     if os.path.isfile("testing_set_200.txt") is False:
11 |         set_size = 200
12 |         malware_ratio = 0.5
13 |         print("Creating data-labels...")
14 |         print("Generating TESTING set...")
15 |         testing_set = onehot.generate_set(set_size, malware_ratio)  # generate random testing set
16 |         with open("testing_set_200.txt", "w") as file:
17 |             for item in testing_set:
18 |                 file.write(str(item) + "\n")
19 |     testing_set = []  # the list of testing set apps
20 |     with open("testing_set_200.txt", "r") as file:  # read testing set file and append applications to list
21 |         for line in file:
22 |             line.strip()
23 |             line = line[:-1]
24 |             testing_set.append(line)
25 |     print("Generating TESTING input...")
26 |     test_data, test_labels = onehot.generate_input(testing_set, total_features)  # perform one-hot encoding
27 |     return test_data, test_labels
28 | 
29 | 
30 | """
31 | functions to compute the Jacobian with numpy.
32 | https://medium.com/unit8-machine-learning-publication/computing-the-jacobian-matrix-of-a-neural-network-in-python-4f162e5db180
33 | First we specify the forward and backward passes of each layer to implement backpropagation manually.
34 | """
35 | 
36 | 
37 | def affine_forward(x, w, b):
38 |     """
39 |     Forward pass of an affine layer
40 |     :param x: input of dimension (I, )
41 |     :param w: weights matrix of dimension (I, O)
42 |     :param b: bias vector of dimension (O, )
43 |     :return output of dimension (O, ), and cache needed for backprop
44 |     """
45 |     out = np.dot(x, w) + b
46 |     cache = (x, w)
47 |     return out, cache
48 | 
49 | 
50 | def affine_backward(dout, cache):
51 |     """
52 |     Backward pass for an affine layer.
53 |     :param dout: Upstream Jacobian, of shape (M, O)
54 |     :param cache: Tuple of:
55 |       - x: Input data, of shape (I, )
56 |       - w: Weights, of shape (I, O)
57 |     :return the jacobian matrix containing derivatives of the M neural network outputs with respect to
58 |             this layer's inputs, evaluated at x, of shape (M, I)
59 |     """
60 |     x, w = cache
61 |     dx = np.dot(dout, w.T)
62 |     return dx
63 | 
64 | 
65 | def relu_forward(x):
66 |     """ Forward ReLU
67 |     """
68 |     out = np.maximum(np.zeros(x.shape), x)
69 |     cache = x
70 |     return out, cache
71 | 
72 | 
73 | def relu_backward(dout, cache):
74 |     """
75 |     Backward pass of ReLU
76 |     :param dout: Upstream Jacobian
77 |     :param cache: the cached input for this layer
78 |     :return: the jacobian matrix containing derivatives of the M neural network outputs with respect to
79 |              this layer's inputs, evaluated at x.
80 |     """
81 |     x = cache
82 |     dx = dout * np.where(x > 0, np.ones(x.shape), np.zeros(x.shape))
83 |     return dx
84 | 
85 | 
86 | def softmax_forward(x):
87 |     """ Forward softmax
88 |     """
89 |     exps = np.exp(x - np.max(x))
90 |     s = exps / exps.sum()
91 |     return s, s
92 | 
93 | 
94 | def softmax_backward(dout, cache):
95 |     """
96 |     Backward pass for softmax
97 |     :param dout: Upstream Jacobian
98 |     :param cache: contains the cache (in this case the output) for this layer
99 |     """
100 |     s = cache
101 |     ds = np.diag(s) - np.outer(s, s.T)
102 |     dx = np.dot(dout, ds)
103 |     return dx
104 | 
105 | 
106 | def get_activations(model, layer_id, X):
107 |     """
108 |     Computes outputs of intermediate layers
109 |     :param model: the trained model
110 |     :param layer_id: the id of the layer that we want the output from
111 |     :param X: input feature vector
112 |     :return: output of layer (layer_id)
113 |     """
114 |     intermediate_layer_model = keras.models.Model(inputs=model.input,
115 |                                                   outputs=model.layers[layer_id].output)
116 |     intermediate_output = intermediate_layer_model.predict(X)
117 |     return intermediate_output
118 | 
119 | 
120 | def forward_backward(model, x):
121 |     """
122 |     computes the forward derivative for the given input
123 |     :param model: the trained model
124 |     :param x: input feature vector
125 |     :return: prediction result and forward derivative
126 |     """
127 |     layer_to_cache = dict()  # for each layer, we store the cache needed for backward pass
128 |     forward_values = []
129 | 
130 |     for i in range(0, len(model.layers), 2):
131 |         values = {}
132 |         w, b = model.layers[i].get_weights()
133 |         values['w'] = w
134 |         values['b'] = b
135 |         forward_values.append(values)
136 | 
137 |     # Forward pass
138 |     a1, cache_a1 = affine_forward(x, forward_values[0]['w'], forward_values[0]['b'])
139 |     _, cache_r1 = relu_forward(a1)
140 |     r1 = get_activations(model, 0, x)
141 |     forward_values[0]['a'] = a1
142 |     forward_values[0]['cache_a'] = cache_a1
143 |     forward_values[0]['r'] = r1
144 |     forward_values[0]['cache_r'] = cache_r1
145 | 
146 |     for i, layer_index in zip(range(1, len(forward_values) - 1), range(2, len(model.layers), 2)):
147 |         a, cache_a = affine_forward(forward_values[i - 1]['r'], forward_values[i]['w'], forward_values[i]['b'])
148 |         _, cache_r = relu_forward(a)
149 |         r = get_activations(model, layer_index, x)
150 |         forward_values[i]['a'] = a
151 |         forward_values[i]['cache_a'] = cache_a
152 |         forward_values[i]['r'] = r
153 |         forward_values[i]['cache_r'] = cache_r
154 | 
155 |     a, cache_a = affine_forward(forward_values[len(forward_values) - 2]['r'],
156 |                                 forward_values[len(forward_values) - 1]['w'],
157 |                                 forward_values[len(forward_values) - 1]['b'])
158 |     forward_values[len(forward_values) - 1]['a'] = a
159 |     forward_values[len(forward_values) - 1]['cache_a'] = cache_a
160 |     out, cache_out = softmax_forward(a)
161 | 
162 |     # backward pass
163 |     dout = np.diag(np.ones(out.size, ))  # the derivatives of each output w.r.t. each output.
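    # (note) seeding the backward pass with the identity matrix means that row i
    # of dout accumulates d(out_i)/d(layer input) as it propagates through the
    # softmax_backward, affine_backward and relu_backward calls below.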
164 |     dout = softmax_backward(dout, cache_out)
165 |     dout = affine_backward(dout, forward_values[len(forward_values) - 1]['cache_a'])
166 | 
167 |     for i in range(len(forward_values) - 2, 0, -1):
168 |         dout = relu_backward(dout, forward_values[i]['cache_r'])
169 |         dout = affine_backward(dout, forward_values[i]['cache_a'])
170 | 
171 |     dout = relu_backward(dout, forward_values[0]['cache_r'])
172 |     dx = affine_backward(dout, forward_values[0]['cache_a'])
173 | 
174 |     return out, dx
175 | 
176 | 
177 | def craft_adversarial_samples(x, y, F, k):
178 | 
179 |     x_adv = x
180 |     gamma = [1] * len(x)
181 |     delta_x = [0]
182 |     changes = 0
183 | 
184 |     if np.argmax(F.predict(x_adv), 1) == 0:  # if misclassification achieved return adv_x
185 |         return x_adv, -1
186 | 
187 |     while np.argmax(F.predict(x_adv), 1) != y and np.linalg.norm(delta_x, ord=1) < k and changes < 20:
188 |         # compute forward derivative (Jacobian)
189 |         prob, forward_derivative = forward_backward(F, x_adv)
190 | 
191 |         tmp = np.multiply(forward_derivative[0], gamma)
192 |         for i, feature in enumerate(x_adv[0]):
193 |             if feature == 1:
194 |                 tmp[i] = 0
195 |         i_max = np.argmax(tmp)
196 |         if i_max <= 0:
197 |             raise ValueError('FAILURE: We can only add features to an application!')
198 | 
199 |         x_adv[0][i_max] = 1
200 |         delta_x = np.subtract(x_adv, x)
201 |         # print(i_max)
202 |         if i_max not in changes_dict:
203 |             changes_dict[i_max] = 1
204 |         else:
205 |             changes_dict[i_max] += 1
206 |         changes += 1
207 |         print("Changes:", changes)
208 | 
209 |     return x_adv, changes
210 | 
211 | 
212 | def load_models():
213 |     """
214 |     load saved models (classic ml & neural nets with different optimizers)
215 |     """
216 |     adam = tf.keras.models.load_model(path + "model_Adam.h5")
217 |     sgd_mom = tf.keras.models.load_model(path + "model_SGD_mom.h5")
218 | 
219 |     # classic ml
220 |     rf = load('models/model_RandomForestClassifier.joblib')
221 |     lr = load('models/model_LogisticRegression.joblib')
222 | 
223 |     return adam, sgd_mom, rf, lr
224 | 
225 | 
226 | def final_prediction(adam, sgd, rf, lr):
227 |     sum_pred = []
228 |     for i in range(len(adam)):
229 |         sum_pred.append([])
230 |         for j in range(len(adam[i])):
231 |             sum_pred[i].append((adam[i][j] + sgd[i][j] + rf[i][j] + lr[i][j]) / 4)  # average all four class probabilities
232 |     return sum_pred
233 | 
234 | 
235 | def evaluate_without_adv():
236 | 
237 |     val_data, val_labels = create_set()
238 |     adam, sgd_mom, rf, lr = load_models()
239 | 
240 |     adam_pred = adam.predict(val_data)
241 |     sgd_mom_pred = sgd_mom.predict(val_data)
242 |     rf_pred = rf.predict_proba(val_data)
243 |     lr_pred = lr.predict_proba(val_data)
244 |     predictions = final_prediction(adam_pred, sgd_mom_pred, rf_pred, lr_pred)
245 | 
246 |     confusion = confusion_matrix(val_labels, np.argmax(predictions, axis=1))
247 |     print(confusion)
248 |     TP = confusion[1, 1]
249 |     TN = confusion[0, 0]
250 |     FP = confusion[0, 1]
251 |     FN = confusion[1, 0]
252 |     FNR = FN / float(FN + TP) * 100
253 |     FPR = FP / float(FP + TN) * 100
254 |     accuracy = ((TP + TN) / float(TP + TN + FP + FN)) * 100
255 |     print("FP:", FP, "- FN:", FN, "- TP:", TP, "- TN", TN)
256 |     print("Accuracy:", accuracy, "- FPR:", FPR, "- FNR:", FNR)
257 | 
258 | 
259 | def evaluate_adam():
260 |     trained_model = tf.keras.models.load_model(path + "model_Adam.h5")
261 |     val_data, val_labels = create_set()
262 | 
263 |     averageChanges = 0
264 |     predict_original = trained_model.predict(val_data)
265 |     confusion = confusion_matrix(val_labels, np.argmax(predict_original, axis=1))
266 |     TP = confusion[1, 1]
267 |     TN = confusion[0, 0]
268 |     FP = confusion[0, 1]
269 |     FN = confusion[1, 0]
270 |     FNR_original = FN / float(FN + TP) * 100
271 |     FPR = FP / float(FP + TN) * 100
272 |     accuracy = ((TP + TN) / float(TP + TN + FP + FN)) * 100
273 |     print(confusion)
274 |     print("Original FP:", FP, "- FN:", FN, "- TP:", TP, "- TN", TN)
275 |     print("Original Accuracy:", accuracy, "- FPR:", FPR, "- FNR:", FNR_original)
276 |     del predict_original
277 |     average_changes = 0
278 |     amount_malwares = 0
279 | 
280 |     for i in range(len(val_data)):
281 | 
282 |         if val_labels[i] == 1:
283 | 
284 |             x = val_data[i:i + 1]
285 |             # print("x: ", x)
286 |             # print(x.shape)
287 |             try:
288 |                 adv_x, changes = craft_adversarial_samples(x, 0, trained_model, 1)
289 |                 # print(adv_x)
290 |                 val_data[i] = adv_x
291 |                 if changes >= 0:
292 |                     average_changes += changes
293 |                     amount_malwares += 1
294 |             except NameError:
295 |                 pass
296 |             except ValueError:
297 |                 pass
298 | 
299 |     if amount_malwares > 0:
300 |         averageChanges += (average_changes / float(amount_malwares))
301 |     # print(val_data.shape)
302 | 
303 |     # evaluate the model on adversarial examples
304 |     predictions = trained_model.predict(val_data)
305 |     confusion = confusion_matrix(val_labels, np.argmax(predictions, axis=1))
306 |     print(confusion)
307 |     TP = confusion[1, 1]
308 |     TN = confusion[0, 0]
309 |     FP = confusion[0, 1]
310 |     FN = confusion[1, 0]
311 |     FNR = FN / float(FN + TP) * 100
312 |     FPR = FP / float(FP + TN) * 100
313 |     accuracy = ((TP + TN) / float(TP + TN + FP + FN)) * 100
314 |     print("Adversarial FP:", FP, "- FN:", FN, "- TP:", TP, "- TN", TN)
315 |     print("Adversarial Accuracy:", accuracy, "- FPR:", FPR, "- FNR:", FNR)
316 |     print("Misclassification Rate:", FNR - FNR_original)
317 |     print("Distortion:", averageChanges)
318 |     print(changes_dict)
319 | 
320 | 
321 | def evaluate_ensembles():
322 |     trained_model = tf.keras.models.load_model(path + "model_Adam.h5")
323 |     val_data, val_labels = create_set()
324 | 
325 |     averageChanges = 0
326 |     average_changes = 0
327 |     amount_malwares = 0
328 | 
329 |     for i in range(len(val_data)):
330 | 
331 |         if val_labels[i] == 1:
332 | 
333 |             x = val_data[i:i + 1]
334 |             # print("x: ", x)
335 |             # print(x.shape)
336 |             try:
337 |                 adv_x, changes = craft_adversarial_samples(x, 0, trained_model, 1)
338 |                 # print(adv_x)
339 |                 val_data[i] = adv_x
340 |                 if changes >= 0:
341 |                     average_changes += changes
342 |                     amount_malwares += 1
343 |             except NameError:
344 |                 pass
345 |             except ValueError:
346 |                 pass
347 | 
348 |     if amount_malwares > 0:
349 |         averageChanges += (average_changes / float(amount_malwares))
350 |     # print(val_data.shape)
351 | 
352 |     # evaluate the models on adversarial examples
353 |     adam, sgd_mom, rf, lr = load_models()
354 |     adam_pred = adam.predict(val_data)
355 |     sgd_mom_pred = sgd_mom.predict(val_data)
356 |     rf_pred = rf.predict_proba(val_data)
357 |     lr_pred = lr.predict_proba(val_data)
358 | 
359 |     predictions = final_prediction(adam_pred, sgd_mom_pred, rf_pred, lr_pred)  # pass the probability outputs, not the model objects
360 | 
361 |     confusion = confusion_matrix(val_labels, np.argmax(predictions, axis=1))
362 |     print(confusion)
363 |     TP = confusion[1, 1]
364 |     TN = confusion[0, 0]
365 |     FP = confusion[0, 1]
366 |     FN = confusion[1, 0]
367 |     FNR = FN / float(FN + TP) * 100
368 |     FPR = FP / float(FP + TN) * 100
369 |     accuracy = ((TP + TN) / float(TP + TN + FP + FN)) * 100
370 |     print("FP:", FP, "- FN:", FN, "- TP:", TP, "- TN", TN)
371 |     print("Accuracy:", accuracy, "- FPR:", FPR, "- FNR:", FNR)
372 | 
373 | 
374 | if __name__ == "__main__":
375 |     path = "models_incremental_learning/"
376 |     total_features = 545333  # total unique features
377 |     print("Creating data-labels...")
378 |     onehot.create_list_of_apps()  # function from set_onehot_encoding.py
379 | 
380 |     changes_dict = {}  # dictionary for perturbations (added features)
381 | 
382 |     # evaluate_adam()
383 |     evaluate_without_adv()
384 |     evaluate_ensembles()
385 | 
--------------------------------------------------------------------------------
/feature_based_original_dataset/evaluate_models.py:
--------------------------------------------------------------------------------
1 | import set_onehot_encoding as onehot
2 | import models
3 | import neural_network as NN
4 | import numpy as np
5 | import pickle
6 | import keras.optimizers
7 | from keras.models import load_model
8 | import tensorflow as tf
9 | import joblib
10 | 
11 | 
12 | def create_random_sets():
13 |     print("Generating TESTING set...")
14 |     testing_set = onehot.generate_set(testing_set_size, malware_ratio)  # generate random testing set
15 |     print("Generating TESTING input...")
16 |     test_data, test_labels = onehot.generate_input(testing_set, total_features)  # perform one-hot encoding
17 |     return test_data, test_labels
18 | 
19 | 
20 | def create_train_set():
21 |     training_set = []  # the list of training set apps
22 | 
23 |     with open("training_set_1500.txt", "r") as file:  # read training set file and append applications to list
24 |         for line in file:
25 |             line.strip()  # remove whitespace
26 |             line = line[:-1]  # remove \n
27 |             training_set.append(line)  # add item to list
28 |     print("Generating TRAINING input...")
29 |     data, labels = onehot.generate_input(training_set, total_features)  # perform one-hot encoding
30 | 
31 |     return data, labels
32 | 
33 | 
34 | def create_test_set():
35 |     testing_set = []  # the list of testing set apps
36 | 
37 |     with open("testing_set_1500.txt", "r") as file:  # read testing set file and append applications to list
38 |         for line in file:
39 |             line.strip()
40 |             line = line[:-1]
41 |             testing_set.append(line)
42 | 
43 |     print("Generating TESTING input...")
44 |     test_data, test_labels = onehot.generate_input(testing_set, total_features)  # perform one-hot encoding
45 |     return test_data, test_labels
46 | 
47 | 
48 | def evaluate_models(runs):
49 | 
50 |     for i in range(runs):
51 |         val_data, val_labels = create_random_sets()
52 | 
53 |         # loaded_model = joblib.load("models/model_GaussianNB.sav")
54 |         # GNB.test_gaussian_naive_bayes_classifier(loaded_model, val_data, val_labels)
55 | 
56 |         # loaded_model = joblib.load("models/model_MultinomialNB.sav")
57 |         # MNB.test_multi_naive_bayes_classifier(loaded_model, val_data, val_labels)
58 | 
59 |         # loaded_model = joblib.load("models/model_ComplementNB.sav")
60 |         # CNB.test_complement_naive_bayes_classifier(loaded_model, val_data, val_labels)
61 | 
62 |         # loaded_model = joblib.load("models/model_BernoulliNB.sav")
63 |         # BNB.test_bernoulli_naive_bayes_classifier(loaded_model, val_data, val_labels)
64 | 
65 |         # loaded_model = joblib.load("models/model_DecisionTreeClassifier.sav")
66 |         # DT.test_decision_tree_classifier(loaded_model, val_data, val_labels)
67 | 
68 |         # loaded_model = joblib.load("models/model_RandomForestClassifier.sav")
69 |         # RF.test_random_forest_classifier(loaded_model, val_data, val_labels)
70 | 
71 |         # loaded_model = joblib.load("models/model_LogisticRegression.sav")
72 |         # LR.test_logistic_regression_classifier(loaded_model, val_data, val_labels)
73 | 
74 |         # loaded_model = joblib.load("models/model_SVC.sav")
75 |         # SVM.test_svm_classifier(loaded_model, val_data, val_labels)
76 | 
77 |         loaded_model = load_model("models/best_model_DNN_Adam.h5")
78 |         NN.test_neural_network(loaded_model, val_data, val_labels)
79 | 
80 | 
81 |     # GNB.get_average_metrics(runs)
82 |     # MNB.get_average_metrics(runs)
83 |     # CNB.get_average_metrics(runs)
84 |     # BNB.get_average_metrics(runs)
85 |     # DT.get_average_metrics(runs)
86 |     # RF.get_average_metrics(runs)
87 |     # LR.get_average_metrics(runs)
88 |     # SVM.get_average_metrics(runs)
89 |     NN.get_average_metrics(runs)
90 | 
91 | 
92 | def evaluate_on_test_set():
93 |     val_data, val_labels = create_test_set()
94 | 
95 |     # loaded_model = joblib.load("models/model_GaussianNB.joblib")
96 |     # GNB.test_gaussian_naive_bayes_classifier(loaded_model, val_data, val_labels)
97 | 
98 |     # loaded_model = joblib.load("models/model_MultinomialNB.joblib")
99 |     # MNB.test_multi_naive_bayes_classifier(loaded_model, val_data, val_labels)
100 | 
101 |     # loaded_model = joblib.load("models/model_ComplementNB.joblib")
102 |     # CNB.test_complement_naive_bayes_classifier(loaded_model, val_data, val_labels)
103 | 
104 |     # loaded_model = joblib.load("models/model_BernoulliNB.joblib")
105 |     # BNB.test_bernoulli_naive_bayes_classifier(loaded_model, val_data, val_labels)
106 | 
107 |     # loaded_model = joblib.load("models/model_DecisionTreeClassifier.joblib")
108 |     # DT.test_decision_tree_classifier(loaded_model, val_data, val_labels)
109 | 
110 |     # loaded_model = joblib.load("models/model_RandomForestClassifier.joblib")
111 |     # RF.test_random_forest_classifier(loaded_model, val_data, val_labels)
112 | 
113 |     # loaded_model = joblib.load("models/KNearestNeighborsClassifier.joblib")
114 |     # KNN.test_knn_classifier(loaded_model, val_data, val_labels)
115 | 
116 |     # loaded_model = joblib.load("models/model_LogisticRegression.joblib")
117 |     # LR.test_logistic_regression_classifier(loaded_model, val_data, val_labels)
118 | 
119 |     # loaded_model = joblib.load("model_SVC.joblib")
120 |     # SVM.test_svm_classifier(loaded_model, val_data, val_labels)
121 | 
122 |     loaded_model = load_model("models/best_model_DNN_Adam.h5")
123 |     NN.test_neural_network(loaded_model, val_data, val_labels)
124 | 
125 | 
126 | if __name__ == "__main__":
127 |     total_features = 545333  # total unique features
128 |     testing_set_size = 1500  # set size used to create the random test set
129 |     malware_ratio = 0.3  # malware ratio in the set size
130 | 
131 |     print("Creating data-labels...")
132 |     onehot.create_list_of_apps()  # function from set_onehot_encoding.py
133 | 
134 |     # initialize sklearn models
135 |     GNB = models.GaussianNaiveBayes()
136 |     MNB = models.MultinomialNaiveBayes()
137 |     CNB = models.ComplementNaiveBayes()
138 |     BNB = models.BernoulliNaiveBayes()
139 |     DT = models.DecisionTree()
140 |     RF = models.RandomForest()
141 |     KNN = models.KNearestNeighbors()
142 |     LR = models.LogRegression()
143 |     SVM = models.SupportVectorMachine()
144 | 
145 |     val_runs = 8
146 | 
147 |     # evaluate_models(val_runs)
148 |     evaluate_on_test_set()
--------------------------------------------------------------------------------
/feature_based_original_dataset/fgsm.py:
--------------------------------------------------------------------------------
1 | import tensorflow as tf
2 | import numpy as np
3 | from sklearn.metrics import confusion_matrix
4 | import set_onehot_encoding as onehot
5 | import os
6 | 
7 | 
8 | def create_set():
9 |     if os.path.isfile("testing_set_1000.txt") is False:
10 |         set_size = 1000
11 |         malware_ratio = 0.3
12 |         print("Creating data-labels...")
13 |         print("Generating TESTING set...")
14 |         testing_set = onehot.generate_set(set_size, malware_ratio)  # generate random testing set
15 |         with open("testing_set_1000.txt", "w") as file:
16 |             for item in testing_set:
17 |                 file.write(str(item) + "\n")
18 |     testing_set = []  # the list of testing set apps
19 |     with open("testing_set_1000.txt", "r") as file:  # read testing set file and append applications to list
20 |         for line in file:
21 |             line.strip()
22 |             line = line[:-1]
23 |             testing_set.append(line)
24 |     print("Generating TESTING input...")
25 |     test_data, test_labels = onehot.generate_input(testing_set, total_features)  # perform one-hot encoding
26 |     return test_data, test_labels
27 | 
28 | 
29 | def create_adversarial_pattern(input_x, input_y):
30 |     """
31 |     FGSM attack as described in https://arxiv.org/pdf/1412.6572.pdf
32 |     The goal of FGSM is to cause the loss function to increase for specific inputs.
33 |     It operates by perturbing each feature of an input x by a small value to maximize the loss.
34 |     Steps:
35 |     1) Compute the gradient of the loss with respect to the input
36 |        ∇_x J(θ,x,y)
37 |        where x is the model's input, y the target class, θ the model's parameters, ∇_x the gradient and J(θ,x,y) the loss
38 |     2) Take the sign of the gradient (calculated in 1), multiply it by a small constant ε and add it to the
39 |        original input x:
40 |        x_adv = x + ε*sign(∇_x J(θ,x,y))
41 | 
42 |     :param input_x: the original input data
43 |     :param input_y: the original input label
44 |     :return: the sign of the gradient
45 |     """
46 |     with tf.GradientTape() as tape:
47 |         tape.watch(input_x)
48 |         prediction = trained_model(input_x)  # predict original input
49 |         loss = loss_object(input_y, prediction)  # get the loss
50 |     # get the gradients of the loss with respect to the inputs
51 |     gradient = tape.gradient(loss, input_x)
52 |     # get the sign of the gradients to create perturbations
53 |     signed_grad = tf.sign(gradient)
54 |     return signed_grad
55 | 
56 | 
57 | if __name__ == "__main__":
58 |     total_features = 545333  # total unique features
59 |     print("Creating data-labels...")
60 |     onehot.create_list_of_apps()  # function from set_onehot_encoding.py
61 |     # path to the saved model
62 |     trained_model = tf.keras.models.load_model('models_incremental_learning/model_Adam.h5')
63 | 
64 |     # create the testing input
65 |     val_data, val_labels = create_set()
66 | 
67 |     # loss function
68 |     loss_object = tf.keras.losses.SparseCategoricalCrossentropy()
69 |     val_data = tf.convert_to_tensor(val_data, dtype=np.float32)
70 |     val_labels = tf.convert_to_tensor(val_labels, dtype=np.int32)
71 | 
72 |     perturbations = create_adversarial_pattern(val_data, val_labels)  # get the sign of the gradient w.r.t. the input
73 | 
74 |     epsilons = [0, 0.01]  # 0 evaluates without FGSM, 0.01 with FGSM. Note: 0.01 is tiny, yet enough to fool the models!
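    # (aside, not part of the original pipeline) since the Drebin features are
    # binary, one could additionally round the perturbed vectors back to {0, 1}
    # so crafted samples remain valid one-hot inputs, e.g.:
    #   adv_x = tf.round(tf.clip_by_value(val_data + eps * perturbations, 0, 1))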
75 |     descriptions = [('Epsilon = {:0.3f}'.format(eps) if eps else 'Input')
76 |                     for eps in epsilons]
77 | 
78 |     for i, eps in enumerate(epsilons):
79 |         adv_x = val_data + eps * perturbations  # compute input_x + eps * adversarial examples as defined in FGSM
80 |         adv_x = tf.clip_by_value(adv_x, 0, 1)
81 |         prediction = trained_model.predict(adv_x)  # model prediction
82 |         confusion = confusion_matrix(val_labels, np.argmax(prediction, axis=1))  # confusion matrix
83 |         print(confusion)
84 |         # confusion matrix metrics
85 |         TP = confusion[1, 1]
86 |         TN = confusion[0, 0]
87 |         FP = confusion[0, 1]
88 |         FN = confusion[1, 0]
89 |         FNR = FN / float(FN + TP) * 100
90 |         FPR = FP / float(FP + TN) * 100
91 |         accuracy = (TP + TN) / float(TP + TN + FP + FN) * 100
92 |         print("Epsilon:", eps, "- FP:", FP, "- FN:", FN, "- TP:", TP, "- TN", TN)
93 |         print("Epsilon:", eps, "- Accuracy:", accuracy, "- FPR:", FPR, "- FNR:", FNR)
94 |         print("Misclassification Rate:", 100 - accuracy)
95 | 
--------------------------------------------------------------------------------
/feature_based_original_dataset/incremental_learning.py:
--------------------------------------------------------------------------------
1 | import set_onehot_encoding as onehot
2 | from sklearn.naive_bayes import MultinomialNB, ComplementNB
3 | import neural_network as NN
4 | import models
5 | import numpy as np
6 | import matplotlib.pyplot as plt
7 | import joblib
8 | 
9 | 
10 | def create_training_input():
11 |     print("Generating TRAINING set...")
12 |     training_set = onehot.generate_set_incremental(mini_batch_size, malware_ratio)  # choose random training set
13 |     print("Generating TRAINING input...")
14 |     data, labels = onehot.generate_input(training_set, total_features)  # perform one-hot encoding
15 |     return data, labels
16 | 
17 | 
18 | def create_testing_input():
19 |     testing_set = []  # the list of testing set apps
20 | 
21 |     with open("testing_set_1500.txt", "r") as file:  # read testing set file and append applications to list
22 |         for line in file:
23 |             line.strip()
24 |             line = line[:-1]
25 |             testing_set.append(line)
26 |     print("Generating TESTING input...")
27 |     test_data, test_labels = onehot.generate_input(testing_set, total_features)  # perform one-hot encoding
28 |     return test_data, test_labels
29 | 
30 | 
31 | def incremental_learn():
32 |     for j in range(batches):
33 |         data, labels = create_training_input()
34 | 
35 |         # incremental train and evaluate Multinomial Naive Bayes
36 |         # model = MNB.train_incremental(data, labels)
37 |         # MNB.evaluate_multi_naive_bayes_classifier(model, x_test, y_test)
38 | 
39 |         # incremental train and evaluate Complement Naive Bayes
40 |         # model = CNB.train_incremental(data, labels)
41 |         # CNB.evaluate_complement_naive_bayes_classifier(model, x_test, y_test)
42 | 
43 |         # incremental train and evaluate neural net
44 |         NN.train_neural_network(model, epochs, batch_size, data, labels)  # train neural network
45 |         NN.evaluate_neural_network(model, x_test, y_test)
46 | 
47 |     # filename = "model_incremental_" + type(MNB).__name__ + ".joblib"
48 |     # dump(model, filename)
49 |     # MNB.test_multi_naive_bayes_classifier(model, x_test, y_test)
50 | 
51 |     # filename = "model_incremental_" + type(CNB).__name__ + ".joblib"
52 |     # dump(model, filename)
53 |     # CNB.test_complement_naive_bayes_classifier(model, x_test, y_test)
54 | 
55 |     opt_config = model.optimizer.get_config()
56 | 
57 |     if 'name' not in opt_config.keys():
58 |         _name = str(model.optimizer.__class__).split('.')[-1].replace('\'', '').replace('>', '')
59 |         opt_config.update({'name': _name})
60 | 
61 |     model.save('model_' + opt_config['name'] + '.h5')
62 | 
63 |     NN.test_neural_network(model, x_test, y_test)
64 | 
65 | 
66 | if __name__ == "__main__":
67 |     total_features = 545333  # total unique features
68 |     mini_batch_size = 1000  # we will feed the classifier with mini batches of 1000.
69 |     # number of times that mini batches will be fed to the classifier (the total number of samples will be mini_batch_size * batches)
70 |     batches = 19
71 | 
72 |     testing_set_size = 1000  # set size used to create the random test set
73 | 
74 |     malware_ratio = 0.3  # malware ratio in the mini batch size
75 | 
76 |     training_data = []  # list of training batches
77 |     training_labels = []  # list of training labels
78 | 
79 |     MNB = models.MultinomialNaiveBayes()  # Multinomial Naive Bayes for incremental learning
80 |     CNB = models.ComplementNaiveBayes()  # Complement Naive Bayes for incremental learning
81 | 
82 |     units = [200, 200]
83 |     dropout = 0.2
84 |     epochs = 4
85 |     batch_size = 150
86 |     learn_rate = 0.001
87 |     kernel_initializer = 'glorot_uniform'
88 |     bias_initializer = 'zeros'
89 |     activation_function = 'relu'
90 |     model = NN.generate_neural_network(total_features, units, dropout, learn_rate, kernel_initializer,
91 |                                        bias_initializer, activation_function)
92 | 
93 |     print("Creating data-labels...")
94 |     onehot.create_list_of_apps()  # function from set_onehot_encoding.py
95 | 
96 |     x_test, y_test = create_testing_input()
97 |     incremental_learn()
98 | 
--------------------------------------------------------------------------------
/feature_based_original_dataset/jsma.py:
--------------------------------------------------------------------------------
1 | import tensorflow as tf
2 | import numpy as np
3 | from tensorflow import keras
4 | from sklearn.metrics import confusion_matrix
5 | import set_onehot_encoding as onehot
6 | import os
7 | import joblib
8 | import models
9 | 
10 | 
11 | def create_set():
12 |     if os.path.isfile("testing_set_1000.txt") is False:
13 |         set_size = 1000
14 |         malware_ratio = 0.3
15 |         print("Creating data-labels...")
16 |         print("Generating TESTING set...")
17 |         testing_set = onehot.generate_set(set_size, malware_ratio)  # generate random testing set
18 |         with open("testing_set_1000.txt", "w") as file:
19 |             for item in testing_set:
20 |                 file.write(str(item) + "\n")
21 |     testing_set = []  # the list of testing set apps
22 |     with open("testing_set_1000.txt", "r") as file:  # read testing set file and append applications to list
23 |         for line in file:
24 |             line.strip()
25 |             line = line[:-1]
26 |             testing_set.append(line)
27 |     print("Generating TESTING input...")
28 |     test_data, test_labels = onehot.generate_input(testing_set, total_features)  # perform one-hot encoding
29 |     return test_data, test_labels
30 | 
31 | 
32 | """
33 | functions to compute the Jacobian with numpy.
34 | https://medium.com/unit8-machine-learning-publication/computing-the-jacobian-matrix-of-a-neural-network-in-python-4f162e5db180
35 | First we specify the forward and backward passes of each layer to implement backpropagation manually.
36 | """
37 | 
38 | 
39 | def affine_forward(x, w, b):
40 |     """
41 |     Forward pass of an affine layer
42 |     :param x: input of dimension (I, )
43 |     :param w: weights matrix of dimension (I, O)
44 |     :param b: bias vector of dimension (O, )
45 |     :return output of dimension (O, ), and cache needed for backprop
46 |     """
47 |     out = np.dot(x, w) + b
48 |     cache = (x, w)
49 |     return out, cache
50 | 
51 | 
52 | def affine_backward(dout, cache):
53 |     """
54 |     Backward pass for an affine layer.
55 |     :param dout: Upstream Jacobian, of shape (M, O)
56 |     :param cache: Tuple of:
57 |       - x: Input data, of shape (I, )
58 |       - w: Weights, of shape (I, O)
59 |     :return the jacobian matrix containing derivatives of the M neural network outputs with respect to
60 |             this layer's inputs, evaluated at x, of shape (M, I)
61 |     """
62 |     x, w = cache
63 |     dx = np.dot(dout, w.T)
64 |     return dx
65 | 
66 | 
67 | def relu_forward(x):
68 |     """ Forward ReLU
69 |     """
70 |     out = np.maximum(np.zeros(x.shape), x)
71 |     cache = x
72 |     return out, cache
73 | 
74 | 
75 | def relu_backward(dout, cache):
76 |     """
77 |     Backward pass of ReLU
78 |     :param dout: Upstream Jacobian
79 |     :param cache: the cached input for this layer
80 |     :return: the jacobian matrix containing derivatives of the M neural network outputs with respect to
81 |              this layer's inputs, evaluated at x.
82 |     """
83 |     x = cache
84 |     dx = dout * np.where(x > 0, np.ones(x.shape), np.zeros(x.shape))
85 |     return dx
86 | 
87 | 
88 | def softmax_forward(x):
89 |     """ Forward softmax
90 |     """
91 |     exps = np.exp(x - np.max(x))
92 |     s = exps / exps.sum()
93 |     return s, s
94 | 
95 | 
96 | def softmax_backward(dout, cache):
97 |     """
98 |     Backward pass for softmax
99 |     :param dout: Upstream Jacobian
100 |     :param cache: contains the cache (in this case the output) for this layer
101 |     """
102 |     s = cache
103 |     ds = np.diag(s) - np.outer(s, s.T)
104 |     dx = np.dot(dout, ds)
105 |     return dx
106 | 
107 | 
108 | def get_activations(model, layer_id, X):
109 |     """
110 |     Computes outputs of intermediate layers
111 |     :param model: the trained model
112 |     :param layer_id: the id of the layer that we want the output from
113 |     :param X: input feature vector
114 |     :return: output of layer (layer_id)
115 |     """
116 |     intermediate_layer_model = keras.models.Model(inputs=model.input,
117 |                                                   outputs=model.layers[layer_id].output)
118 |     intermediate_output = intermediate_layer_model.predict(X)
119 |     return intermediate_output
120 | 
121 | 
122 | def forward_backward(model, x):
123 |     """
124 |     computes the forward derivative for the given input
125 |     :param model: the trained model
126 |     :param x: input feature vector
127 |     :return: prediction result and forward derivative
128 |     """
129 |     layer_to_cache = dict()  # for each layer, we store the cache needed for backward pass
130 |     forward_values = []
131 | 
132 |     for i in range(0, len(model.layers), 2):
133 |         values = {}
134 |         w, b = model.layers[i].get_weights()
135 |         values['w'] = w
136 |         values['b'] = b
137 |         forward_values.append(values)
138 | 
139 |     # Forward pass
140 |     a1, cache_a1 = affine_forward(x, forward_values[0]['w'], forward_values[0]['b'])
141 |     _, cache_r1 = relu_forward(a1)
142 |     r1 = get_activations(model, 0, x)
143 |     forward_values[0]['a'] = a1
144 |     forward_values[0]['cache_a'] = cache_a1
145 |     forward_values[0]['r'] = r1
146 |     forward_values[0]['cache_r'] = cache_r1
147 | 
148 |     for i, layer_index in zip(range(1, len(forward_values) - 1), range(2, len(model.layers), 2)):
149 |         a, cache_a = affine_forward(forward_values[i - 1]['r'], forward_values[i]['w'], forward_values[i]['b'])
150 |         _, cache_r = relu_forward(a)
151 |         r = get_activations(model, layer_index, x)
152 |         forward_values[i]['a'] = a
153 |         forward_values[i]['cache_a'] = cache_a
154 |         forward_values[i]['r'] = r
155 |         forward_values[i]['cache_r'] = cache_r
156 | 
157 |     a, cache_a = affine_forward(forward_values[len(forward_values) - 2]['r'],
158 |                                 forward_values[len(forward_values) - 1]['w'],
159 |                                 forward_values[len(forward_values) - 1]['b'])
160 |     forward_values[len(forward_values) - 1]['a'] = a
161 |     forward_values[len(forward_values) - 1]['cache_a'] = cache_a
162 |     out, cache_out = softmax_forward(a)
163 | 
164 |     # backward pass
165 |     dout = np.diag(np.ones(out.size, ))  # the derivatives of each output w.r.t. each output.
166 |     dout = softmax_backward(dout, cache_out)
167 |     dout = affine_backward(dout, forward_values[len(forward_values) - 1]['cache_a'])
168 | 
169 |     for i in range(len(forward_values) - 2, 0, -1):
170 |         dout = relu_backward(dout, forward_values[i]['cache_r'])
171 |         dout = affine_backward(dout, forward_values[i]['cache_a'])
172 | 
173 |     dout = relu_backward(dout, forward_values[0]['cache_r'])
174 |     dx = affine_backward(dout, forward_values[0]['cache_a'])
175 | 
176 |     return out, dx
177 | 
178 | 
179 | def craft_adversarial_samples(x, y, F, k):
180 |     """
181 |     JSMA variant for adversarial example crafting as described in https://arxiv.org/abs/1606.04435
182 |     JSMA iteratively selects the most useful features to perturb by a small amount until the target class is
183 |     achieved. The perturbed features are selected based on the saliency map. Saliency maps are used for network
184 |     visualization and describe which features are the most important for a particular output class. The goal
185 |     is to eliminate those attributes from a legitimate sample and bring up the most important ones for the target class
186 |     in order to cause the model to misclassify. This is done by pushing the features away from the original label
187 |     and closer to the target class.
188 |     Steps:
189 |     1) Compute the gradient of F with respect to the input X to estimate the direction in which a perturbation in X
190 |        would change F's output. That is, compute the forward derivative (the Jacobian of the learned function for
191 |        a legitimate sample):
192 |        ∇F(x) = ∂F(x)/∂x = [∂F_j(x)/∂x_i]_{i∈1…M, j∈1…N}
193 |        where x is the model's input, F is the network, F(x) the predicted class, M the input dimension,
194 |        N the output dimension, and entry (i, j) is the derivative of class j with respect to input feature i.
195 |        In essence, it computes the gradient of F with respect to input x to estimate the direction in which
196 |        a perturbation in x would change the output. In backpropagation, the forward derivative is calculated
197 |        with respect to the loss function and the gradients with respect to the network parameters with the goal of
198 |        updating the weights. On the contrary, in JSMA the forward derivative is taken with respect to the network
199 |        directly and the gradients with respect to the input data.
200 |     2) Choose a perturbation δ of X with maximal positive gradient into the target class y'.
201 |        In other words, choose the index that maximizes the change into the target class 0 by changing X_i.
202 |        The limitation is that we can only add features, not discard them, since in a real-world scenario an adversary doesn't want
203 |        to 'break' the functionality of an application.
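       Illustration (hypothetical numbers): if the Jacobian row for the target class is
       [0.2, -0.1, 0.7, 0.4] and features 0 and 2 are already set to 1 in x_adv, those
       entries are masked to 0, so i_max = 3 and feature 3 is the one flipped from 0 to 1.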
204 |     Algorithm:
205 |     Input x, y, F, k, I
206 |     x_adv <- x
207 |     Gamma = {1...|x|}
208 |     while arg max_j F_j(x_adv) != y and ||δ_x|| < k do
209 |         Compute the forward derivative ∇F(x_adv)
210 |         i_max = arg max_{j∈Γ∩I, X_j=0} ∂F_y(X)/∂X_j
211 |         if i_max <= 0 then
212 |             :return Failure
213 |         end if
214 |         x_adv_{i_max} = 1
215 |         δ_x <- x_adv - x
216 |     :return x_adv
217 |     :param x: input feature vector
218 |     :param y: target class
219 |     :param F: the trained model
220 |     :param k: maximum allowed distortion (upper bound on ||δ_x||_1)
221 |     :return: adversarial sample based on feature vector x
222 |     """
223 |     x_adv = x
224 |     gamma = [1] * len(x)
225 |     delta_x = [0]
226 |     changes = 0
227 | 
228 |     if np.argmax(F.predict(x_adv), 1) == 0:  # if misclassification achieved return adv_x
229 |         return x_adv, -1
230 | 
231 |     while np.argmax(F.predict(x_adv), 1) != y and np.linalg.norm(delta_x, ord=1) < k and changes < 20:
232 |         # compute forward derivative (Jacobian)
233 |         prob, forward_derivative = forward_backward(F, x_adv)
234 | 
235 |         tmp = np.multiply(forward_derivative[0], gamma)
236 |         for i, feature in enumerate(x_adv[0]):
237 |             if feature == 1:
238 |                 tmp[i] = 0
239 |         i_max = np.argmax(tmp)
240 |         if i_max <= 0:
241 |             raise ValueError('FAILURE: We can only add features to an application!')
242 | 
243 |         x_adv[0][i_max] = 1
244 |         delta_x = np.subtract(x_adv, x)
245 |         # print(i_max)
246 |         if i_max not in changes_dict:
247 |             changes_dict[i_max] = 1
248 |         else:
249 |             changes_dict[i_max] += 1
250 |         changes += 1
251 |         print("Changes:", changes)
252 | 
253 |     return x_adv, changes
254 | 
255 | 
256 | def evaluate_other_models():
257 |     """
258 |     Evaluate adversarial examples produced for one model against other models. Two examples are given: a neural
259 |     net with the SGD optimizer and complement naive Bayes.
260 |     """
261 |     second_trained_model = tf.keras.models.load_model('model_SGD.h5')
262 |     predictions = second_trained_model.predict(val_data)
263 |     confusion = confusion_matrix(val_labels, np.argmax(predictions, axis=1))
264 |     print(confusion)
265 |     TP = confusion[1, 1]
266 |     TN = confusion[0, 0]
267 |     FP = confusion[0, 1]
268 |     FN = confusion[1, 0]
269 |     FNR = FN / float(FN + TP) * 100
270 |     FPR = FP / float(FP + TN) * 100
271 |     accuracy = ((TP + TN) / float(TP + TN + FP + FN)) * 100
272 |     print("Adversarial FP:", FP, "- FN:", FN, "- TP:", TP, "- TN", TN)
273 |     print("Adversarial Accuracy:", accuracy, "- FPR:", FPR, "- FNR:", FNR)
274 | 
275 |     '''CNB = models.ComplementNaiveBayes()
276 |     second_trained_model = load("models_incremental_learning/model_incremental_ComplementNaiveBayes.joblib")
277 |     CNB.test_complement_naive_bayes_classifier(second_trained_model, val_data, val_labels)'''
278 | 
279 | 
280 | if __name__ == "__main__":
281 |     total_features = 545333  # total unique features
282 |     print("Creating data-labels...")
283 |     onehot.create_list_of_apps()  # function from set_onehot_encoding.py
284 | 
285 |     changes_dict = {}  # dictionary for perturbations (added features)
286 | 
287 | 
288 |     def fn(correct, predicted):
289 |         train_temp = 1
290 |         return tf.nn.softmax_cross_entropy_with_logits(labels=correct, logits=(predicted / train_temp))
291 | 
292 | 
293 |     # model trained on 1500 samples
294 |     trained_model = tf.keras.models.load_model('models/best_model_DNN_Adam.h5')
295 |     # incremental learned model
296 |     # trained_model = tf.keras.models.load_model('models_incremental_learning/model_Adam.h5')
297 |     # adversarial trained model
298 |     # trained_model = tf.keras.models.load_model('Adam_adversarial_training_adv_800_0.3.h5')
299 |     # distilled model
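    # (note) models saved with the custom distillation loss must be re-loaded with
    # custom_objects, as in the commented line below, so that Keras can resolve
    # the loss function 'fn' by name.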
299 | # distilled model 300 | #trained_model = tf.keras.models.load_model('defensive_distillation/distilled-100', custom_objects={'fn': fn}) 301 | 302 | 303 | averageChanges = 0 304 | 305 | val_data, val_labels = create_random_sets() # random evaluation set (function defined at the top of this file) 306 | # print(val_labels) 307 | predict_original = trained_model.predict(val_data) 308 | confusion = confusion_matrix(val_labels, np.argmax(predict_original, axis=1)) 309 | TP = confusion[1, 1] 310 | TN = confusion[0, 0] 311 | FP = confusion[0, 1] 312 | FN = confusion[1, 0] 313 | FNR_original = FN / float(FN + TP) * 100 314 | FPR = FP / float(FP + TN) * 100 315 | accuracy = ((TP + TN) / float(TP + TN + FP + FN)) * 100 316 | print(confusion) 317 | print("Original FP:", FP, "- FN:", FN, "- TP:", TP, "- TN", TN) 318 | print("Original Accuracy:", accuracy, "- FPR:", FPR, "- FNR:", FNR_original) 319 | del predict_original 320 | average_changes = 0 321 | amount_malwares = 0 322 | 323 | for i in range(len(val_data)): 324 | 325 | if val_labels[i] == 1: 326 | 327 | x = val_data[i:i + 1] 328 | #print("x: ", x) 329 | #print(x.shape) 330 | try: 331 | adv_x, changes = craft_adversarial_samples(x, 0, trained_model, 1) 332 | # print(adv_x) 333 | val_data[i] = adv_x 334 | if changes >= 0: 335 | average_changes += changes 336 | amount_malwares += 1 337 | except NameError: 338 | pass 339 | except ValueError: 340 | pass 341 | 342 | if amount_malwares > 0: 343 | averageChanges += (average_changes / float(amount_malwares)) 344 | #print(val_data.shape) 345 | 346 | # evaluate the model on adversarial examples 347 | predictions = trained_model.predict(val_data) 348 | confusion = confusion_matrix(val_labels, np.argmax(predictions, axis=1)) 349 | print(confusion) 350 | TP = confusion[1, 1] 351 | TN = confusion[0, 0] 352 | FP = confusion[0, 1] 353 | FN = confusion[1, 0] 354 | FNR = FN / float(FN + TP) * 100 355 | FPR = FP / float(FP + TN) * 100 356 | accuracy = ((TP + TN) / float(TP + TN + FP + FN)) * 100 357 | print("Adversarial FP:", FP, "- FN:", FN, "- TP:", TP, "- TN", TN) 358 | print("Adversarial Accuracy:", accuracy, "- FPR:", FPR, "- FNR:", FNR) 359 | print("Misclassification Rate:", FNR - FNR_original) 360 | print("Distortion:", averageChanges) 361 | print(changes_dict) 362 | 363 | '''adv_trained_model = tf.keras.models.load_model("Adam_adversarial_training_adv_100_0.7.h5") 364 | predictions = adv_trained_model.predict(val_data) 365 | confusion = confusion_matrix(val_labels, np.argmax(predictions, axis=1)) 366 | print(confusion) 367 | TP = confusion[1, 1] 368 | TN = confusion[0, 0] 369 | FP = confusion[0, 1] 370 | FN = confusion[1, 0] 371 | FNR = FN / float(FN + TP) * 100 372 | FPR = FP / float(FP + TN) * 100 373 | accuracy = ((TP + TN) / float(TP + TN + FP + FN)) * 100 374 | print("Adversarial FP:", FP, "- FN:", FN, "- TP:", TP, "- TN", TN) 375 | print("Adversarial Accuracy:", accuracy, "- FPR:", FPR, "- FNR:", FNR) 376 | print("Misclassification Rate:", 100 - accuracy) 377 | print("Distortion:", averageChanges) 378 | print(changes_dict)''' 379 | 380 | # evaluate adversarial examples produced by one model against other ML models 381 | #evaluate_other_models() 382 | 383 | 384 | 385 | -------------------------------------------------------------------------------- /feature_based_original_dataset/label_encoding.py: -------------------------------------------------------------------------------- 1 | """ 2 | This file maps features from an app to integer values, i.e. label encoding.
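Features whose type field is empty are skipped (the empty feature type was found via extract_feature_types.py).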
3 | We create a dictionary with all features present in the dataset as {feature:index}; 4 | for example, the feature android.hardware.touchscreen maps to 0. 5 | First we take every app from the feature_vectors directory and recreate every file 6 | of the dataset in another directory. Each new file contains the indexes in place of the features. 7 | For example, an app is represented as 6-0-2345-98776-1331110-1-45 8 | """ 9 | import os 10 | 11 | feature_vector = {} # dictionary with indexes mapped to features 12 | index = 0 # index value 13 | feature_vectors_dir = '../feature_vectors/' 14 | feature_indexes_dir = 'features_indexes/' 15 | if not os.path.exists(feature_indexes_dir): 16 | os.makedirs(feature_indexes_dir) 17 | 18 | not_assignable_feature_type = [''] # found from extract_feature_types.py 19 | 20 | print("Creating a dictionary that maps features to numeric values...") 21 | for filename in os.listdir(feature_vectors_dir): # read all apps 22 | with open(feature_vectors_dir + filename, "r") as file: # open an app 23 | for line in file: # read app line by line 24 | feature_type = line[:line.find('::')] # extract feature type 25 | feature = line.strip() # remove whitespace chars 26 | if feature_type not in not_assignable_feature_type: # check if feature type is '' 27 | # if a feature is not present in the feature vector, map feature to index and increment index 28 | if feature not in feature_vector: 29 | feature_vector[feature] = index 30 | index += 1 31 | 32 | print("Creating files with numeric values as features...") 33 | for filename in os.listdir(feature_vectors_dir): # recreate files with indexes 34 | with open(feature_vectors_dir + filename, "r") as file: # first open the original feature vectors 35 | f = open(feature_indexes_dir + filename, "a") # create a new file with the same SHA name in another dir 36 | for line in file: # read original feature vectors line by line 37 | feature_type = line[:line.find('::')] # extract feature type 38 | feature = line.strip() # remove whitespace chars 39 | if feature_type not in not_assignable_feature_type: # check if feature type is '' 40 | f.write(str(feature_vector[feature]) + '\n') # append the index of the feature to the new file 41 | f.close() 42 | print(str(feature_vector['feature::android.hardware.touchscreen'])) # 0 43 | print("Finished!") 44 | print("Total features in dataset: ", len(feature_vector)) # 545333 45 | 46 | ''' 47 | ffff64617c42e24fd1e572478279d547b834ef5e497f093ec59b3fb49ecec25f maps to 48 | 0-15597-15-16-17-15598-3297-18-15599-20-114-178-15600-21-458-23-87-10-25-15602-36-27-32 49 | -68-159-69-76-11-236-15603-216-782-43-3302-15604-66-2288-71-47-481598-79-415-289-13-15605 50 | -15606-74-28-15607-58-447-162-35-80-290-139-188-790-464-15608-419-15609-61-15610-293-2 51 | -37-38-39-78-40-84-26-3316-46-8-29-15611-44 52 | ''' 53 | -------------------------------------------------------------------------------- /feature_based_original_dataset/models.py: -------------------------------------------------------------------------------- 1 | from sklearn.naive_bayes import GaussianNB, MultinomialNB, ComplementNB, BernoulliNB 2 | from sklearn.tree import DecisionTreeClassifier 3 | from sklearn.ensemble import RandomForestClassifier 4 | from sklearn.neighbors import KNeighborsClassifier 5 | from sklearn.linear_model import LogisticRegression 6 | from sklearn import svm 7 | from sklearn.metrics import confusion_matrix 8 | from joblib import dump 9 | import timeit 10 | import numpy as np 11 | import os 12 | 13 | # init models 14 | GNB = GaussianNB() # 
Gaussian Naive Bayes 15 | MNB = MultinomialNB() # Multinomial Naive Bayes 16 | CNB = ComplementNB() # Complement Naive Bayes 17 | BNB = BernoulliNB() # Bernoulli Naive Bayes 18 | DT = DecisionTreeClassifier(criterion='gini', max_features=None, splitter='best') # Decision Tree 19 | RF = RandomForestClassifier(n_estimators=10, criterion='entropy', max_features='log2') # Random Forest 20 | KNN = KNeighborsClassifier(n_neighbors=3, weights='distance', metric='minkowski') # K Nearest Neighbors 21 | LR = LogisticRegression(solver='lbfgs', C=2.0, fit_intercept=True, max_iter=100) # Logistic Regression model 22 | SVM = svm.SVC(kernel='linear', C=0.5, gamma='scale', decision_function_shape='ovr') # Support Vector Machines 23 | # the parameters were found via the grid search procedure (models_grid_search.py). 24 | 25 | path = "models/" 26 | if not os.path.exists(path): 27 | os.mkdir(path) 28 | 29 | """ 30 | Each class defines a model for our classification task, containing four methods, namely 31 | train_, evaluate_, test_, get_average_metrics. 32 | In train_ methods we fit the models. 33 | :param features: train data 34 | :param labels: train labels 35 | :param save: save the model if True 36 | :return: 37 | In evaluate_ methods we get the accuracy on random test sets 38 | :param model: the classifier 39 | :param features: test data 40 | :param labels: test labels 41 | :return: 42 | Train and evaluate are used in train_random_subsampling.py and in train_models.py 43 | In test_ methods we evaluate our models 44 | :param test_features: validation data 45 | :param test_labels: validation labels 46 | :return: 47 | In get_average_metrics methods we get the average performance of each model, because we evaluate each model multiple times 48 | on unseen data. 49 | :param val_runs: times to evaluate a model 50 | Test and average metrics methods are used in evaluate_models.py 51 | """ 52 | 53 | 54 | class GaussianNaiveBayes: 55 | # metrics for the evaluation stage: FNR, FPR, accuracy 56 | average_FNR = 0 57 | average_FPR = 0 58 | average_accuracy = 0 59 | scores = [] # list of accuracy scores on the random testing sets 60 | 61 | @staticmethod 62 | def train_gaussian_naive_bayes_classifier(features, labels, save=False): 63 | print("\n\n--- Training", type(GNB).__name__, "---") 64 | start_time = timeit.default_timer() # timer 65 | model = GNB.fit(features, labels) # fit model on training set 66 | stop_time = timeit.default_timer() 67 | print(type(GNB).__name__, "training time: ", stop_time - start_time, "seconds\n\n") 68 | if save: # if defined, save the model 69 | print("Saving model...") 70 | filename = path + "model_" + type(GNB).__name__ + ".joblib" 71 | dump(model, filename) 72 | return model 73 | 74 | def evaluate_gaussian_naive_bayes_classifier(self, model, features, labels): 75 | print("\n\n--- Evaluating", type(GNB).__name__, "---") 76 | score = model.score(features, labels) # evaluate the model in the training stage 77 | print("Accuracy:", score * 100) 78 | self.scores.append(score) 79 | return self.scores 80 | 81 | def test_gaussian_naive_bayes_classifier(self, model, test_features, test_labels): 82 | print(type(GNB).__name__, "predicting...") 83 | start_time = timeit.default_timer() 84 | predicted = model.predict(test_features) # evaluate the model in the evaluation stage 85 | confusion = confusion_matrix(test_labels, predicted) # confusion matrix metrics 86 | print(confusion) 87 | TP = confusion[1, 1] 88 | TN = confusion[0, 0] 89 | FP = confusion[0, 1] 90 | FN = confusion[1, 0] 91 | FNR = FN / float(FN + TP) * 100 92 | FPR = FP / float(FP + TN) * 100 
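# FNR = FN / (FN + TP): fraction of malware misclassified as benign; FPR = FP / (FP + TN): fraction of benign apps flagged as malware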
93 | accuracy = (TP + TN) / float(TP + TN + FP + FN) * 100 94 | print("FP:", FP, "- FN:", FN, "- TP:", TP, "- TN", TN) 95 | print("Accuracy:", accuracy, "- FPR:", FPR, "- FNR:", FNR) 96 | 97 | stop_time = timeit.default_timer() 98 | print(type(GNB).__name__, "prediction time: ", stop_time - start_time, "seconds\n\n") 99 | self.average_FNR += FNR 100 | self.average_FPR += FPR 101 | self.average_accuracy += accuracy 102 | 103 | def get_average_metrics(self, val_runs): 104 | self.average_FNR = self.average_FNR/val_runs 105 | self.average_FPR = self.average_FPR/val_runs 106 | self.average_accuracy = self.average_accuracy/val_runs 107 | print("Average Accuracy:", self.average_accuracy, "- Average FPR:", self.average_FPR, 108 | "- Average FNR:", self.average_FNR) 109 | 110 | 111 | class MultinomialNaiveBayes: 112 | 113 | average_FNR = 0 114 | average_FPR = 0 115 | average_accuracy = 0 116 | scores = [] 117 | 118 | @staticmethod 119 | def train_multi_naive_bayes_classifier(features, labels, save=False): 120 | print("\n\n--- Training", type(MNB).__name__, "---") 121 | start_time = timeit.default_timer() 122 | model = MNB.fit(features, labels) 123 | stop_time = timeit.default_timer() 124 | print(type(MNB).__name__, "training time: ", stop_time - start_time, "seconds\n\n") 125 | if save: 126 | print("Saving model...") 127 | filename = path + "model_" + type(MNB).__name__ + ".joblib" 128 | dump(model, filename) 129 | return model 130 | 131 | def evaluate_multi_naive_bayes_classifier(self, model, features, labels): 132 | print("\n\n--- Evaluating", type(MNB).__name__, "---") 133 | score = model.score(features, labels) 134 | print("Accuracy:", score * 100) 135 | self.scores.append(score) 136 | return self.scores 137 | 138 | def test_multi_naive_bayes_classifier(self, model, test_features, test_labels): 139 | print(type(MNB).__name__, "predicting...") 140 | start_time = timeit.default_timer() 141 | predicted = model.predict(test_features) 142 | confusion = confusion_matrix(test_labels, predicted) 143 | print(confusion) 144 | TP = confusion[1, 1] 145 | TN = confusion[0, 0] 146 | FP = confusion[0, 1] 147 | FN = confusion[1, 0] 148 | FNR = FN / float(FN + TP) * 100 149 | FPR = FP / float(FP + TN) * 100 150 | accuracy = (TP + TN) / float(TP + TN + FP + FN) * 100 151 | print("FP:", FP, "- FN:", FN, "- TP:", TP, "- TN", TN) 152 | print("Accuracy:", accuracy, "- FPR:", FPR, "- FNR:", FNR) 153 | 154 | stop_time = timeit.default_timer() 155 | print(type(MNB).__name__, "prediction time: ", stop_time - start_time, "seconds\n\n") 156 | self.average_FNR += FNR 157 | self.average_FPR += FPR 158 | self.average_accuracy += accuracy 159 | 160 | def get_average_metrics(self, val_runs): 161 | self.average_FNR = self.average_FNR / val_runs 162 | self.average_FPR = self.average_FPR / val_runs 163 | self.average_accuracy = self.average_accuracy / val_runs 164 | print("Average Accuracy:", self.average_accuracy, "- Average FPR:", self.average_FPR, 165 | "- Average FNR:", self.average_FNR) 166 | 167 | @staticmethod 168 | def train_incremental(features, labels): 169 | print("\n\n--- Training", type(MNB).__name__, "---") 170 | start_time = timeit.default_timer() 171 | model = MNB.partial_fit(features, labels, classes=np.unique(labels)) 172 | stop_time = timeit.default_timer() 173 | print(type(MNB).__name__, "training time: ", stop_time - start_time, "seconds\n\n") 174 | return model 175 | 176 | 177 | class ComplementNaiveBayes: 178 | 179 | average_FNR = 0 180 | average_FPR = 0 181 | average_accuracy = 0 182 | scores = [] 183 | 
184 | @staticmethod 185 | def train_complement_naive_bayes_classifier(features, labels, save=False): 186 | print("\n\n--- Training", type(CNB).__name__, "---") 187 | start_time = timeit.default_timer() 188 | model = CNB.fit(features, labels) 189 | stop_time = timeit.default_timer() 190 | print(type(CNB).__name__, "training time: ", stop_time - start_time, "seconds\n\n") 191 | if save: 192 | print("Saving model...") 193 | filename = path + "model_" + type(CNB).__name__ + ".joblib" 194 | dump(model, filename) 195 | return model 196 | 197 | def evaluate_complement_naive_bayes_classifier(self, model, features, labels): 198 | print("\n\n--- Evaluating", type(CNB).__name__, "---") 199 | score = model.score(features, labels) 200 | print("Accuracy:", score * 100) 201 | self.scores.append(score) 202 | return self.scores 203 | 204 | def test_complement_naive_bayes_classifier(self, model, test_features, test_labels): 205 | print(type(CNB).__name__, "predicting...") 206 | start_time = timeit.default_timer() 207 | predicted = model.predict(test_features) 208 | confusion = confusion_matrix(test_labels, predicted) 209 | print(confusion) 210 | TP = confusion[1, 1] 211 | TN = confusion[0, 0] 212 | FP = confusion[0, 1] 213 | FN = confusion[1, 0] 214 | FNR = FN / float(FN + TP) * 100 215 | FPR = FP / float(FP + TN) * 100 216 | accuracy = (TP + TN) / float(TP + TN + FP + FN) * 100 217 | print("FP:", FP, "- FN:", FN, "- TP:", TP, "- TN", TN) 218 | print("Accuracy:", accuracy, "- FPR:", FPR, "- FNR:", FNR) 219 | 220 | stop_time = timeit.default_timer() 221 | print(type(CNB).__name__, "prediction time: ", stop_time - start_time, "seconds\n\n") 222 | self.average_FNR += FNR 223 | self.average_FPR += FPR 224 | self.average_accuracy += accuracy 225 | 226 | def get_average_metrics(self, val_runs): 227 | self.average_FNR = self.average_FNR / val_runs 228 | self.average_FPR = self.average_FPR / val_runs 229 | self.average_accuracy = self.average_accuracy / val_runs 230 | print("Average Accuracy:", self.average_accuracy, "- Average FPR:", self.average_FPR, 231 | "- Average FNR:", self.average_FNR) 232 | 233 | @staticmethod 234 | def train_incremental(features, labels): 235 | print("\n\n--- Training", type(CNB).__name__, "---") 236 | start_time = timeit.default_timer() 237 | model = CNB.partial_fit(features, labels, classes=np.unique(labels)) 238 | stop_time = timeit.default_timer() 239 | print(type(CNB).__name__, "training time: ", stop_time - start_time, "seconds\n\n") 240 | return model 241 | 242 | 243 | class BernoulliNaiveBayes: 244 | 245 | average_FNR = 0 246 | average_FPR = 0 247 | average_accuracy = 0 248 | scores = [] 249 | 250 | @staticmethod 251 | def train_bernoulli_naive_bayes_classifier(features, labels, save=False): 252 | print("\n\n--- Training", type(BNB).__name__, "---") 253 | start_time = timeit.default_timer() 254 | model = BNB.fit(features, labels) 255 | stop_time = timeit.default_timer() 256 | print(type(BNB).__name__, "training time: ", stop_time - start_time, "seconds\n\n") 257 | if save: 258 | print("Saving model...") 259 | filename = path + "model_" + type(BNB).__name__ + ".joblib" 260 | dump(model, filename) 261 | return model 262 | 263 | def evaluate_bernoulli_naive_bayes_classifier(self, model, features, labels): 264 | print("\n\n--- Evaluating", type(BNB).__name__, "---") 265 | score = model.score(features, labels) 266 | print("Accuracy:", score * 100) 267 | self.scores.append(score) 268 | return self.scores 269 | 270 | def test_bernoulli_naive_bayes_classifier(self, model, test_features, 
test_labels): 271 | print(type(BNB).__name__, "predicting...") 272 | start_time = timeit.default_timer() 273 | predicted = model.predict(test_features) 274 | confusion = confusion_matrix(test_labels, predicted) 275 | print(confusion) 276 | TP = confusion[1, 1] 277 | TN = confusion[0, 0] 278 | FP = confusion[0, 1] 279 | FN = confusion[1, 0] 280 | FNR = FN / float(FN + TP) * 100 281 | FPR = FP / float(FP + TN) * 100 282 | accuracy = (TP + TN) / float(TP + TN + FP + FN) * 100 283 | print("FP:", FP, "- FN:", FN, "- TP:", TP, "- TN", TN) 284 | print("Accuracy:", accuracy, "- FPR:", FPR, "- FNR:", FNR) 285 | 286 | stop_time = timeit.default_timer() 287 | print(type(BNB).__name__, "prediction time: ", stop_time - start_time, "seconds\n\n") 288 | self.average_FNR += FNR 289 | self.average_FPR += FPR 290 | self.average_accuracy += accuracy 291 | 292 | def get_average_metrics(self, val_runs): 293 | self.average_FNR = self.average_FNR / val_runs 294 | self.average_FPR = self.average_FPR / val_runs 295 | self.average_accuracy = self.average_accuracy / val_runs 296 | print("Average Accuracy:", self.average_accuracy, "- Average FPR:", self.average_FPR, 297 | "- Average FNR:", self.average_FNR) 298 | 299 | 300 | class DecisionTree: 301 | 302 | average_FNR = 0 303 | average_FPR = 0 304 | average_accuracy = 0 305 | scores = [] 306 | 307 | @staticmethod 308 | def train_decision_tree_classifier(features, labels, save=False): 309 | print("\n\n--- Training", type(DT).__name__, "---") 310 | start_time = timeit.default_timer() 311 | model = DT.fit(features, labels) 312 | stop_time = timeit.default_timer() 313 | print(type(DT).__name__, "training time: ", stop_time - start_time, "seconds\n\n") 314 | if save: 315 | print("Saving model...") 316 | filename = path + "model_" + type(DT).__name__ + ".joblib" 317 | dump(model, filename) 318 | return model 319 | 320 | def evaluate_decision_tree_classifier(self, model, features, labels): 321 | print("\n\n--- Evaluating", type(DT).__name__, "---") 322 | score = model.score(features, labels) 323 | print("Accuracy:", score * 100) 324 | self.scores.append(score) 325 | return self.scores 326 | 327 | def test_decision_tree_classifier(self, model, test_features, test_labels): 328 | print(type(DT).__name__, "predicting...") 329 | start_time = timeit.default_timer() 330 | predicted = model.predict(test_features) 331 | confusion = confusion_matrix(test_labels, predicted) 332 | print(confusion) 333 | TP = confusion[1, 1] 334 | TN = confusion[0, 0] 335 | FP = confusion[0, 1] 336 | FN = confusion[1, 0] 337 | FNR = FN / float(FN + TP) * 100 338 | FPR = FP / float(FP + TN) * 100 339 | accuracy = (TP + TN) / float(TP + TN + FP + FN) * 100 340 | print("FP:", FP, "- FN:", FN, "- TP:", TP, "- TN", TN) 341 | print("Accuracy:", accuracy, "- FPR:", FPR, "- FNR:", FNR) 342 | 343 | stop_time = timeit.default_timer() 344 | print(type(DT).__name__, "prediction time: ", stop_time - start_time, "seconds\n\n") 345 | self.average_FNR += FNR 346 | self.average_FPR += FPR 347 | self.average_accuracy += accuracy 348 | 349 | def get_average_metrics(self, val_runs): 350 | self.average_FNR = self.average_FNR / val_runs 351 | self.average_FPR = self.average_FPR / val_runs 352 | self.average_accuracy = self.average_accuracy / val_runs 353 | print("Average Accuracy:", self.average_accuracy, "- Average FPR:", self.average_FPR, 354 | "- Average FNR:", self.average_FNR) 355 | 356 | 357 | class RandomForest: 358 | 359 | average_FNR = 0 360 | average_FPR = 0 361 | average_accuracy = 0 362 | scores = [] 363 | 364 | 
@staticmethod 365 | def train_random_forest_classifier(features, labels, save=False): 366 | print("\n\n--- Training", type(RF).__name__, "---") 367 | start_time = timeit.default_timer() 368 | model = RF.fit(features, labels) 369 | stop_time = timeit.default_timer() 370 | print(type(RF).__name__, "training time: ", stop_time - start_time, "seconds\n\n") 371 | if save: 372 | print("Saving model...") 373 | filename = path + "model_" + type(RF).__name__ + ".joblib" 374 | dump(model, filename) 375 | return model 376 | 377 | def evaluate_random_forest_classifier(self, model, features, labels): 378 | print("\n\n--- Evaluating", type(RF).__name__, "---") 379 | score = model.score(features, labels) 380 | print("Accuracy:", score * 100) 381 | self.scores.append(score) 382 | return self.scores 383 | 384 | def test_random_forest_classifier(self, model, test_features, test_labels): 385 | print(type(RF).__name__, "predicting...") 386 | start_time = timeit.default_timer() 387 | predicted = model.predict(test_features) 388 | confusion = confusion_matrix(test_labels, predicted) 389 | print(confusion) 390 | TP = confusion[1, 1] 391 | TN = confusion[0, 0] 392 | FP = confusion[0, 1] 393 | FN = confusion[1, 0] 394 | FNR = FN / float(FN + TP) * 100 395 | FPR = FP / float(FP + TN) * 100 396 | accuracy = (TP + TN) / float(TP + TN + FP + FN) * 100 397 | print("FP:", FP, "- FN:", FN, "- TP:", TP, "- TN", TN) 398 | print("Accuracy:", accuracy, "- FPR:", FPR, "- FNR:", FNR) 399 | 400 | stop_time = timeit.default_timer() 401 | print(type(RF).__name__, "prediction time: ", stop_time - start_time, "seconds\n\n") 402 | self.average_FNR += FNR 403 | self.average_FPR += FPR 404 | self.average_accuracy += accuracy 405 | 406 | def get_average_metrics(self, val_runs): 407 | self.average_FNR = self.average_FNR / val_runs 408 | self.average_FPR = self.average_FPR / val_runs 409 | self.average_accuracy = self.average_accuracy / val_runs 410 | print("Average Accuracy:", self.average_accuracy, "- Average FPR:", self.average_FPR, 411 | "- Average FNR:", self.average_FNR) 412 | 413 | 414 | class KNearestNeighbors: 415 | 416 | average_FNR = 0 417 | average_FPR = 0 418 | average_accuracy = 0 419 | scores = [] 420 | 421 | @staticmethod 422 | def train_knn_classifier(features, labels, save=False): 423 | print("\n\n--- Training", type(KNN).__name__, "---") 424 | start_time = timeit.default_timer() 425 | model = KNN.fit(features, labels) 426 | stop_time = timeit.default_timer() 427 | print(type(KNN).__name__, "training time: ", stop_time - start_time, "seconds\n\n") 428 | if save: 429 | print("Saving model...") 430 | filename = path + "model_" + type(KNN).__name__ + ".joblib" 431 | dump(model, filename) 432 | return model 433 | 434 | def evaluate_knn_classifier(self, model, features, labels): 435 | print("\n\n--- Evaluating", type(KNN).__name__, "---") 436 | score = model.score(features, labels) 437 | print("Accuracy:", score * 100) 438 | self.scores.append(score) 439 | return self.scores 440 | 441 | def test_knn_classifier(self, model, test_features, test_labels): 442 | print(type(KNN).__name__, "predicting...") 443 | start_time = timeit.default_timer() 444 | predicted = model.predict(test_features) 445 | confusion = confusion_matrix(test_labels, predicted) 446 | print(confusion) 447 | TP = confusion[1, 1] 448 | TN = confusion[0, 0] 449 | FP = confusion[0, 1] 450 | FN = confusion[1, 0] 451 | FNR = FN / float(FN + TP) * 100 452 | FPR = FP / float(FP + TN) * 100 453 | accuracy = (TP + TN) / float(TP + TN + FP + FN) * 100 454 | print("FP:", 
FP, "- FN:", FN, "- TP:", TP, "- TN", TN) 455 | print("Accuracy:", accuracy, "- FPR:", FPR, "- FNR:", FNR) 456 | 457 | stop_time = timeit.default_timer() 458 | print(type(KNN).__name__, "prediction time: ", stop_time - start_time, "seconds\n\n") 459 | self.average_FNR += FNR 460 | self.average_FPR += FPR 461 | self.average_accuracy += accuracy 462 | 463 | def get_average_metrics(self, val_runs): 464 | self.average_FNR = self.average_FNR / val_runs 465 | self.average_FPR = self.average_FPR / val_runs 466 | self.average_accuracy = self.average_accuracy / val_runs 467 | print("Average Accuracy:", self.average_accuracy, "- Average FPR:", self.average_FPR, 468 | "- Average FNR:", self.average_FNR) 469 | 470 | 471 | class LogRegression: 472 | 473 | average_FNR = 0 474 | average_FPR = 0 475 | average_accuracy = 0 476 | scores = [] 477 | 478 | @staticmethod 479 | def train_logistic_regression_classifier(features, labels, save=False): 480 | print("\n\n--- Training", type(LR).__name__, "---") 481 | start_time = timeit.default_timer() 482 | model = LR.fit(features, labels) 483 | stop_time = timeit.default_timer() 484 | print(type(LR).__name__, "training time: ", stop_time - start_time, "seconds\n\n") 485 | if save: 486 | print("Saving model...") 487 | filename = path + "model_" + type(LR).__name__ + ".joblib" 488 | dump(model, filename) 489 | return model 490 | 491 | def evaluate_logistic_regression_classifier(self, model, features, labels): 492 | print("\n\n--- Evaluating", type(LR).__name__, "---") 493 | score = model.score(features, labels) 494 | print("Accuracy:", score * 100) 495 | self.scores.append(score) 496 | return self.scores 497 | 498 | def test_logistic_regression_classifier(self, model, test_features, test_labels): 499 | print(type(LR).__name__, "predicting...") 500 | start_time = timeit.default_timer() 501 | predicted = model.predict(test_features) 502 | confusion = confusion_matrix(test_labels, predicted) 503 | print(confusion) 504 | TP = confusion[1, 1] 505 | TN = confusion[0, 0] 506 | FP = confusion[0, 1] 507 | FN = confusion[1, 0] 508 | FNR = FN / float(FN + TP) * 100 509 | FPR = FP / float(FP + TN) * 100 510 | accuracy = (TP + TN) / float(TP + TN + FP + FN) * 100 511 | print("FP:", FP, "- FN:", FN, "- TP:", TP, "- TN", TN) 512 | print("Accuracy:", accuracy, "- FPR:", FPR, "- FNR:", FNR) 513 | 514 | stop_time = timeit.default_timer() 515 | print(type(LR).__name__, "prediction time: ", stop_time - start_time, "seconds\n\n") 516 | self.average_FNR += FNR 517 | self.average_FPR += FPR 518 | self.average_accuracy += accuracy 519 | 520 | def get_average_metrics(self, val_runs): 521 | self.average_FNR = self.average_FNR / val_runs 522 | self.average_FPR = self.average_FPR / val_runs 523 | self.average_accuracy = self.average_accuracy / val_runs 524 | print("Average Accuracy:", self.average_accuracy, "- Average FPR:", self.average_FPR, 525 | "- Average FNR:", self.average_FNR) 526 | 527 | 528 | class SupportVectorMachine: 529 | 530 | average_FNR = 0 531 | average_FPR = 0 532 | average_accuracy = 0 533 | scores = [] 534 | 535 | @staticmethod 536 | def train_svm_classifier(features, labels, save=False): 537 | print("\n\n--- Training", type(SVM).__name__, "---") 538 | start_time = timeit.default_timer() 539 | model = SVM.fit(features, labels) 540 | stop_time = timeit.default_timer() 541 | print(type(SVM).__name__, "training time: ", stop_time - start_time, "seconds\n\n") 542 | if save: 543 | print("Saving model...") 544 | filename = path + "model_" + type(SVM).__name__ + ".joblib" 545 | 
dump(model, filename) 546 | return model 547 | 548 | def evaluate_svm_classifier(self, model, features, labels): 549 | print("\n\n--- Evaluating", type(SVM).__name__, "---") 550 | score = model.score(features, labels) 551 | print("Accuracy:", score * 100) 552 | self.scores.append(score) 553 | return self.scores 554 | 555 | def test_svm_classifier(self, model, test_features, test_labels): 556 | print(type(SVM).__name__, "predicting...") 557 | start_time = timeit.default_timer() 558 | predicted = model.predict(test_features) 559 | confusion = confusion_matrix(test_labels, predicted) 560 | print(confusion) 561 | TP = confusion[1, 1] 562 | TN = confusion[0, 0] 563 | FP = confusion[0, 1] 564 | FN = confusion[1, 0] 565 | FNR = FN / float(FN + TP) * 100 566 | FPR = FP / float(FP + TN) * 100 567 | accuracy = (TP + TN) / float(TP + TN + FP + FN) * 100 568 | print("FP:", FP, "- FN:", FN, "- TP:", TP, "- TN", TN) 569 | print("Accuracy:", accuracy, "- FPR:", FPR, "- FNR:", FNR) 570 | 571 | stop_time = timeit.default_timer() 572 | print(type(SVM).__name__, "prediction time: ", stop_time - start_time, "seconds\n\n") 573 | self.average_FNR += FNR 574 | self.average_FPR += FPR 575 | self.average_accuracy += accuracy 576 | 577 | def get_average_metrics(self, val_runs): 578 | self.average_FNR = self.average_FNR / val_runs 579 | self.average_FPR = self.average_FPR / val_runs 580 | self.average_accuracy = self.average_accuracy / val_runs 581 | print("Average Accuracy:", self.average_accuracy, "- Average FPR:", self.average_FPR, 582 | "- Average FNR:", self.average_FNR) 583 | -------------------------------------------------------------------------------- /feature_based_original_dataset/models_grid_search.py: -------------------------------------------------------------------------------- 1 | """ 2 | This file performs grid search for 'classic' machine learning algorithms. 
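Each grid_* function below builds a param_grid dictionary for one classifier and evaluates it exhaustively with sklearn's GridSearchCV using 4-fold cross-validation, printing the best score and parameters.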
3 | """ 4 | import set_onehot_encoding as onehot 5 | import os 6 | from sklearn.linear_model import LogisticRegression 7 | from sklearn.ensemble import RandomForestClassifier 8 | from sklearn.neighbors import KNeighborsClassifier 9 | from sklearn.model_selection import GridSearchCV 10 | from sklearn import svm 11 | from sklearn.tree import DecisionTreeClassifier 12 | 13 | 14 | def grid_RF(): 15 | print("--- Random Forest ---") 16 | n_estimators = [10, 50, 100, 200] # number of trees 17 | criterion = ['gini', 'entropy'] # measurement for the quality of split 18 | max_features = ['sqrt', 'log2', None] # Number of features to consider at every split 19 | min_samples_split = [2, 5, 10] # Minimum number of samples required to split a node 20 | min_samples_leaf = [1, 2, 4] # Minimum number of samples required at each leaf node 21 | # Create the grid 22 | param_grid = dict(n_estimators=n_estimators, criterion=criterion, max_features=max_features, 23 | min_samples_split=min_samples_split, min_samples_leaf=min_samples_leaf) 24 | 25 | rf = RandomForestClassifier() # create the base model to tune 26 | # Use the grid search to search for best hyperparameters, using 3-fold cross validation 27 | rf_random = GridSearchCV(estimator=rf, param_grid=param_grid, cv=4, verbose=1, 28 | n_jobs=1) # Fit the grid search model 29 | grid_result = rf_random.fit(data, labels) 30 | print("Best: ", grid_result.best_score_, "using", grid_result.best_params_) # find the best hyperparameter 31 | 32 | 33 | def grid_KNN(): 34 | print("--- K Nearest Neighbors ---") 35 | n_neighbors = [3, 5, 10, 20, 50] # number of neighbors 36 | weights = ['uniform', 'distance'] # weight function to use in prediction 37 | metric = ['euclidean', 'manhattan', 'minkowski'] # distance metric to use 38 | 39 | param_grid = dict(n_neighbors=n_neighbors, weights=weights, metric=metric) 40 | 41 | knn = KNeighborsClassifier() 42 | 43 | knn_grid = GridSearchCV(estimator=knn, param_grid=param_grid, cv=4, n_jobs=-1) 44 | grid_result = knn_grid.fit(data, labels) 45 | print("Best: ", grid_result.best_score_, "using", grid_result.best_params_) 46 | 47 | 48 | def grid_LR(): 49 | print("--- Logistic Regression ---") 50 | C = [0.5, 1.0, 1.5, 2.0, 2.5] # regularization strength 51 | max_iter = [100, 110, 120, 130, 140] # maximum number of iterations 52 | fit_intercept = [True, False] # add a bias or not to the decision function 53 | 54 | param_grid = dict(max_iter=max_iter, C=C, fit_intercept=fit_intercept) 55 | 56 | lr = LogisticRegression(penalty="l2", solver="lbfgs") 57 | 58 | grid = GridSearchCV(estimator=lr, param_grid=param_grid, cv=4, n_jobs=-1, verbose=1) 59 | grid_result = grid.fit(data, labels) 60 | print("Best: ", grid_result.best_score_, "using", grid_result.best_params_) 61 | 62 | 63 | def grid_SVM(): 64 | print("--- Support Vector Machines ---") 65 | C = [0.25, 0.5, 1.0] # penalty parameter 66 | kernel = ['linear', 'rbf', 'poly'] # kernel type 67 | gamma = ['auto', 'scale'] # kernel coefficient 68 | decision_function_shape = ['ovo', 'ovr'] # one vs rest or one vs one 69 | 70 | param_grid = dict(C=C, kernel=kernel, gamma=gamma, decision_function_shape=decision_function_shape) 71 | 72 | SVM = svm.SVC() 73 | 74 | grid = GridSearchCV(estimator=SVM, param_grid=param_grid, cv=4, n_jobs=1, verbose=1) 75 | grid_result = grid.fit(data, labels) 76 | print("Best: ", grid_result.best_score_, "using", grid_result.best_params_) 77 | 78 | 79 | def grid_DT(): 80 | print("--- Decision Tree ---") 81 | criterion = ['gini', 'entropy'] # measurement for the quality of 
split 82 | splitter = ['best', 'random'] 83 | max_features = ['sqrt', 'log2', None] # Number of features to consider at every split 84 | min_samples_split = [2, 5, 10] # Minimum number of samples required to split a node 85 | min_samples_leaf = [1, 2, 4] # Minimum number of samples required at each leaf node 86 | 87 | param_grid = dict(criterion=criterion, splitter=splitter, max_features=max_features, 88 | min_samples_split=min_samples_split, min_samples_leaf=min_samples_leaf) 89 | 90 | DT = DecisionTreeClassifier() 91 | 92 | rf_random = GridSearchCV(estimator=DT, param_grid=param_grid, cv=4, n_jobs=1) 93 | grid_result = rf_random.fit(data, labels) 94 | print("Best: ", grid_result.best_score_, "using", grid_result.best_params_) 95 | 96 | 97 | if __name__ == "__main__": 98 | total_features = 545333 # total unique features 99 | set_size = 2000 # set size used to create the random training and testing sets 100 | malware_ratio = 0.3 # malware ratio in the set size 101 | 102 | onehot.create_list_of_apps() # function from set_onehot_encoding.py 103 | 104 | # check if a predefined training sample exists 105 | if os.path.isfile("training_set_2000.txt") is False: 106 | print("Creating data-labels...") 107 | print("Generating TRAINING set...") 108 | training_set = onehot.generate_set(set_size, malware_ratio) # generate random training set 109 | with open("training_set_2000.txt", "w") as file: 110 | for item in training_set: 111 | file.write(str(item) + "\n") 112 | 113 | training_set = [] # the list of training set 114 | 115 | with open("training_set_2000.txt", "r") as file: # read training set file and append applications to list 116 | for line in file: 117 | line = line.strip() # strip whitespace, including the trailing newline 118 | 119 | training_set.append(line) 120 | 121 | print("Generating TRAINING input...") 122 | data, labels = onehot.generate_input(training_set, total_features) # perform one-hot encoding 123 | print("Grid searching...") 124 | 125 | grid_RF() 126 | #grid_KNN() 127 | #grid_LR() 128 | #grid_SVM() 129 | #grid_DT() 130 | -------------------------------------------------------------------------------- /feature_based_original_dataset/neural_network.py: -------------------------------------------------------------------------------- 1 | from sklearn.metrics import confusion_matrix 2 | import timeit 3 | from keras import Sequential 4 | from keras.layers import Dense, Dropout 5 | import numpy as np 6 | import matplotlib.pyplot as plt 7 | from keras.callbacks import TensorBoard 8 | from keras.callbacks import EarlyStopping 9 | from keras.callbacks import ModelCheckpoint 10 | from keras.optimizers import Adam, SGD, RMSprop, Adagrad, Adadelta, Adamax, Nadam 11 | 12 | average_FNR = 0 13 | average_FPR = 0 14 | average_accuracy = 0 15 | 16 | 17 | def generate_neural_network(total_features, units, dropout, learn_rate, kernel, bias, activation_function): 18 | """ 19 | :param total_features: the total number of features (input_dim) used to train our network 20 | :param units: neurons in the hidden layers 21 | :param dropout: the dropout rate 22 | :param learn_rate: learning rate 23 | :param kernel: (kernel_initializer) weights initialization 24 | :param bias: (bias_initializer) bias initialization 25 | :param activation_function: activation function 26 | :return: 27 | """ 28 | model = Sequential() # neural net init 29 | """ 30 | add the input layer with 545333 input features, then 31 | hidden layers with the defined units, dropout rate, weight and bias initialization and 32 | relu activation, and a softmax output layer 33 | """ 
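# For example, with the defaults used in train_models.py (total_features=545333, units=[200, 200], dropout=0.2), this builds: Dense(200, relu) -> Dropout(0.2) -> Dense(200, relu) -> Dropout(0.2) -> Dense(2, softmax)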
34 | model.add(Dense(units=units[0], activation=activation_function, input_dim=total_features, kernel_initializer=kernel, 35 | bias_initializer=bias)) 36 | model.add(Dropout(dropout)) # add dropout rate 37 | 38 | for hidden_layer_units in units[1:]: # add hidden layers with the units defined in train_models.py 39 | model.add(Dense(units=hidden_layer_units, activation=activation_function, kernel_initializer=kernel, 40 | bias_initializer=bias)) 41 | model.add(Dropout(dropout)) 42 | 43 | model.add(Dense(2, activation="softmax")) # output layer, with softmax activation function and 2 neurons 44 | 45 | # loss: sparse categorical cross entropy, Optimizer: Adam 46 | model.compile(loss="sparse_categorical_crossentropy", 47 | optimizer=Adam(lr=learn_rate), 48 | metrics=["accuracy"]) 49 | 50 | """ 51 | information about the NN, such as the number of layers, the output shape, 52 | the number of weights in each layer and the total weights. 53 | """ 54 | #model.summary() 55 | 56 | # plot of the neural network graph 57 | #plot_model(model, to_file="figures/DNN_model_plot.png", show_shapes=True, show_layer_names=True) 58 | 59 | return model 60 | 61 | 62 | def train_neural_network(model, epochs, batch_size, features, labels, verbose=0, 63 | validation=False, val_data=None, val_labels=None, 64 | callbacks=False, plot_history=False, path="logs/fit/", model_name="DNN_200_200"): 65 | """ 66 | :param model: neural network model from generate_neural_network() 67 | :param epochs: number of epochs 68 | :param batch_size: batch size 69 | :param features: training data 70 | :param labels: training labels 71 | :param verbose: verbosity level 72 | :param validation: if True, evaluate on validation data during training 73 | :param val_data: validation data 74 | :param val_labels: validation labels 75 | :param callbacks: if True use the TensorBoard callback (plus EarlyStopping and ModelCheckpoint when validation=True) 76 | :param plot_history: if True plots accuracy and loss history per epoch 77 | :param path: directory for the TensorBoard logs 78 | :param model_name: prefix used for the log directory name 79 | :return: 80 | """ 81 | print("\n\n--- Training", type(model).__name__, "---") 82 | start_time = timeit.default_timer() 83 | 84 | # get the name of the optimizer in the defined model 85 | opt_config = model.optimizer.get_config() 86 | if 'name' not in opt_config.keys(): 87 | _name = str(model.optimizer.__class__).split('.')[-1].replace('\'', '').replace('>', '') 88 | opt_config.update({'name': _name}) 89 | 90 | if callbacks: 91 | # directory to save callbacks 92 | log_dir = path + model_name + opt_config['name'] 93 | # callbacks: TensorBoard, EarlyStopping, ModelCheckPoint 94 | # TensorBoard for storing visualizations of the neural net 95 | tensorboard_callback = TensorBoard(log_dir=log_dir, histogram_freq=1, write_graph=True, write_images=True) 96 | # EarlyStopping to monitor validation loss. If there is no improvement for 10 epochs, the training procedure stops 97 | early_stopping_callback = EarlyStopping(monitor='val_loss', mode='min', patience=10, verbose=verbose) 98 | # ModelCheckpoint to monitor validation accuracy. It stores the model with the highest validation accuracy 
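# Note: EarlyStopping and ModelCheckpoint monitor validation metrics, so they are only attached in the validation branch below; without validation data only the TensorBoard callback is used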
99 | model_checkpoint_callback = ModelCheckpoint('best_model_' + opt_config['name'] + '.h5', monitor='val_accuracy', mode='max', 100 | verbose=verbose, save_best_only=True) 101 | if not validation: 102 | # fit the model 103 | print("Note: Validation data is not included... Only the TensorBoard callback is used!") 104 | history = model.fit(features, labels, epochs=epochs, batch_size=batch_size, verbose=verbose, 105 | callbacks=[tensorboard_callback]) # train the neural network 106 | else: 107 | # fit the model 108 | history = model.fit(features, labels, epochs=epochs, batch_size=batch_size, verbose=verbose, 109 | validation_data=(val_data, val_labels), 110 | callbacks=[tensorboard_callback, early_stopping_callback, model_checkpoint_callback]) 111 | else: # train the model without the use of callbacks 112 | history = model.fit(features, labels, epochs=epochs, batch_size=batch_size, verbose=verbose) 113 | 114 | if plot_history: # plots the accuracy and loss per epoch 115 | if validation: 116 | # summarize history for training and validation accuracy 117 | plt.plot(history.history['accuracy']) 118 | plt.plot(history.history['val_accuracy']) 119 | plt.title('model accuracy') 120 | plt.ylabel('accuracy') 121 | plt.xlabel('epoch') 122 | plt.legend(['train', 'test'], loc='upper left') 123 | plt.show() 124 | # summarize history for training and validation loss 125 | plt.plot(history.history['loss']) 126 | plt.plot(history.history['val_loss']) 127 | plt.title('model loss') 128 | plt.ylabel('loss') 129 | plt.xlabel('epoch') 130 | plt.legend(['train', 'test'], loc='upper left') 131 | plt.show() 132 | else: 133 | # print(history.history.keys()) 134 | # summarize history for training accuracy 135 | plt.plot(history.history['accuracy']) 136 | plt.title('model accuracy') 137 | plt.ylabel('accuracy') 138 | plt.xlabel('epoch') 139 | plt.legend(['train'], loc='upper left') 140 | plt.show() 141 | # summarize history for training loss 142 | plt.plot(history.history['loss']) 143 | plt.title('model loss') 144 | plt.ylabel('loss') 145 | plt.xlabel('epoch') 146 | plt.legend(['train'], loc='upper left') 147 | plt.show() 148 | 149 | stop_time = timeit.default_timer() 150 | print(type(model).__name__, "training time: ", stop_time - start_time, "seconds\n\n") 151 | 152 | 153 | def evaluate_neural_network(model, features, labels): 154 | """ 155 | :param model: neural network model from generate_neural_network() 156 | :param features: test data 157 | :param labels: test labels 158 | :return: 159 | """ 160 | scores = model.evaluate(features, labels, verbose=0) 161 | print(model.metrics_names[1], "%.2f%%" % (scores[1] * 100)) 162 | return scores[1] * 100 163 | 164 | 165 | def test_neural_network(model, test_data, test_labels): 166 | """ 167 | :param model: neural network model from generate_neural_network() 168 | :param test_data: validation data 169 | :param test_labels: validation labels 170 | :return: 171 | """ 172 | global average_FNR, average_FPR, average_accuracy 173 | print(type(model).__name__, "predicting...") 174 | start_time = timeit.default_timer() 175 | predicted = model.predict(test_data) 176 | stop_time = timeit.default_timer() 177 | # print(predicted) 178 | # pick the class with the highest probability 179 | confusion = confusion_matrix(test_labels, np.argmax(predicted, axis=1)) # confusion matrix 180 | print(confusion) 181 | # confusion matrix metrics 182 | TP = confusion[1, 1] 183 | TN = confusion[0, 0] 184 | FP = confusion[0, 1] 185 | FN = confusion[1, 0] 186 | FNR = 
FN / float(FN + TP) * 100 187 | FPR = FP / float(FP + TN) * 100 188 | accuracy = (TP + TN) / float(TP + TN + FP + FN) * 100 189 | print("FP:", FP, "- FN:", FN, "- TP:", TP, "- TN", TN) 190 | print("Accuracy:", accuracy, "- FPR:", FPR, "- FNR:", FNR) 191 | print(type(model).__name__, "prediction time: ", stop_time - start_time, "seconds\n\n") 192 | average_FNR += FNR 193 | average_FPR += FPR 194 | average_accuracy += accuracy 195 | 196 | 197 | def get_average_metrics(val_runs): 198 | global average_FNR, average_FPR, average_accuracy 199 | average_FNR = average_FNR / val_runs 200 | average_FPR = average_FPR / val_runs 201 | average_accuracy = average_accuracy / val_runs 202 | print("Average Accuracy:", average_accuracy, "- Average FPR:", average_FPR, "- Average FNR:", average_FNR) 203 | -------------------------------------------------------------------------------- /feature_based_original_dataset/nn_grid_search.py: -------------------------------------------------------------------------------- 1 | import set_onehot_encoding as onehot 2 | import neural_network as NN 3 | from sklearn.model_selection import StratifiedKFold 4 | import numpy as np 5 | import os 6 | 7 | def tune_neural_network(): 8 | kfold = StratifiedKFold(n_splits=4, shuffle=True) # 4-fold cross-validation 9 | # neural net parameters 10 | units = [200, 200] # neurons in each hidden layer 11 | dropout = 0.2 # dropout rate 12 | epochs = 5 # epochs 13 | batch_size = 150 # batch size 14 | learn_rate = 0.001 15 | #momentum = 0.8 # to work with SGD 16 | kernel_initializer = 'normal' # weight init 17 | bias_initializer = 'normal' # bias init 18 | activation_function = 'relu' 19 | 20 | scores = [] 21 | 22 | for train, test in kfold.split(data, labels): # train on 3 folds, evaluate on 1 (4 runs in total) 23 | model = NN.generate_neural_network(total_features, units, dropout, learn_rate, kernel_initializer, 24 | bias_initializer, activation_function) 25 | 26 | NN.train_neural_network(model, epochs, batch_size, data[train], labels[train]) # train neural network 27 | 28 | score = NN.evaluate_neural_network(model, data[test], labels[test]) # evaluate neural net 29 | scores.append(score) 30 | 31 | print("Average accuracy: ", np.mean(scores), "Standard Deviation:", np.std(scores)) 32 | 33 | 34 | if __name__ == "__main__": 35 | total_features = 545333 # total unique features 36 | set_size = 2000 # set size used to create the random training set 37 | testing_set_size = 2000 # set size used to create the random test set 38 | malware_ratio = 0.3 # malware ratio in the set size 39 | 40 | onehot.create_list_of_apps() # function from set_onehot_encoding.py 41 | 42 | # check if predefined training and testing sets exist 43 | if os.path.isfile("training_set_2000.txt") is False and os.path.isfile("testing_set_2000.txt") is False: 44 | print("Creating data-labels...") 45 | print("Generating TRAINING set...") 46 | training_set = onehot.generate_set(set_size, malware_ratio) # generate random training set 47 | with open("training_set_2000.txt", "w") as file: 48 | for item in training_set: 49 | file.write(str(item) + "\n") 50 | 51 | print("Generating TESTING set...") 52 | testing_set = onehot.generate_set(testing_set_size, malware_ratio) # generate random testing set 53 | with open("testing_set_2000.txt", "w") as file: 54 | for item in testing_set: 55 | file.write(str(item) + "\n") 56 | 57 | training_set = [] # the list of training set 58 | testing_set = [] # the list of testing set 59 | 60 | with open("training_set_2000.txt", "r") as file: # read training set 
file and append applications to list 61 | for line in file: 62 | line = line.strip() # strip whitespace, including the trailing newline 63 | 64 | training_set.append(line) # add item to list 65 | with open("testing_set_2000.txt", "r") as file: # read testing set file and append applications to list 66 | for line in file: 67 | line = line.strip() 68 | 69 | testing_set.append(line) 70 | 71 | print("Generating TRAINING input...") 72 | data, labels = onehot.generate_input(training_set, total_features) # perform one-hot encoding 73 | print("Generating TESTING input...") 74 | test_data, test_labels = onehot.generate_input(testing_set, total_features) # perform one-hot encoding 75 | tune_neural_network() 76 | 77 | 78 | """ 79 | # use the code below for a full grid search if you have enough RAM: modify the tune_batch_epochs() method and comment out everything above 80 | import set_onehot_encoding as onehot 81 | from sklearn.model_selection import GridSearchCV 82 | from keras.wrappers.scikit_learn import KerasClassifier 83 | from keras.models import Sequential 84 | from keras.layers import Dense, Dropout 85 | import os 86 | total_features = 545333 # total unique features 87 | set_size = 2000 # set size used to create the random training and testing sets 88 | malware_ratio = 0.3 # malware ratio in the set size 89 | 90 | onehot.create_list_of_apps() # function from set_onehot_encoding.py 91 | 92 | if os.path.isfile("training_set.txt") is False and os.path.isfile("testing_set.txt") is False: 93 | print("Creating data-labels...") 94 | print("Generating TRAINING set...") 95 | training_set = onehot.generate_set(set_size, malware_ratio) # generate random training set 96 | with open("training_set.txt", "w") as file: 97 | for item in training_set: 98 | file.write(str(item) + "\n") 99 | 100 | print("Generating TESTING set...") 101 | testing_set = onehot.generate_set(set_size, malware_ratio) # generate random testing set 102 | with open("testing_set.txt", "w") as file: 103 | for item in testing_set: 104 | file.write(str(item) + "\n") 105 | 106 | training_set = [] 107 | 108 | with open("training_set.txt", "r") as file: 109 | for line in file: 110 | line = line.strip() 111 | 112 | training_set.append(line) 113 | 114 | print("Generating TRAINING input...") 115 | data, labels = onehot.generate_input(training_set, total_features) # perform one-hot encoding 116 | 117 | 118 | def create_model(): 119 | model = Sequential() 120 | 121 | model.add(Dense(units=200, activation="relu", input_dim=total_features)) 122 | model.add(Dropout(0.5)) # add dropout 123 | 124 | model.add(Dense(units=200, activation="relu")) 125 | model.add(Dropout(0.5)) 126 | 127 | model.add(Dense(2, activation="softmax")) # output layer, with softmax activation function and 2 neurons 128 | 129 | model.compile(loss="sparse_categorical_crossentropy", 130 | optimizer='adam', 131 | metrics=["accuracy"]) 132 | # loss: sparse categorical cross entropy, Adam optimizer 133 | model.summary() 134 | return model 135 | 136 | 137 | def tune_batch_epochs(): 138 | 139 | model = KerasClassifier(build_fn=create_model, verbose=1) 140 | 141 | epochs = [5, 10, 15, 20] 142 | batch_size = [50, 100, 128, 200] 143 | optimizer = ['SGD', 'RMSprop', 'Adagrad', 'Adadelta', 'Adam', 'Adamax', 'Nadam'] 144 | learn_rate = [0.001, 0.01, 0.1, 0.2, 0.3] 145 | # momentum = [0.0, 0.2, 0.4, 0.6, 0.8, 0.9] # to work with SGD 146 | kernel_initializer = ['uniform', 'lecun_uniform', 'normal', 'zero', 'glorot_normal', 'glorot_uniform', 'he_normal', 'he_uniform'] # weight init 147 | param_grid = 
dict(batch_size=batch_size, epochs=epochs, optimizer=optimizer, learn_rate=learn_rate, kernel_initializer=kernel_initializer) 148 | 149 | grid = GridSearchCV(estimator=model, param_grid=param_grid, n_jobs=1, cv=3) 150 | grid_result = grid.fit(data, labels) 151 | 152 | print("Best: ", grid_result.best_score_, "using", grid_result.best_params_) 153 | means = grid_result.cv_results_['mean_test_score'] 154 | stds = grid_result.cv_results_['std_test_score'] 155 | params = grid_result.cv_results_['params'] 156 | for mean, stdev, param in zip(means, stds, params): 157 | print(mean, stdev, "with", param) 158 | 159 | 160 | tune_batch_epochs() 161 | """ 162 | -------------------------------------------------------------------------------- /feature_based_original_dataset/set_onehot_encoding.py: -------------------------------------------------------------------------------- 1 | import csv 2 | from random import randint, shuffle 3 | import os 4 | import numpy as np 5 | 6 | csv_malware = "../sha256_family.csv" # csv file with malware apps 7 | feature_index_dir = 'features_indexes/' # directory with indexed features for all apps 8 | 9 | malware = [] 10 | benign = [] 11 | 12 | 13 | def create_list_of_apps(): 14 | print("Creating list of malicious apps...") 15 | with open(csv_malware, 'r') as file: # open malware csv file 16 | next(file) # skip the header line 17 | reader = csv.reader(file, delimiter=',') # read the csv malware families 18 | for row in reader: 19 | malware.append(row[0]) # append every row from the csv file into a list 20 | print("Malware apps found: ", len(malware)) # 5560 21 | print("Malware sample: ", malware[randint(0, len(malware) - 1)]) # print a random malware sample 22 | 23 | print("Creating list of benign apps...") 24 | for filename in os.listdir(feature_index_dir): # read all apps 25 | if filename not in malware: # if a SHA name is not in the malware list, append it to the benign list 26 | benign.append(filename) 27 | print("Benign apps found: ", len(benign)) # 123453 28 | print("Benign app sample: ", benign[randint(0, len(benign) - 1)], ) # print a random benign app 29 | 30 | print("Total apps (Benign & Malicious) found: ", len(malware) + len(benign)) # 129013 31 | 32 | 33 | malware_incremental_counter = 0 34 | benign_incremental_counter = 0 35 | 36 | 37 | def generate_set_incremental(set_size, malware_ratio): 38 | global malware_incremental_counter, benign_incremental_counter 39 | set = [] # list that will be filled with the app set 40 | 41 | print("Creating set with", set_size, "samples...") 42 | print("Malware ratio:", int(malware_ratio * 100), "%, totaling", int(set_size * malware_ratio), "apps in", set_size) 43 | print("Creating malware set...") 44 | 45 | while len(set) < (set_size * malware_ratio): 46 | app = malware[malware_incremental_counter] # take the next malware sample in order 47 | malware_incremental_counter += 1 48 | if malware_incremental_counter >= 5560: # 5560: total number of malware apps 49 | break 50 | if app not in set: 51 | set.append(app) # append malware to set list 52 | 53 | print("Total malware apps in set: ", len(set)) 54 | print("Malware sample in set: ", set[0]) 55 | 56 | print("Creating benign set...") 57 | 58 | while len(set) < set_size: 59 | app = benign[benign_incremental_counter] # take the next benign sample in order 60 | benign_incremental_counter += 1 61 | if benign_incremental_counter >= 123453: # 123453: total number of benign apps 62 | break 63 | if app not in set: 64 | set.append(app) # append benign to set list 65 | print(malware_incremental_counter) 66 | print("Total apps (malicious and benign) 
in set: ", len(set)) 67 | return set 68 | 69 | 70 | def generate_set(set_size, malware_ratio): 71 | set = [] # list that will fill with app set 72 | 73 | print("Creating set with", set_size, "samples...") 74 | print("Malware ratio:", int(malware_ratio * 100), "%, totaling", int(set_size * malware_ratio), "apps in", set_size) 75 | print("Creating malware set...") 76 | 77 | while len(set) < (set_size * malware_ratio): 78 | index = randint(0, len(malware) - 1) # choose a random index between (0,5559) 79 | app = malware[index] # locate malware based on random index in malware list 80 | if app not in set: 81 | set.append(app) # append malware to set list 82 | 83 | print("Total malware apps in set: ", len(set)) 84 | print("Malware sample in set: ", set[0]) 85 | 86 | print("Creating benign set...") 87 | while len(set) < set_size: 88 | index = randint(0, len(benign) - 1) # choose a random index between (0,129012) 89 | app = benign[index] # locate benign based on random index in benign list 90 | if app not in set: 91 | set.append(app) # append benign to set list 92 | 93 | print("Total apps (malicious and benign) in set: ", len(set)) 94 | return set 95 | 96 | 97 | def generate_input(set, total_features): 98 | print("performing one hot encoding...") 99 | # return a 2D array filled with zeros that will be used for the features of each app 100 | data = np.zeros((len(set), total_features), dtype=float) 101 | # return an array filled with zeros that will be used for the label of each app {0-benign 1-malicious} 102 | labels = np.zeros((len(set),), dtype=int) 103 | 104 | shuffle(set) # shuffle the set 105 | for id_app, app in enumerate(set): # iterate through set with a counter 106 | with open(feature_index_dir + app, 'r') as file: # open apps in set 107 | for index in file: # read line by line 108 | data[id_app][int(index)] = 1.0 # update corresponding element of the array with 1.0 109 | 110 | if app in malware: 111 | labels[id_app] = 1 # update corresponding label to 1 if it is malware 112 | else: 113 | labels[id_app] = 0 114 | 115 | #print(data) 116 | #print(labels) 117 | #print(data.shape) 118 | #print(labels.shape) 119 | return data, labels 120 | -------------------------------------------------------------------------------- /feature_based_original_dataset/train_models.py: -------------------------------------------------------------------------------- 1 | import set_onehot_encoding as onehot 2 | import models 3 | import neural_network as NN 4 | 5 | 6 | def create_sets(): 7 | training_set = [] # the list of training set 8 | testing_set = [] # the list of testing set 9 | 10 | with open("training_set_1500.txt", "r") as file: # read training set file and append applications to list 11 | for line in file: 12 | line.strip() # remove whitespace 13 | line = line[:-1] # remove \n 14 | training_set.append(line) # add item to list 15 | with open("testing_set_1500.txt", "r") as file: # read testing set file and append applications to list 16 | for line in file: 17 | line.strip() 18 | line = line[:-1] 19 | testing_set.append(line) 20 | print("Generating TRAINING input...") 21 | data, labels = onehot.generate_input(training_set, total_features) # perform one-hot encoding 22 | print("Generating TESTING input...") 23 | test_data, test_labels = onehot.generate_input(testing_set, total_features) # perform one-hot encoding 24 | return data, labels, test_data, test_labels 25 | 26 | 27 | def train_models(): 28 | data, labels, test_data, test_labels = create_sets() 29 | """ 30 | for classic machine learning model, e.g., 
Naive Bayes, Decision Tree, etc., we first fit the classifier and 31 | then evaluate it on the test set. The best hyperparameters found through the grid search procedure are defined 32 | in the models.py helper script. 33 | """ 34 | #model = GNB.train_gaussian_naive_bayes_classifier(data, labels, save=True) # train Naive Bayes 35 | #GNB.evaluate_gaussian_naive_bayes_classifier(model, test_data, test_labels) # test performance 36 | 37 | #model = MNB.train_multi_naive_bayes_classifier(data, labels, save=True) 38 | #MNB.evaluate_multi_naive_bayes_classifier(model, test_data, test_labels) 39 | 40 | #model = CNB.train_complement_naive_bayes_classifier(data, labels, save=True) 41 | #CNB.evaluate_complement_naive_bayes_classifier(model, test_data, test_labels) 42 | 43 | #model = BNB.train_bernoulli_naive_bayes_classifier(data, labels, save=True) 44 | #BNB.evaluate_bernoulli_naive_bayes_classifier(model, test_data, test_labels) 45 | 46 | #model = DT.train_decision_tree_classifier(data, labels, save=True) 47 | #DT.evaluate_decision_tree_classifier(model, test_data, test_labels) 48 | 49 | #model = RF.train_random_forest_classifier(data, labels, save=True) 50 | #RF.evaluate_random_forest_classifier(model, test_data, test_labels) 51 | 52 | #model = KNN.train_knn_classifier(data, labels, save=True) 53 | #KNN.evaluate_knn_classifier(model, test_data, test_labels) 54 | 55 | #model = LR.train_logistic_regression_classifier(data, labels, save=True) 56 | #LR.evaluate_logistic_regression_classifier(model, test_data, test_labels) 57 | 58 | #model = SVM.train_svm_classifier(data, labels, save=True) 59 | #SVM.evaluate_svm_classifier(model, test_data, test_labels) 60 | 61 | # init the neural net 62 | model = NN.generate_neural_network(total_features, units, dropout, learn_rate, kernel_initializer, 63 | bias_initializer, activation_function) 64 | """ 65 | train the neural network with the given model, epochs, batch size, and train data-labels. 66 | Specify the verbosity level, validation data, callbacks and plots (if needed). 67 | Default parameters: 68 | verbose=0, validation=False, val_data=None, val_labels=None, callbacks=False, plot_history=False 69 | example: 70 | NN.train_neural_network(model, epochs, batch_size, data, labels, verbose=0, 71 | validation=True, val_data=test_data, val_labels=test_labels, 72 | callbacks=True, plot_history=True) 73 | This is the main training stage, and thus we want to save the best models at the 'right time'. This is done by 74 | setting callbacks to True. Keras then monitors the validation loss for early stopping and saves the model with 75 | the highest validation accuracy.
76 | """ 77 | NN.train_neural_network(model, epochs, batch_size, data, labels, verbose=2, 78 | validation=True, val_data=test_data, val_labels=test_labels, 79 | callbacks=True) 80 | 81 | 82 | if __name__ == "__main__": 83 | total_features = 545333 # total unique features 84 | set_size = 1500 # set site that will be used to create random training set 85 | testing_set_size = 1500 # set site that will be used to create random test set 86 | malware_ratio = 0.3 # malware ratio in the set size 87 | 88 | print("Creating data-labels...") 89 | onehot.create_list_of_apps() # function from set_one_encoding.py 90 | 91 | # initialize sklearn models (classic machine learning) 92 | GNB = models.GaussianNaiveBayes() 93 | MNB = models.MultinomialNaiveBayes() 94 | CNB = models.ComplementNaiveBayes() 95 | BNB = models.BernoulliNaiveBayes() 96 | DT = models.DecisionTree() 97 | RF = models.RandomForest() 98 | KNN = models.KNearestNeighbors() 99 | LR = models.LogRegression() 100 | SVM = models.SupportVectorMachine() 101 | 102 | # neural net parameters 103 | units = [200, 200] 104 | dropout = 0.2 # dropout rate to avoid over fitting (Note that dropout alone is not efficient) 105 | epochs = 4 # set maximum epochs to 20. If callbacks are specified Keras will automatically stop the procedure 106 | batch_size = 150 # we found that the batch size of 150 fits better in our task 107 | learn_rate = 0.001 # specify the learning rate according to the optimizer used 108 | kernel_initializer = 'glorot_uniform' # weight initialization 109 | bias_initializer = 'zeros' # bias initialization 110 | activation_function = 'relu' # activation function 111 | 112 | train_models() 113 | -------------------------------------------------------------------------------- /feature_based_original_dataset/train_random_subsampling.py: -------------------------------------------------------------------------------- 1 | import set_onehot_encoding as onehot 2 | import models 3 | import neural_network as NN 4 | import numpy as np 5 | 6 | 7 | def create_random_sets(): 8 | print("Generating TRAINING set...") 9 | training_set = onehot.generate_set(set_size, malware_ratio) # generate random training set 10 | print("Generating TRAINING input...") 11 | data, labels = onehot.generate_input(training_set, total_features) # perform one-hot encoding 12 | print("Generating TESTING set...") 13 | testing_set = onehot.generate_set(testing_set_size, malware_ratio) # generate random testing set 14 | print("Generating TESTING input...") 15 | test_data, test_labels = onehot.generate_input(testing_set, total_features) # perform one-hot encoding 16 | return data, labels, test_data, test_labels # return train data - labels and test data - labels 17 | 18 | 19 | def random_sub_sampling(runs): 20 | score_gnb = [] # score for gaussian naive bayes 21 | score_mnb = [] # scores for multinomial naive bayes 22 | score_cnb = [] # scores for complement naive bayes 23 | score_bnb = [] # scores for bernoulli naive bayes 24 | score_dt = [] # scores for decision trees 25 | score_rf = [] # scores for random forest 26 | score_knn = [] # scores for k nearest neighbors 27 | score_lr = [] # scores for logistic regression 28 | score_svm = [] # scores for support vector machines 29 | score_nn = [] # scores for neural network 30 | 31 | for i in range(runs): 32 | 33 | data, labels, test_data, test_labels = create_random_sets() # choose random training and testing sets 34 | 35 | #model = GNB.train_gaussian_naive_bayes_classifier(data, labels) # train Gaussian Naive Bayes 36 | #score_gnb = 
GNB.evaluate_gaussian_naive_bayes_classifier(model, test_data, test_labels)) # evaluate performance 37 | 38 | #model = MNB.train_multi_naive_bayes_classifier(data, labels) # train Multinomial Naive Bayes 39 | #score_mnb.append(MNB.evaluate_multi_naive_bayes_classifier(model, test_data, test_labels)) 40 | 41 | #model = CNB.train_complement_naive_bayes_classifier(data, labels) # train Complement Naive Bayes 42 | #score_cnb.append(CNB.evaluate_complement_naive_bayes_classifier(model, test_data, test_labels)) 43 | 44 | #model = BNB.train_bernoulli_naive_bayes_classifier(data, labels) # train Bernoulli Naive Bayes 45 | #score_bnb.append(BNB.evaluate_bernoulli_naive_bayes_classifier(model, test_data, test_labels)) 46 | 47 | #model = DT.train_decision_tree_classifier(data, labels) # train Decision Tree Classifier 48 | #score_dt.append(DT.evaluate_decision_tree_classifier(model, test_data, test_labels)) 49 | 50 | #model = RF.train_random_forest_classifier(data, labels) # train Random Forest 51 | #score_rf.append(RF.evaluate_random_forest_classifier(model, test_data, test_labels)) 52 | 53 | #model = KNN.train_knn_classifier(data, labels) # train k-Nearest Neighbors Classifier 54 | #score_knn.append(KNN.evaluate_knn_classifier(model, test_data, test_labels)) 55 | 56 | #model = LR.train_logistic_regression_classifier(data, labels) # train Logistic Regression 57 | #score_lr.append(LR.evaluate_logistic_regression_classifier(model, test_data, test_labels)) 58 | 59 | #model = SVM.train_svm_classifier(data, labels) # train Support Vector Machines 60 | #score_svm.append(SVM.evaluate_svm_classifier(model, test_data, test_labels)) 61 | 62 | # init neural net 63 | model = NN.generate_neural_network(total_features, units, dropout, learn_rate, kernel_initializer, 64 | bias_initializer, activation_function) 65 | """ 66 | this is not the actual training procedure, so we don't want to save the models. To save models and use 67 | the early-stopping technique, refer to train_models.py. 68 | The goal of this operation is only to determine how the models behave on random training and random 69 | testing sets! 70 | So, we only train and evaluate the models.
71 | """ 72 | NN.train_neural_network(model, epochs, batch_size, data, labels, verbose=2) 73 | score = NN.evaluate_neural_network(model, test_data, test_labels) 74 | score_nn.append(score) 75 | 76 | # get average accuracy and standard deviation for each model for each model 77 | #print("NB Average accuracy: ", np.mean(score_gnb), "Standard Deviation:", np.std(score_gnb)) 78 | #print("MNB Average accuracy: ", np.mean(score_mnb), "Standard Deviation:", np.std(score_mnb)) 79 | #print("CNB Average accuracy: ", np.mean(score_cnb), "Standard Deviation:", np.std(score_cnb)) 80 | #print("BNB Average accuracy: ", np.mean(score_bnb), "Standard Deviation:", np.std(score_bnb)) 81 | #print("DT Average accuracy: ", np.mean(score_dt), "Standard Deviation:", np.std(score_dt)) 82 | #print("RF Average accuracy: ", np.mean(score_rf), "Standard Deviation:", np.std(score_rf)) 83 | #print("kNN Average accuracy: ", np.mean(score_knn), "Standard Deviation:", np.std(score_knn)) 84 | #print("LR Average accuracy: ", np.mean(score_lr), "Standard Deviation:", np.std(score_lr)) 85 | #print("SVM Average accuracy: ", np.mean(score_svm), "Standard Deviation:", np.std(score_svm)) 86 | print("NN Average accuracy: ", np.mean(score_nn), "Standard Deviation:", np.std(score_nn)) 87 | 88 | 89 | if __name__ == "__main__": 90 | total_features = 545333 # total unique features 91 | set_size = 1500 # set site that will be used to create random training set 92 | testing_set_size = 1500 # set site that will be used to create random test set 93 | malware_ratio = 0.3 # malware ratio in the set size 94 | 95 | print("Creating data-labels...") 96 | onehot.create_list_of_apps() # function from set_one_encoding.py 97 | 98 | # initialize sklearn models 99 | GNB = models.GaussianNaiveBayes() 100 | MNB = models.MultinomialNaiveBayes() 101 | CNB = models.ComplementNaiveBayes() 102 | BNB = models.BernoulliNaiveBayes() 103 | DT = models.DecisionTree() 104 | RF = models.RandomForest() 105 | KNN = models.KNearestNeighbors() 106 | LR = models.LogRegression() 107 | SVM = models.SupportVectorMachine() 108 | 109 | val_runs = 8 # number of times to train and test a model 110 | 111 | # neural net parameters 112 | units = [200, 200] # number of neurons in each layer (2 hidden layers) 113 | dropout = 0.001 # dropout rate 114 | epochs = 4 # epochs per iteration 115 | batch_size = 150 # batch size 116 | learn_rate = 0.001 # learning rate of the specified optimizer 117 | kernel_initializer = 'glorot_uniform' # weight initialization 118 | bias_initializer = 'zeros' # bias initialization 119 | activation_function = 'relu' # activation function in hidden layers (We use Softmax in the output layer) 120 | 121 | random_sub_sampling(val_runs) -------------------------------------------------------------------------------- /feature_based_reduced_dataset/README.md: -------------------------------------------------------------------------------- 1 | The process is similar to the feature_based_original_dataset. In the reduced feature space we introduce an adversarial sample detector. 2 | 3 | ### 1) Feature Reduction 4 | 5 | The huge variety of features in the dataset leads to a high dimensionality and as a result the data become sparse. The sparsity leads to vast computation cost making the whole dataset unavailable for models to process it in a single run. Moreover, this means that in the dataset present features which have a minimal importance for the final decision of a classifier. 
In the literature there exist algorithmic approaches to feature elimination and dimensionality reduction, but we stick with a manual/"regular" reduction due to the high computation cost. The largest feature class consists of URLs, and an adversary can change a network address without much effort. With the elimination of this class, the features decrease to 234,845, less than half of the original feature space. Furthermore, each Android application has activities, which are in essence its user interface. The user interface does not matter much for the classification task, as the names can be random and easily changed. Activities are contained in the Components class, with a total of 185,729 features; removing them reduces the feature space to 49,116. Moreover, some features are found to a large extent in both benign and malware applications, and as such they are not important for classification. These are only 6. Consequently, the feature space is reduced to 49,110 from the original 545,333 features. 6 | 7 | ``` 8 | python3 eliminate_low_high_support_features.py 9 | ``` 10 | ``` 11 | python3 eliminate_features.py 12 | ``` 13 | 14 | As for the applications themselves, we observed that there exist duplicates, 37,077 in particular. The duplicates we are referring to are applications that have exactly the same features. This does not mean that the applications are identical, but that they have similar functionality. Specifically for malicious applications, duplication may mean that these applications are variants of the original malware. By removing duplicate applications, the number of applications can be significantly reduced, and thus the algorithms can fit more applications into one run. Finally, we were able to reduce the dataset from 129,013 applications (5,560 malicious) to 91,936 applications (2,591 malicious), and from 545,333 unique features to 3,880. To find and remove the duplicate applications, we adjusted [this](http://www.davespace.co.uk/python/remove-duplicates.html) python script; a sketch of the idea is shown below. 15 |
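The adapted script is linked above rather than included in this folder. As a minimal sketch of the idea (the `feature_vectors/` path below is an assumption, not a path used by the scripts in this repository): two applications count as duplicates when their feature sets are identical, so we hash each app's sorted feature list and keep one representative per hash.

```
import os
import hashlib

feature_vectors_dir = 'feature_vectors/'  # assumed location of the per-app feature files

seen = {}        # maps feature-set digest -> first app seen with that feature set
duplicates = []  # apps whose feature set was already seen before

for filename in os.listdir(feature_vectors_dir):
    with open(feature_vectors_dir + filename, 'r') as file:
        # sort so that the order of features inside the file does not affect the digest
        features = sorted(line.strip() for line in file if line.strip())
    digest = hashlib.sha256('\n'.join(features).encode()).hexdigest()
    if digest in seen:
        duplicates.append(filename)  # same feature set as a previously seen app
    else:
        seen[digest] = filename

print("Unique apps:", len(seen), "- duplicates:", len(duplicates))
```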
16 | ### 2) Training, Evaluating, Crafting adversarial examples 17 | 18 | Follow the README file in the feature_based_original_dataset folder. 19 | 20 | ### 3) Defense by [detecting adversarial examples](https://arxiv.org/pdf/1702.06280.pdf) 21 | 22 | Another defensive approach, similar to adversarial training, is to detect adversarial examples before samples are fed into the main classification model. Our approach produces an external classifier that is able to classify samples as legitimate or adversarial. 23 | 24 | Steps for our model (a code sketch of steps 1-3 follows the evaluation paragraphs below): 25 | 1) Train a classifier F on the original data D={X,0}, labeling the whole training set as 0. 26 | 2) Craft adversarial examples A for F using the JSMA crafting method. 27 | 3) Train a new model F' on the augmented dataset X⋃A, labeling each adversarial example as 1. 28 | 4) Before a new sample is fed to the classifier F, it passes through the detector for classification. If the sample is recognized as adversarial, the process stops. 29 | 30 | We evaluate the classification accuracy of the newly generated classifier in the reduced feature space as an attempt to defend against the adversarial examples that deceive the model trained with the Adam optimizer. The original performance of the learned model on the reduced feature space is 98.42% (1.58% FNR). With the JSMA variant the model is completely destroyed, as the attack makes it unable to recognize any malicious application, increasing the FNR to 100%. Similar to adversarial training, there is no specific methodology to follow for mixing adversarial examples with legitimate samples at a particular ratio. 31 | 32 | ``` 33 | python3 detector.py 34 | ``` 35 | 36 | We begin by training a model only on the malware space, without the presence of any benign applications. Note that we use every malware sample both to craft adversarial examples and to train the adversarial detector. We expect the detector to be highly efficient at distinguishing adversarial examples from legitimate samples, but without the ability to accurately classify samples when benign and malware samples are mixed. Indeed, our detector reaches a training accuracy of 99.21% at epoch 15. This means that it may be able to classify adversarial samples with high probability. However, the detector is only trained on the malware space. To get a good estimate, we mix benign and malicious applications, craft adversarial examples for the original model, and evaluate the detector. As expected, the classification performance is not as high, achieving 83.7% with 16.3% FPR without the presence of an adversary. When adversarial examples are crafted, the accuracy decreases slightly to 83.5%, with 23.14% FPR and only 1% FNR. This means that only a few adversarial examples bypassed the detector. 37 | 38 | The FPR in our first evaluation of the detector trained only with malicious applications is quite high. Therefore, we hope that mixing benign with malicious applications, while crafting adversarial examples only for malware, will increase the overall accuracy. We draw a sample totaling 2000 applications, 600 of them malicious. This means that the detector is trained on 2000 legitimate samples and 600 adversarial ones. We re-implement the ModelCheckpoint callback to store only the best model in terms of accuracy. The model is stored at epoch 24, where the training accuracy is 98.58%. Next, we craft adversarial examples for the original model in a testing set totaling 8500 applications, where almost every malicious application is present (2,550 in the testing set), and we evaluate the performance of the trained detector. Surprisingly, the performance is extremely high both with and without the presence of adversarial examples. On legitimate applications the model achieves 99.72% accuracy (with 0.28% FPR), and in the presence of an adversary it achieves 98.1% accuracy with 5.46% FNR and 0.39% FPR. 39 | 40 | Training on a sample of the dataset results in extremely high performance: only 5% of the adversarial examples can bypass our security mechanism. However, this means that those 5% of adversarial malware will be mistakenly identified as benign by the main detector. We also evaluate whether training on a larger sample, with almost every malicious application present, can create a more efficient model. This can be described as an incremental procedure for the detector. As mentioned, incremental learning may give a false sense of performance, since the malicious applications in the wild are by no means covered. We train the detector with a set of 8500 applications, 2550 of them malicious. The detector achieves 99.27% accuracy in the training stage (with 0.45% FPR). In the testing stage it achieves an extremely high performance of 99.98% (only 2 legitimate applications are recognized as adversarial), and in the presence of adversarial examples it achieves 99.54% accuracy with only 1.49% FNR. Therefore, the efficiency of the detector is dramatically improved. The higher the performance of the adversarial detector, the better for our main model, as the detector can eliminate adversarial examples.
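As a rough illustration of steps 1-3 above, the sketch below wires together pieces that already exist in this folder (`set_onehot_encoding.py`, `neural_network.py`, `jsma.py`). It is only a sketch under assumptions: `detector.py` is the authoritative implementation, `best_model_Adam.h5` is the checkpoint name used by the training scripts here, and the set size mirrors the 2000-app experiment above.

```
import numpy as np
import tensorflow as tf
import set_onehot_encoding as onehot
import neural_network as NN
import jsma

total_features = 3880
onehot.create_list_of_apps()
jsma.changes_dict = {}  # craft_adversarial_samples records perturbed feature indexes here

# step 1: a mixed sample in which every app is labeled 0 ("legitimate") for the detector
train_set = onehot.generate_set(2000, 0.3)
data, orig_labels = onehot.generate_input(train_set, total_features)
det_labels = np.zeros(len(data), dtype=int)

# step 2: craft adversarial examples against the main classifier F
F = tf.keras.models.load_model('best_model_Adam.h5')
adv_rows, adv_labels = [], []
for i in range(len(data)):
    if orig_labels[i] == 1:  # craft only for malicious samples, as in jsma.py
        try:
            adv_x, _ = jsma.craft_adversarial_samples(data[i:i + 1].copy(), 0, F, 1)
            adv_rows.append(adv_x[0])
            adv_labels.append(1)  # adversarial examples are labeled 1
        except ValueError:
            pass  # crafting failed: no feature could be added for this sample

# step 3: train the detector F' on the augmented dataset X U A
aug_data = np.vstack([data, np.array(adv_rows)])
aug_labels = np.concatenate([det_labels, np.array(adv_labels)])
detector = NN.generate_neural_network(total_features, [200, 200], 0.2, 0.001,
                                      'glorot_uniform', 'zeros', 'relu')
NN.train_neural_network(detector, 20, 150, aug_data, aug_labels, verbose=2)
detector.save('external_detector.h5')  # the name loaded (commented out) in jsma.py
```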
41 | 42 | We showed that training a second classifier to distinguish original samples from adversarial ones can be used as a defensive mechanism. Only the applications classified as legitimate are passed to the main detector for classification. However, since an intelligent attacker can deceive the main classifier, it will also be easy to deceive the adversarial detector. If the adversary is aware of the external classifier, its goal is to produce adversarial examples that are classified as legitimate. The procedure is similar to crafting adversarial examples for the main model: the goal is to take an adversarial sample that is correctly classified as adversarial and have it classified as legitimate. As such, we craft adversarial examples for the adversarial detector on a test set of 2000 applications. Its original performance is 100% without the presence of an adversary and 99.5% (1.67% FNR) on adversarial examples produced for the main classifier. As expected, when crafting adversarial examples for the detector itself, the classifier is almost completely fooled. Its accuracy drops to 70.2%, with 99.33% FNR and an average of 9 perturbations in the feature space. 43 | -------------------------------------------------------------------------------- /feature_based_reduced_dataset/count_feature_variance.py: -------------------------------------------------------------------------------- 1 | import os 2 | import csv 3 | import pandas as pd 4 | feature_vector = {} # dictionary with indexes mapped to features 5 | index = 0 # index value 6 | 7 | feature_vectors_dir = '../new_feature_vectors/' 8 | new_feature_vectors_dir = 'more_new_feature_vectors/' 9 | csv_file = "../feature_variances.csv" 10 | 11 | if not os.path.exists(new_feature_vectors_dir): 12 | os.makedirs(new_feature_vectors_dir) 13 | 14 | 15 | def export_to_csv(): 16 | not_assignable_feature_type = [''] # found from extract_feature_types.py 17 | features_variance = {} 18 | 19 | for filename in os.listdir(feature_vectors_dir): # read all apps 20 | with open(feature_vectors_dir + filename, "r") as file: # open an app 21 | for line in file: # read the app line by line 22 | 23 | line = line.strip() # remove whitespace chars 24 | if line not in not_assignable_feature_type: # skip the empty '' feature type 25 | if line not in features_variance: 26 | features_variance[line] = 1 27 | else: 28 | features_variance[line] += 1 # count in how many apps each feature occurs 29 | 30 | print(len(features_variance)) 31 | 32 | with open(csv_file, 'w', newline="") as csvfile: 33 | writer = csv.writer(csvfile) 34 | writer.writerow(["Feature", "Variance"]) 35 | for key, value in features_variance.items(): 36 | writer.writerow([key, value]) 37 | 38 | def export_eliminated_csv(): 39 | df = pd.read_csv(csv_file) 40 | #print(df.head(3)) 41 | #print(df.sort_values('Variance')) 42 | keep = df[df.Variance >= 5] # keep the features that occur in at least 5 apps 43 | print(len(keep)) 44 | keep.to_csv("eliminated_variance.csv", index=None, header=True) # the features that survive the elimination 45 | 46 | 47 | 48 | if __name__ == "__main__": 49 | #export_to_csv() 50 | export_eliminated_csv() 51 | 52 | 53 | 54 | 55 | 56 | 57 | 58 | 59 | 60 | 61 | -------------------------------------------------------------------------------- /feature_based_reduced_dataset/eliminate_features.py: -------------------------------------------------------------------------------- 1 | import os 2 | import pandas as pd 3 | feature_vector = {} # dictionary with indexes mapped to features 4 | index = 0 #
index value 5 | feature_vectors_dir = '../eliminated_apps_feature_vectors/' 6 | new_feature_vectors_dir = 'new_feature_vectors/' 7 | if not os.path.exists(new_feature_vectors_dir): 8 | os.makedirs(new_feature_vectors_dir) 9 | 10 | print("Eliminating features with low variance...") 11 | 12 | df = pd.read_csv('eliminated_variance.csv') 13 | column = df[df.columns[0]] 14 | features = set(column.tolist()) # use a set for fast membership tests 15 | print(features) 16 | 17 | 18 | for filename in os.listdir(feature_vectors_dir): # read all apps 19 | with open(feature_vectors_dir + filename, "r") as file, \ 20 | open(new_feature_vectors_dir + filename, "a") as f: # create a new file with the same SHA name in another dir 21 | for line in file: # read the app line by line 22 | feature = line.strip() 23 | if feature in features: # keep only the features that survived the variance elimination 24 | f.write(feature + "\n") 25 | print("finished!") 26 | -------------------------------------------------------------------------------- /feature_based_reduced_dataset/eliminate_low_high_support_features.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | feature_vector = {} # dictionary with indexes mapped to features 4 | index = 0 # index value 5 | feature_vectors_dir = 'eliminated_apps_feature_vectors/' 6 | new_feature_vectors_dir = 'new_feature_vectors/' 7 | if not os.path.exists(new_feature_vectors_dir): 8 | os.makedirs(new_feature_vectors_dir) 9 | 10 | not_assignable_feature_type = [''] # found from extract_feature_types.py 11 | 12 | print("Eliminating urls & activities...") 13 | for filename in os.listdir(feature_vectors_dir): # read all apps 14 | with open(feature_vectors_dir + filename, "r") as file, \ 15 | open(new_feature_vectors_dir + filename, "a") as f: # create a new file with the same SHA name in another dir 16 | for line in file: # read the app line by line 17 | feature_type = line[:line.find('::')] # extract the feature type 18 | feature = line.strip() # remove whitespace chars 19 | if feature_type not in not_assignable_feature_type: # skip the empty '' feature type 20 | if feature_type != "url" and feature_type != "activity" and \ 21 | feature != "feature::android.hardware.touchscreen" and \ 22 | feature != "intent::android.intent.action.MAIN" and \ 23 | feature != "intent::android.intent.category.LAUNCHER" and \ 24 | feature != "call::getSystemService" and \ 25 | feature != "real_permission::android.permission.INTERNET" and \ 26 | feature != "permission::android.permission.INTERNET": 27 | 28 | f.write(feature + "\n") 29 | 30 | print("finished!") 31 | -------------------------------------------------------------------------------- /feature_based_reduced_dataset/jsma.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | import numpy as np 3 | from tensorflow import keras 4 | from sklearn.metrics import confusion_matrix 5 | import set_onehot_encoding as onehot 6 | import os 7 | import joblib 8 | import models 9 | 10 | 11 | def create_set(): 12 | if not os.path.isfile("training_set_8500.txt"): 13 | set_size = 8500 14 | malware_ratio = 0.3 15 | print("Creating data-labels...") 16 | print("Generating TESTING set...") 17 | testing_set = onehot.generate_set(set_size, malware_ratio) # generate a random testing set 18 | with open("training_set_8500.txt", "w") as file: 19 | for item in testing_set: 20 | file.write(str(item) + "\n") 21 | testing_set = [] # the list for the testing set 22 | with open("training_set_8500.txt", "r") as file: # read the saved set file and append applications to the
list 23 | for line in file: 24 | line = line[:-1] # remove the trailing \n 25 | line = line.strip() # remove any remaining whitespace 26 | testing_set.append(line) 27 | print("Generating TESTING input...") 28 | test_data, test_labels = onehot.generate_input(testing_set, total_features) # perform one-hot encoding 29 | return test_data, test_labels 30 | 31 | 32 | """ 33 | functions to compute the Jacobian with numpy. 34 | https://medium.com/unit8-machine-learning-publication/computing-the-jacobian-matrix-of-a-neural-network-in-python-4f162e5db180 35 | First we specify the forward and backward passes of each layer to implement backpropagation manually. 36 | """ 37 | 38 | 39 | def affine_forward(x, w, b): 40 | """ 41 | Forward pass of an affine layer 42 | :param x: input of dimension (I, ) 43 | :param w: weights matrix of dimension (I, O) 44 | :param b: bias vector of dimension (O, ) 45 | :return output of dimension (O, ), and the cache needed for backprop 46 | """ 47 | out = np.dot(x, w) + b 48 | cache = (x, w) 49 | return out, cache 50 | 51 | 52 | def affine_backward(dout, cache): 53 | """ 54 | Backward pass for an affine layer. 55 | :param dout: Upstream Jacobian, of shape (M, O) 56 | :param cache: Tuple of: 57 | - x: Input data, of shape (I, ) 58 | - w: Weights, of shape (I, O) 59 | :return the jacobian matrix containing derivatives of the M neural network outputs with respect to 60 | this layer's inputs, evaluated at x, of shape (M, I) 61 | """ 62 | x, w = cache 63 | dx = np.dot(dout, w.T) 64 | return dx 65 | 66 | 67 | def relu_forward(x): 68 | """ Forward ReLU 69 | """ 70 | out = np.maximum(np.zeros(x.shape), x) 71 | cache = x 72 | return out, cache 73 | 74 | 75 | def relu_backward(dout, cache): 76 | """ 77 | Backward pass of ReLU 78 | :param dout: Upstream Jacobian 79 | :param cache: the cached input for this layer 80 | :return: the jacobian matrix containing derivatives of the M neural network outputs with respect to 81 | this layer's inputs, evaluated at x.
82 | """ 83 | x = cache 84 | dx = dout * np.where(x > 0, np.ones(x.shape), np.zeros(x.shape)) 85 | return dx 86 | 87 | 88 | def softmax_forward(x): 89 | """ Forward softmax 90 | """ 91 | exps = np.exp(x - np.max(x)) 92 | s = exps / exps.sum() 93 | return s, s 94 | 95 | 96 | def softmax_backward(dout, cache): 97 | """ 98 | Backward pass for softmax 99 | :param dout: Upstream Jacobian 100 | :param cache: contains the cache (in this case the output) for this layer 101 | """ 102 | s = cache 103 | ds = np.diag(s) - np.outer(s, s.T) 104 | dx = np.dot(dout, ds) 105 | return dx 106 | 107 | 108 | def get_activations(model, layer_id, X): 109 | """ 110 | Computes outputs of intermediate layers 111 | :param model: the trained model 112 | :param layer_id: the id of the layer that we want the output from 113 | :param X: input feature vector 114 | :return: output of layer (layer_id) 115 | """ 116 | intermediate_layer_model = keras.models.Model(inputs=model.input, 117 | outputs=model.layers[layer_id].output) 118 | intermediate_output = intermediate_layer_model.predict(X) 119 | return intermediate_output 120 | 121 | 122 | def forward_backward(model, x): 123 | """ 124 | computes the forward derivative for the given input 125 | :param model: the trained model 126 | :param x: input feature vector 127 | :return: prediction result and forward derivative 128 | """ 129 | layer_to_cache = dict() # for each layer, we store the cache needed for backward pass 130 | forward_values = [] 131 | 132 | for i in range(0, len(model.layers), 2): 133 | values = {} 134 | w, b = model.layers[i].get_weights() 135 | values['w'] = w 136 | values['b'] = b 137 | forward_values.append(values) 138 | 139 | # Forward pass 140 | a1, cache_a1 = affine_forward(x, forward_values[0]['w'], forward_values[0]['b']) 141 | _, cache_r1 = relu_forward(a1) 142 | r1 = get_activations(model, 0, x) 143 | forward_values[0]['a'] = a1 144 | forward_values[0]['cache_a'] = cache_a1 145 | forward_values[0]['r'] = r1 146 | forward_values[0]['cache_r'] = cache_r1 147 | 148 | for i, layer_index in zip(range(1, len(forward_values) - 1), range(2, len(model.layers), 2)): 149 | a, cache_a = affine_forward(forward_values[i - 1]['r'], forward_values[i]['w'], forward_values[i]['b']) 150 | _, cache_r = relu_forward(a) 151 | r = get_activations(model, layer_index, x) 152 | forward_values[i]['a'] = a 153 | forward_values[i]['cache_a'] = cache_a 154 | forward_values[i]['r'] = r 155 | forward_values[i]['cache_r'] = cache_r 156 | 157 | a, cache_a = affine_forward(forward_values[len(forward_values) - 2]['r'], 158 | forward_values[len(forward_values) - 1]['w'], 159 | forward_values[len(forward_values) - 1]['b']) 160 | forward_values[len(forward_values) - 1]['a'] = a 161 | forward_values[len(forward_values) - 1]['cache_a'] = cache_a 162 | out, cache_out = softmax_forward(a) 163 | 164 | # backward pass 165 | dout = np.diag(np.ones(out.size, )) # the derivatives of each output w.r.t. each output. 
166 | dout = softmax_backward(dout, cache_out) 167 | dout = affine_backward(dout, forward_values[len(forward_values) - 1]['cache_a']) 168 | 169 | for i in range(len(forward_values) - 2, 0, -1): 170 | dout = relu_backward(dout, forward_values[i]['cache_r']) 171 | dout = affine_backward(dout, forward_values[i]['cache_a']) 172 | 173 | dout = relu_backward(dout, forward_values[0]['cache_r']) 174 | dx = affine_backward(dout, forward_values[0]['cache_a']) 175 | 176 | return out, dx 177 | 178 | 179 | def craft_adversarial_samples(x, y, F, k): 180 | """ 181 | JSMA variant of the adversarial example crafting algorithm, as described in https://arxiv.org/abs/1606.04435 182 | JSMA iteratively selects the most useful features to perturb by a small magnitude until the target class is 183 | achieved. The perturbed features are selected based on the saliency map. Saliency maps are used for a network's 184 | visualization and describe which features are the most important for a particular output class. The goal 185 | is to eliminate those attributes from a legitimate sample and bring up the most important ones for the target class 186 | in order to cause the model to misclassify. This is done by pushing the features away from the original label 187 | and closer to the target class. 188 | Steps: 189 | 1) Compute the gradient of F with respect to the input x to estimate the direction in which a perturbation in x 190 | would change F's output. That is, compute the forward derivative (the Jacobian of the learned function for 191 | a legitimate sample): 192 | ∇F(x) = ∂F(x)/∂x = [∂F_j(x)/∂x_i], i ∈ 1…M, j ∈ 1…N 193 | where x is the model's input, F is the network, F(x) the predicted class, M the input dimension, 194 | N the output dimension, and entry (j, i) is the derivative of output class j with respect to input feature i. 195 | In essence, this computes the gradient of F with respect to the input x to estimate the direction in which 196 | a perturbation in x would change the output. In backpropagation, the derivative is taken 197 | with respect to the loss function and the gradients with respect to the network parameters, with the goal of 198 | updating the weights. On the contrary, in JSMA the forward derivative is taken with respect to the network 199 | directly and the gradients with respect to the input data. 200 | 2) Choose a perturbation δ of x with maximal positive gradient into the target class y'. 201 | In other words, choose the index that maximizes the change into the target class 0 by changing x_i. 202 | The limitation is that we can only add features and not discard them, since in a real-world scenario an adversary doesn't want 203 | to 'break' the functionality of an application.
204 | Algorithm: 205 | Input: x, y, F, k, I 206 | x_adv <- x 207 | Gamma = {1...|x|} 208 | while arg max_j F_j(x_adv) != y and ||δ_x||_1 < k do 209 | compute the forward derivative ∇F(x_adv) 210 | i_max = arg max_{i ∈ Gamma ∩ I, x_adv_i = 0} ∂F_y(x_adv)/∂x_i 211 | if i_max <= 0 then 212 | return Failure 213 | end if 214 | x_adv[i_max] = 1 215 | δ_x <- x_adv - x 216 | end while; return x_adv 217 | :param x: input feature vector 218 | :param y: target class 219 | :param F: the trained model 220 | :param k: maximum allowed distortion (upper bound on the L1 norm of the perturbation δ_x) 221 | :return: adversarial sample based on feature vector x 222 | """ 223 | x_adv = x # note: this aliases x, so the caller's array is modified in place 224 | gamma = [1] * len(x) 225 | delta_x = [0] 226 | changes = 0 227 | 228 | if np.argmax(F.predict(x_adv), 1) == 0: # already classified as benign (class 0): nothing to craft 229 | return x_adv, -1 230 | 231 | while np.argmax(F.predict(x_adv), 1) != y and np.linalg.norm(delta_x, ord=1) < k and changes < 20: 232 | # compute the forward derivative (Jacobian) 233 | prob, forward_derivative = forward_backward(F, x_adv) 234 | 235 | tmp = np.multiply(forward_derivative[0], gamma) 236 | for i, feature in enumerate(x_adv[0]): 237 | if feature == 1: 238 | tmp[i] = 0 # never select features that are already present 239 | i_max = np.argmax(tmp) 240 | if i_max <= 0: 241 | raise ValueError('FAILURE: We can only add features to an application!') 242 | 243 | x_adv[0][i_max] = 1 244 | delta_x = np.subtract(x_adv, x) 245 | # print(i_max) 246 | if i_max not in changes_dict: 247 | changes_dict[i_max] = 1 248 | else: 249 | changes_dict[i_max] += 1 250 | changes += 1 251 | print("Changes:", changes) 252 | 253 | return x_adv, changes 254 | 255 | 256 | def evaluate_detector_on_adversarial_examples(): 257 | average_changes = 0 258 | amount_malwares = 0 259 | averageChanges = 0 260 | # attack the detector 261 | for i in range(len(val_data)): 262 | 263 | if val_labels[i] == 1: 264 | 265 | x = val_data[i:i + 1] 266 | # print("x: ", x) 267 | # print(x.shape) 268 | try: 269 | adv_x, changes = craft_adversarial_samples(x, 0, detector, 1) 270 | # print(adv_x) 271 | val_data[i] = adv_x 272 | if changes >= 0: 273 | average_changes += changes 274 | amount_malwares += 1 275 | except NameError: 276 | pass 277 | except ValueError: 278 | pass 279 | if amount_malwares > 0: 280 | averageChanges += (average_changes / float(amount_malwares)) 281 | 282 | # evaluate the detector 283 | predictions = detector.predict(val_data) 284 | confusion = confusion_matrix(val_labels, np.argmax(predictions, axis=1)) 285 | print(confusion) 286 | TP = confusion[1, 1] 287 | TN = confusion[0, 0] 288 | FP = confusion[0, 1] 289 | FN = confusion[1, 0] 290 | FNR = FN / float(FN + TP) * 100 291 | FPR = FP / float(FP + TN) * 100 292 | accuracy = ((TP + TN) / float(TP + TN + FP + FN)) * 100 293 | print("Adversarial FP:", FP, "- FN:", FN, "- TP:", TP, "- TN", TN) 294 | print("Adversarial Accuracy:", accuracy, "- FPR:", FPR, "- FNR:", FNR) 295 | print("Misclassification Rate:", FNR - FNR_original) 296 | print("Distortion:", averageChanges) 297 | print(changes_dict) 298 | 299 | 300 | if __name__ == "__main__": 301 | total_features = 3880 # total unique features 302 | print("Creating data-labels...") 303 | onehot.create_list_of_apps() # function from set_onehot_encoding.py 304 | 305 | changes_dict = {} # dictionary for perturbations (added features) 306 | 307 | trained_model = tf.keras.models.load_model('best_model_Adam.h5') 308 | #detector = tf.keras.models.load_model('external_detector.h5') 309 | 310 | averageChanges = 0 311 | 312 | val_data, val_labels = create_set() 313 | predict_original = trained_model.predict(val_data) 314 | confusion =
confusion_matrix(val_labels, np.argmax(predict_original, axis=1)) 315 | 316 | TP = confusion[1, 1] 317 | TN = confusion[0, 0] 318 | FP = confusion[0, 1] 319 | FN = confusion[1, 0] 320 | FNR_original = FN / float(FN + TP) * 100 321 | FPR = FP / float(FP + TN) * 100 322 | accuracy = ((TP + TN) / float(TP + TN + FP + FN)) * 100 323 | print(confusion) 324 | print("Original FP:", FP, "- FN:", FN, "- TP:", TP, "- TN", TN) 325 | print("Original Accuracy:", accuracy, "- FPR:", FPR, "- FNR:", FNR_original) 326 | del predict_original 327 | average_changes = 0 328 | amount_malwares = 0 329 | val_data, val_labels = create_set() 330 | 331 | for i in range(len(val_data)): 332 | 333 | if val_labels[i] == 1: 334 | 335 | x = val_data[i:i + 1] 336 | #print("x: ", x) 337 | #print(x.shape) 338 | try: 339 | adv_x, changes = craft_adversarial_samples(x, 0, trained_model, 1) 340 | # print(adv_x) 341 | val_data[i] = adv_x 342 | if changes >= 0: 343 | average_changes += changes 344 | amount_malwares += 1 345 | except NameError: 346 | pass 347 | except ValueError: 348 | pass 349 | 350 | if amount_malwares > 0: 351 | averageChanges += (average_changes / float(amount_malwares)) 352 | #print(val_data.shape) 353 | 354 | # evaluate the model on adversarial examples 355 | predictions = trained_model.predict(val_data) 356 | confusion = confusion_matrix(val_labels, np.argmax(predictions, axis=1)) 357 | print(confusion) 358 | TP = confusion[1, 1] 359 | TN = confusion[0, 0] 360 | FP = confusion[0, 1] 361 | FN = confusion[1, 0] 362 | FNR = FN / float(FN + TP) * 100 363 | FPR = FP / float(FP + TN) * 100 364 | accuracy = ((TP + TN) / float(TP + TN + FP + FN)) * 100 365 | print("Adversarial FP:", FP, "- FN:", FN, "- TP:", TP, "- TN", TN) 366 | print("Adversarial Accuracy:", accuracy, "- FPR:", FPR, "- FNR:", FNR) 367 | print("Misclassification Rate:", FNR - FNR_original) 368 | print("Distortion:", averageChanges) 369 | print(changes_dict) 370 | 371 | #evaluate_detector_on_adversarial_examples() 372 | -------------------------------------------------------------------------------- /feature_based_reduced_dataset/label_encoding.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | feature_vector = {} 4 | index = 0 5 | feature_vectors_dir = 'new_feature_vectors/' 6 | feature_indexes_dir = 'features_indexes/' 7 | 8 | if not os.path.exists(feature_indexes_dir): 9 | os.makedirs(feature_indexes_dir) 10 | 11 | print("Creating a dictionary that maps features to numeric values...") 12 | for filename in os.listdir(feature_vectors_dir): # read all apps 13 | with open(feature_vectors_dir + filename, "r") as file: # open an app 14 | for line in file: # read the app line by line 15 | feature_type = line[:line.find('::')] # extract the feature type 16 | feature = line.strip() # remove whitespace chars 17 | # if a feature is not present in the feature vector, map the feature to the index and increment the index 18 | if feature not in feature_vector: 19 | feature_vector[feature] = index 20 | index = index + 1 21 | print("Finished!") 22 | 23 | print("Creating files with numeric values as features...") 24 | for filename in os.listdir(feature_vectors_dir): # recreate the files with indexes 25 | with open(feature_vectors_dir + filename, "r") as file: # first open the original feature vectors 26 | f = open(feature_indexes_dir + filename, "a") # create a new file with the same SHA name in another dir 27 | for line in file: # read the original feature vectors line by line 28 | feature_type = line[:line.find('::')] # extract
feature type 29 | feature = line.strip() # remove whitespace chars 30 | f.write(str(feature_vector[feature]) + '\n') # append the index of the feature to the new file 31 | f.close() 32 | 33 | print("Total features in dataset: ", len(feature_vector)) # 34 | -------------------------------------------------------------------------------- /feature_based_reduced_dataset/neural_network.py: -------------------------------------------------------------------------------- 1 | from sklearn.metrics import confusion_matrix 2 | import timeit 3 | from keras import Sequential 4 | from keras.layers import Dense, Dropout 5 | import numpy as np 6 | import matplotlib.pyplot as plt 7 | from keras.callbacks import TensorBoard 8 | from keras.callbacks import EarlyStopping 9 | from keras.callbacks import ModelCheckpoint 10 | from keras.optimizers import Adam, SGD, RMSprop, Adagrad, Adadelta, Adamax, Nadam 11 | 12 | average_FNR = 0 13 | average_FPR = 0 14 | average_accuracy = 0 15 | 16 | 17 | def generate_neural_network(total_features, units, dropout, learn_rate, kernel, bias, activation_function): 18 | """ 19 | :param total_features: the total number of features (input_dim) used to train our network 20 | :param units: neurons in the hidden layers 21 | :param dropout: the dropout rate 22 | :param learn_rate: learning rate 23 | :param kernel: (kernel_initializer) weight initialization 24 | :param bias: (bias_initializer) bias initialization 25 | :param activation_function: activation function 26 | :return: the compiled Keras model 27 | """ 28 | model = Sequential() # neural net init 29 | """ 30 | add the input layer with dimension total_features, 31 | hidden layers with the defined units, dropout rate, weight and bias initialization, 32 | the relu activation function, and softmax in the output layer 33 | """ 34 | model.add(Dense(units=units[0], activation=activation_function, input_dim=total_features, kernel_initializer=kernel, 35 | bias_initializer=bias)) 36 | model.add(Dropout(dropout)) # add the dropout rate 37 | 38 | for hidden_layer_units in units[1:]: # add the hidden layers with the units defined in train_models.py 39 | model.add(Dense(units=hidden_layer_units, activation=activation_function, kernel_initializer=kernel, 40 | bias_initializer=bias)) 41 | model.add(Dropout(dropout)) 42 | 43 | model.add(Dense(2, activation="softmax")) # output layer with the softmax activation function and 2 neurons 44 | 45 | # loss: sparse categorical cross entropy, Optimizer: Adam 46 | model.compile(loss="sparse_categorical_crossentropy", 47 | optimizer=Adam(lr=learn_rate), 48 | metrics=["accuracy"]) 49 | 50 | """ 51 | information about the NN, such as the number of layers, the output shape, 52 | the number of weights in each layer and the total weights.
53 | """ 54 | #model.summary() 55 | 56 | # plot of the neural network graph 57 | #plot_model(model, to_file="figures/DNN_model_plot.png", show_shapes=True, show_layer_names=True) 58 | 59 | return model 60 | 61 | 62 | def train_neural_network(model, epochs, batch_size, features, labels, verbose=0, 63 | validation=False, val_data=None, val_labels=None, 64 | callbacks=False, plot_history=False): 65 | """ 66 | :param modelh5: neural network model from generate_neural_network() 67 | :param epochs: number of epochs 68 | :param batch_size: batch size 69 | :param features: training data 70 | :param labels: training labels 71 | :param verbose: verbosity level 72 | :param validation: if True validate data 73 | :param val_data: validation data 74 | :param val_labels: validation labels 75 | :param callbacks: if True use Tensorboard callback 76 | :param plot_history: if True plots accuracy and loss history per epoch 77 | :return: 78 | """ 79 | print("\n\n--- Training", type(model).__name__, "---") 80 | start_time = timeit.default_timer() 81 | 82 | # get the name of the optimizer in the defined model 83 | opt_config = model.optimizer.get_config() 84 | if 'name' not in opt_config.keys(): 85 | _name = str(model.optimizer.__class__).split('.')[-1].replace('\'', '').replace('>', '') 86 | opt_config.update({'name': _name}) 87 | 88 | if callbacks: 89 | # directory to save callbacks 90 | log_dir = "logs/fit/" + "DNN_200_200_" + opt_config['name'] 91 | # callbacks: TensorBoard, EarlyStopping, ModelCheckPoint 92 | # TensorBoard for storing visualizations of the neural net 93 | tensorboard_callback = TensorBoard(log_dir=log_dir, histogram_freq=1, write_graph=True, write_images=True) 94 | # EarlyStopping to monitor validation loss. If there is any improve after 10 epochs,the training procedure stops 95 | early_stopping_callback = EarlyStopping(monitor='val_loss', mode='min', patience=10, verbose=verbose) 96 | # ModelCheckpoint to monitor validation accuracy. 
It stores the model with the highest accuracy 97 | model_checkpoint_callback = ModelCheckpoint('best_model_' + opt_config['name'] + '.h5', monitor='val_accuracy', mode='max', 98 | verbose=verbose, save_best_only=True) 99 | if not validation: 100 | # fit the model 101 | print("Note: Validation data is not included... Only the TensorBoard callback is used!") 102 | history = model.fit(features, labels, epochs=epochs, batch_size=batch_size, verbose=verbose, 103 | callbacks=[tensorboard_callback]) # train the neural network 104 | else: 105 | # fit the model 106 | history = model.fit(features, labels, epochs=epochs, batch_size=batch_size, verbose=verbose, 107 | validation_data=(val_data, val_labels), 108 | callbacks=[tensorboard_callback, early_stopping_callback, model_checkpoint_callback]) 109 | else: # train the model without the use of callbacks 110 | history = model.fit(features, labels, epochs=epochs, batch_size=batch_size, verbose=verbose) 111 | 112 | if plot_history: # plot the accuracy and loss per epoch 113 | if not validation: 114 | # print(history.history.keys()) 115 | # summarize history for training accuracy 116 | plt.plot(history.history['accuracy']) 117 | plt.title('model accuracy') 118 | plt.ylabel('accuracy') 119 | plt.xlabel('epoch') 120 | plt.legend(['train'], loc='upper left') 121 | plt.show() 122 | # summarize history for training loss 123 | plt.plot(history.history['loss']) 124 | plt.title('model loss') 125 | plt.ylabel('loss') 126 | plt.xlabel('epoch') 127 | plt.legend(['train'], loc='upper left') 128 | plt.show() 129 | else: 130 | # summarize history for training and validation accuracy 131 | plt.plot(history.history['accuracy']) 132 | plt.plot(history.history['val_accuracy']) 133 | plt.title('model accuracy') 134 | plt.ylabel('accuracy') 135 | plt.xlabel('epoch') 136 | plt.legend(['train', 'test'], loc='upper left') 137 | plt.show() 138 | # summarize history for training and validation loss 139 | plt.plot(history.history['loss']) 140 | plt.plot(history.history['val_loss']) 141 | plt.title('model loss') 142 | plt.ylabel('loss') 143 | plt.xlabel('epoch') 144 | plt.legend(['train', 'test'], loc='upper left') 145 | plt.show() 146 | stop_time = timeit.default_timer() 147 | print(type(model).__name__, "training time: ", stop_time - start_time, "seconds\n\n") 148 | 149 | 150 | def evaluate_neural_network(model, features, labels): 151 | """ 152 | :param model: neural network model from generate_neural_network() 153 | :param features: test data 154 | :param labels: test labels 155 | :return: the accuracy (in percent) on the given data 156 | """ 157 | scores = model.evaluate(features, labels, verbose=0) 158 | print(model.metrics_names[1], "%.2f%%" % (scores[1] * 100)) 159 | return scores[1] * 100 160 | 161 | 162 | def test_neural_network(model, test_data, test_labels): 163 | """ 164 | :param model: neural network model from generate_neural_network() 165 | :param test_data: validation data 166 | :param test_labels: validation labels 167 | :return: 168 | """ 169 | global average_FNR, average_FPR, average_accuracy 170 | print(type(model).__name__, "predicting...") 171 | start_time = timeit.default_timer() 172 | predicted = model.predict(test_data) 173 | stop_time = timeit.default_timer() 174 | # print(predicted) 175 | # pick the class with the highest probability 176 | confusion = confusion_matrix(test_labels, np.argmax(predicted, axis=1)) # confusion matrix 177 | print(confusion) 178 | # confusion matrix metrics 179 | TP = confusion[1, 1] 180 | TN = confusion[0, 0] 181 | FP = confusion[0, 1] 182 | FN = confusion[1, 0] 183 | FNR = FN
/ float(FN + TP) * 100 184 | FPR = FP / float(FP + TN) * 100 185 | accuracy = (TP + TN) / float(TP + TN + FP + FN) * 100 186 | print("FP:", FP, "- FN:", FN, "- TP:", TP, "- TN", TN) 187 | print("Accuracy:", accuracy, "- FPR:", FPR, "- FNR:", FNR) 188 | print(type(model).__name__, "prediction time: ", stop_time - start_time, "seconds\n\n") 189 | average_FNR += FNR 190 | average_FPR += FPR 191 | average_accuracy += accuracy 192 | 193 | 194 | def get_average_metrics(val_runs): 195 | global average_FNR, average_FPR, average_accuracy 196 | average_FNR = average_FNR / val_runs 197 | average_FPR = average_FPR / val_runs 198 | average_accuracy = average_accuracy / val_runs 199 | print("Average Accuracy:", average_accuracy, "- Average FPR:", average_FPR, "- Average FNR:", average_FNR) 200 | -------------------------------------------------------------------------------- /feature_based_reduced_dataset/set_onehot_encoding.py: -------------------------------------------------------------------------------- 1 | import csv 2 | from random import randint, shuffle 3 | import os 4 | import numpy as np 5 | 6 | csv_malware = "../../sha256_family.csv" # csv file with the malware apps 7 | feature_index_dir = 'features_indexes/' # directory with the indexed features of all apps 8 | 9 | original_malware = [] 10 | eliminated_malware = [] 11 | benign = [] 12 | 13 | 14 | def create_list_of_apps(): 15 | print("Creating list of malicious apps...") 16 | with open(csv_malware, 'r') as file: # open the malware csv file 17 | next(file) # skip the header line 18 | reader = csv.reader(file, delimiter=',') # read the csv malware families 19 | for row in reader: 20 | original_malware.append(row[0]) # append every row from the csv file into a list 21 | for filename in os.listdir(feature_index_dir): 22 | if filename in original_malware: 23 | eliminated_malware.append(filename) 24 | print("Malware apps found: ", len(eliminated_malware)) # 2591 after duplicate elimination 25 | print("Malware sample: ", eliminated_malware[randint(0, len(eliminated_malware) - 1)]) # print a random malware sample 26 | 27 | print("Creating list of benign apps...") 28 | for filename in os.listdir(feature_index_dir): # read all apps 29 | if filename not in original_malware: # if a SHA name is not in the malware list, append it to the benign list 30 | benign.append(filename) 31 | print("Benign apps found: ", len(benign)) # 89345 after duplicate elimination 32 | print("Benign app sample: ", benign[randint(0, len(benign) - 1)], ) # print a random benign app 33 | 34 | print("Total apps (Benign & Malicious) found: ", len(eliminated_malware) + len(benign)) # 91936 35 | 36 | malware_incremental_counter = 0 37 | benign_incremental_counter = 0 38 | 39 | 40 | def generate_set_incremental(set_size, malware_ratio): 41 | global malware_incremental_counter, benign_incremental_counter 42 | set = [] # list that will be filled with the sampled apps 43 | 44 | print("Creating set with", set_size, "samples...") 45 | print("Malware ratio:", int(malware_ratio * 100), "%, totaling", int(set_size * malware_ratio), "apps in", set_size) 46 | print("Creating malware set...") 47 | 48 | while len(set) < (set_size * malware_ratio): 49 | app = eliminated_malware[malware_incremental_counter] # take the next malware app in order (incremental, not random) 50 | malware_incremental_counter += 1 51 | if malware_incremental_counter >= 2591: 52 | break 53 | if app not in set: 54 | set.append(app) # append the malware app to the set 55 | 56 | print("Total malware apps in set: ", len(set)) 57 | print("Malware sample in set: ", set[0]) 58 | 59 | print("Creating benign set...") 60 | 61 | while
len(set) < set_size: 62 | app = benign[benign_incremental_counter] # take the next benign app in order (incremental, not random) 63 | benign_incremental_counter += 1 64 | if benign_incremental_counter >= 89345: 65 | break 66 | if app not in set: 67 | set.append(app) # append the benign app to the set 68 | print(malware_incremental_counter) 69 | print("Total apps (malicious and benign) in set: ", len(set)) 70 | return set 71 | 72 | 73 | def generate_set(set_size, malware_ratio): 74 | set = [] # list that will be filled with the sampled apps 75 | 76 | print("Creating set with", set_size, "samples...") 77 | print("Malware ratio:", int(malware_ratio * 100), "%, totaling", int(set_size * malware_ratio), "apps in", set_size) 78 | print("Creating malware set...") 79 | 80 | while len(set) < (set_size * malware_ratio): 81 | index = randint(0, len(eliminated_malware) - 1) # choose a random index in [0, 2590] 82 | app = eliminated_malware[index] # locate the malware app at the random index in the malware list 83 | if app not in set: 84 | set.append(app) # append the malware app to the set 85 | 86 | print("Total malware apps in set: ", len(set)) 87 | print("Malware sample in set: ", set[0]) 88 | 89 | print("Creating benign set...") 90 | while len(set) < set_size: 91 | index = randint(0, len(benign) - 1) # choose a random index in [0, 89344] 92 | app = benign[index] # locate the benign app at the random index in the benign list 93 | if app not in set: 94 | set.append(app) # append the benign app to the set 95 | 96 | print("Total apps (malicious and benign) in set: ", len(set)) 97 | return set 98 | 99 | 100 | def generate_input(set, total_features): 101 | print("performing one hot encoding...") 102 | # a 2D array filled with zeros that will hold the feature vector of each app 103 | data = np.zeros((len(set), total_features), dtype=float) 104 | # an array filled with zeros that will hold the label of each app {0 - benign, 1 - malicious} 105 | labels = np.zeros((len(set),), dtype=int) 106 | shuffle(set) # shuffle the set; comment out to work with predefined training and test sets 107 | for id_app, app in enumerate(set): # iterate through the set with a counter 108 | with open(feature_index_dir + app, 'r') as file: # open each app in the set 109 | for index in file: # read line by line 110 | data[id_app][int(index)] = 1.0 # set the corresponding element of the array to 1.0 111 | 112 | if app in eliminated_malware: 113 | labels[id_app] = 1 # set the corresponding label to 1 if the app is malware 114 | else: 115 | labels[id_app] = 0 116 | 117 | #print(data) 118 | #print(labels) 119 | #print(data.shape) 120 | #print(labels.shape) 121 | return data, labels -------------------------------------------------------------------------------- /feature_based_reduced_dataset/train_models.py: -------------------------------------------------------------------------------- 1 | import set_onehot_encoding as onehot 2 | import feature_based_original_dataset.models as models 3 | import neural_network as NN 4 | import numpy as np 5 | import os 6 | 7 | 8 | def create_sets(): 9 | 10 | if not os.path.isfile("training_set_8500.txt"): 11 | set_size = 8500 12 | malware_ratio = 0.3 13 | print("Creating data-labels...") 14 | print("Generating TRAINING set...") 15 | training_set = onehot.generate_set(set_size, malware_ratio) # generate a random training set 16 | with open("training_set_8500.txt", "w") as file: 17 | for item in training_set: 18 | file.write(str(item) + "\n") 19 | 20 | if not os.path.isfile("testing_set_8500.txt"): 21 | set_size = 8500 22 | malware_ratio = 0.3 23 |
print("Creating data-labels...") 24 | print("Generating TESTING set...") 25 | testing_set = onehot.generate_set(set_size, malware_ratio) # generate random testing set 26 | with open("testing_set_1500.txt", "w") as file: 27 | for item in testing_set: 28 | file.write(str(item) + "\n") 29 | 30 | training_set = [] 31 | testing_set = [] 32 | 33 | with open("training_set_8500.txt", "r") as file: # read training set file and append applications to list 34 | for line in file: 35 | line.strip() # remove whitespace 36 | line = line[:-1] # remove \n 37 | training_set.append(line) # add item to list 38 | with open("testing_set_8500.txt", "r") as file: # read testing set file and append applications to list 39 | for line in file: 40 | line.strip() 41 | line = line[:-1] 42 | testing_set.append(line) 43 | print("Generating TRAINING input...") 44 | data, labels = onehot.generate_input(training_set, total_features) # perform one-hot encoding 45 | print("Generating TESTING input...") 46 | test_data, test_labels = onehot.generate_input(testing_set, total_features) # perform one-hot encoding 47 | return data, labels, test_data, test_labels 48 | 49 | 50 | def train_models(): 51 | data, labels, test_data, test_labels = create_sets() 52 | 53 | model = NB.train_multi_naive_bayes_classifier(data,labels, save=False) 54 | NB.test_multi_naive_bayes_classifier(model, test_data, test_labels) 55 | 56 | model = DT.train_decision_tree_classifier(data, labels, save=False) 57 | DT.test_decision_tree_classifier(model, test_data, test_labels) 58 | 59 | model = RF.train_random_forest_classifier(data, labels, save=False) 60 | RF.test_random_forest_classifier(model, test_data, test_labels) 61 | 62 | model = KNN.train_knn_classifier(data, labels, save=False) 63 | KNN.test_knn_classifier(model, test_data, test_labels) 64 | 65 | model = LR.train_logistic_regression_classifier(data, labels, save=False) 66 | LR.test_logistic_regression_classifier(model, test_data, test_labels) 67 | 68 | model = SVM.train_svm_classifier(data, labels, save=False) 69 | SVM.test_svm_classifier(model, test_data, test_labels) 70 | 71 | # init the neural net 72 | model = NN.generate_neural_network(total_features, units, dropout, learn_rate, kernel_initializer, 73 | bias_initializer, activation_function) 74 | """ 75 | train the neural network with the given model, epochs, batch size, train data-labels. 76 | Specify verbosity level, validation data, callbacks and plots if needed. 77 | Default parameters: 78 | verbose=0, validation=False, val_data=None, val_labels=None, callbacks=False, plot_history=False 79 | example: 80 | NN.train_neural_network(model, epochs, batch_size, data, labels, verbose=0, 81 | validation=True, val_data=test_data, val_labels=test_labels, 82 | callbacks=True, plot_history=True) 83 | This is the main training stage and thus we want to save the best models at the right times. This is done 84 | setting the callback to True. Keras will seek for the minimum validation loss and it saves the model with 85 | the highest validation accuracy. 
86 | """ 87 | NN.train_neural_network(model, epochs, batch_size, data, labels, verbose=2, 88 | validation=True, val_data=test_data, val_labels=test_labels, 89 | callbacks=True) 90 | NN.test_neural_network(model, test_data, test_labels) 91 | 92 | 93 | if __name__ == "__main__": 94 | total_features = 3880 # total unique features 95 | set_size = 8500 # set site that will be used to create random training set 96 | testing_set_size = 8500 # set site that will be used to create random test set 97 | malware_ratio = 0.3 # malware ratio in the set size 98 | 99 | print("Creating data-labels...") 100 | onehot.create_list_of_apps() # function from set_one_encoding.py 101 | 102 | # initialize sklearn models 103 | NB = models.MultinomialNaiveBayes() 104 | DT = models.DecisionTree() 105 | RF = models.RandomForest() 106 | KNN = models.KNearestNeighbors() 107 | LR = models.LogRegression() 108 | SVM = models.SupportVectorMachine() 109 | 110 | val_runs = 8 111 | # neural net parameters 112 | units = [200, 200] 113 | dropout = 0.2 114 | epochs = 20 115 | batch_size = 150 116 | learn_rate = 0.001 117 | # momentum = 0.0 # to work with SGD 118 | kernel_initializer = 'glorot_uniform' 119 | bias_initializer = 'zeros' 120 | activation_function = 'relu' 121 | 122 | train_models() 123 | -------------------------------------------------------------------------------- /feature_based_reduced_dataset/train_random_subsampl.py: -------------------------------------------------------------------------------- 1 | import set_onehot_encoding as onehot 2 | import models 3 | import neural_network as NN 4 | import numpy as np 5 | 6 | def create_random_sets(): 7 | print("Generating TRAINING set...") 8 | training_set = onehot.generate_set(set_size, malware_ratio) # generate random training set 9 | print("Generating TRAINING input...") 10 | data, labels = onehot.generate_input(training_set, total_features) # perform one-hot encoding 11 | print("Generating TESTING set...") 12 | testing_set = onehot.generate_set(testing_set_size, malware_ratio) # generate random testing set 13 | print("Generating TESTING input...") 14 | test_data, test_labels = onehot.generate_input(testing_set, total_features) # perform one-hot encoding 15 | return data, labels, test_data, test_labels # return train data - labels and test data - labels 16 | 17 | 18 | def random_sub_sampling(runs): 19 | 20 | score_nn = [] 21 | score_rf = [] 22 | score_lr = [] 23 | score_dt = [] 24 | score_svm = [] 25 | score_knn = [] 26 | 27 | for i in range(runs): 28 | 29 | data, labels, test_data, test_labels = create_random_sets() # choose random training and testing sets 30 | 31 | """# init neural net 32 | model = NN.generate_neural_network(total_features, units, dropout, learn_rate, kernel_initializer, 33 | bias_initializer, activation_function) 34 | ''' 35 | this is not the actual training procedure and we don't want to save the models. To save models and implement 36 | the early stopping technique refer to train_models.py 37 | The goal of this operation is only to determine the behavior of models to random training sets and random 38 | testing sets! 39 | So, only train and evaluate models. 
40 |         '''
41 |         NN.train_neural_network(model, epochs, batch_size, data, labels, verbose=2)
42 |         score = NN.evaluate_neural_network(model, test_data, test_labels)
43 |         score_nn.append(score)"""
44 | 
45 |         #model = DT.train_decision_tree_classifier(data, labels)  # train Decision Tree Classifier
46 |         #score_dt.append(DT.evaluate_decision_tree_classifier(model, test_data, test_labels))
47 | 
48 |         #model = RF.train_random_forest_classifier(data, labels)  # train Random Forest
49 |         #score_rf.append(RF.evaluate_random_forest_classifier(model, test_data, test_labels))
50 | 
51 |         model = KNN.train_knn_classifier(data, labels)  # train k-Nearest Neighbors Classifier
52 |         score_knn.append(KNN.evaluate_knn_classifier(model, test_data, test_labels))  # accumulate scores across runs
53 | 
54 |         #model = LR.train_logistic_regression_classifier(data, labels)  # train Logistic Regression
55 |         #score_lr.append(LR.evaluate_logistic_regression_classifier(model, test_data, test_labels))
56 | 
57 |         #model = SVM.train_svm_classifier(data, labels)  # train Support Vector Machines
58 |         #score_svm.append(SVM.evaluate_svm_classifier(model, test_data, test_labels))
59 | 
60 |     #print("NN Average accuracy: ", np.mean(score_nn), "Standard Deviation:", np.std(score_nn))
61 |     #print("DT Average accuracy: ", np.mean(score_dt), "Standard Deviation:", np.std(score_dt))
62 |     #print("RF Average accuracy: ", np.mean(score_rf), "Standard Deviation:", np.std(score_rf))
63 |     print("kNN Average accuracy: ", np.mean(score_knn), "Standard Deviation:", np.std(score_knn))
64 |     #print("LR Average accuracy: ", np.mean(score_lr), "Standard Deviation:", np.std(score_lr))
65 |     #print("SVM Average accuracy: ", np.mean(score_svm), "Standard Deviation:", np.std(score_svm))
66 | 
67 | 
68 | 
69 | if __name__ == "__main__":
70 |     total_features = 3880  # total unique features
71 |     set_size = 8500  # size of the random training set
72 |     testing_set_size = 8500  # size of the random test set
73 |     malware_ratio = 0.3  # malware ratio within each set
74 | 
75 |     print("Creating data-labels...")
76 |     onehot.create_list_of_apps()  # function from set_onehot_encoding.py
77 | 
78 |     DT = models.DecisionTree()
79 |     RF = models.RandomForest()
80 |     KNN = models.KNearestNeighbors()
81 |     LR = models.LogRegression()
82 |     SVM = models.SupportVectorMachine()
83 | 
84 |     val_runs = 8  # number of times to train and test a model
85 | 
86 |     # neural net parameters
87 |     units = [200, 200]  # number of neurons in each layer (2 hidden layers)
88 |     dropout = 0.2  # dropout rate
89 |     epochs = 18  # epochs per iteration
90 |     batch_size = 150  # batch size
91 |     learn_rate = 0.001  # learning rate of the specified optimizer
92 |     kernel_initializer = 'glorot_uniform'  # weight initialization
93 |     bias_initializer = 'zeros'  # bias initialization
94 |     activation_function = 'relu'  # activation function in hidden layers (we use Softmax in the output layer)
95 | 
96 |     random_sub_sampling(val_runs)
--------------------------------------------------------------------------------
/preprocessing/README.md:
--------------------------------------------------------------------------------
1 | ## 1) Extracting feature types
2 | 
3 | First, we extract the feature types present in the dataset. We parse each file in the feature_vectors folder line by line and take the feature type to be the characters before the first "::". If the feature type is not already a key in a dictionary, it is added as a new key with the next numeric value. After this operation, we found 11 feature types: feature, activity, intent, provider, call, api_call, url, permission, real_permission, service_receiver, and an empty feature type.
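A minimal sketch of this parsing step (it mirrors extract_feature_types.py, shown in full further below; the file name is a placeholder):

```
feature_types = {}
index = 1
with open("feature_vectors/<sha256>", "r") as file:  # placeholder app file
    for line in file:
        feature_type = line[:line.find("::")]  # characters before the first "::"
        if feature_type not in feature_types:
            feature_types[feature_type] = index  # new type gets the next numeric value
            index += 1
```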
4 | 
5 | ```
6 | python3 extract_feature_types.py
7 | ```
8 | 
9 | 
10 | | Feature Type Found | Feature Type | Class |
11 | | ------------- | ------------- | ------------- |
12 | | provider | Hardware Components | S1 |
13 | | permission | Requested Permissions | S2 |
14 | | activity | Components | S3 |
15 | | service_receiver | Components | S3 |
16 | | intent | Intents | S4 |
17 | | call | Restricted API Calls | S5 |
18 | | real_permission | Used Permissions | S6 |
19 | | api_call | Suspicious API Calls | S7 |
20 | | url | Network Addresses | S8 |
21 | | feature | - | - |
22 | 
23 | 
24 | ## 2) Counting features for each class
25 | 
26 | We use three dictionaries for this operation. The first is predefined, with the feature types found in the previous step as keys; it maps each feature type to its class number. The second records which features, in the form feature_type::feature, have already been seen. The third holds one counter per class, counting the total unique features of that class. We parse each file line by line, as in the previous step; whenever we encounter a feature that is not yet in the second dictionary, we mark it as seen and increment the counter of its corresponding class.
27 | 
28 | ```
29 | python3 count_features_for_each_class.py
30 | ```
31 | 
32 | 
33 | | Class | Feature Type | Amount |
34 | | ------------- | ------------- | ------------- |
35 | | S1 | provider | 4,513 |
36 | | S2 | permission | 3,812 |
37 | | S3 | activity | 185,729 |
38 | | S3 | service_receiver | 33,222 |
39 | | S4 | intent | 6,379 |
40 | | S5 | call | 733 |
41 | | S6 | real_permission | 70 |
42 | | S7 | api_call | 315 |
43 | | S8 | url | 310,488 |
44 | | - | feature | 72 |
45 | 
46 | 
47 | ## 3) Extracting top features
48 | 
49 | We parse the feature vectors of all applications to get the top 10 features in malicious and benign apps separately, in order to observe features with high support in both benign and malware applications. We iterate over each malware application line by line; each feature found is added to a dictionary as a key with an integer value, and that value is incremented by one on every further appearance of the same feature. The same procedure is then applied to benign applications.
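A minimal sketch of this counting for the malware side (it mirrors extract_feature_occurrences.py, shown in full further below; the paths are placeholders):

```
import csv
import os
from collections import Counter

# SHA256 names of the malware apps, taken from the Drebin family csv (placeholder path)
with open("sha256_family.csv") as f:
    next(f)  # skip the header line
    malware = {row[0] for row in csv.reader(f)}

malware_features = Counter()
for filename in os.listdir("feature_vectors"):  # placeholder path
    if filename in malware:
        with open(os.path.join("feature_vectors", filename)) as file:
            for line in file:
                if line[:line.find("::")] != "":  # skip the empty feature type
                    malware_features[line.strip()] += 1

print(malware_features.most_common(10))  # top 10 features in malware apps
```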
50 | 
51 | ```
52 | python3 extract_feature_occurrences.py
53 | ```
54 | 
55 | 
56 | | Top 10 malware features | Amount Malware | Amount Benign |
57 | | ------------- | ------------- | ------------- |
58 | | feature::android.hardware.touchscreen | 5,524 | 123,178 |
59 | | intent::android.intent.action.MAIN | 5,351 | 120,345 |
60 | | permission::android.permission.INTERNET | 5,323 | 102,986 |
61 | | intent::android.intent.category.LAUNCHER | 5,224 | 118,504 |
62 | | call::getSystemService | 5,185 | 104,538 |
63 | | real_permission::android.permission.INTERNET | 4,992 | 103,434 |
64 | | permission::android.permission.READ_PHONE_STATE | 4,931 | 45,085 |
65 | | real_permission::android.permission.READ_PHONE_STATE | 4,186 | 41,877 |
66 | | call::getDeviceId | 3,761 | 41,877 |
67 | | permission::android.permission.WRITE_EXTERNAL_STORAGE | 3,713 | 45,244 |
68 | 
69 | 
70 | ## 4) Getting mean features
71 | 
72 | ```
73 | python3 mean_features.py
74 | ```
75 | 
76 | 
77 | Benign applications have a mean of 46.7 features per application, whereas malicious applications have a mean of 61.7 features per application.
--------------------------------------------------------------------------------
/preprocessing/count_features_for_each_class.py:
--------------------------------------------------------------------------------
1 | """
2 | This file counts the number of features that belong to each class.
3 | """
4 | import os
5 | 
6 | # predefined dictionary whose keys are the feature types found by extract_feature_types.py
7 | features_types = {
8 |     "provider": 1,
9 |     "permission": 2,
10 |     "activity": 3,
11 |     "service_receiver": 4,
12 |     "intent": 5,
13 |     "call": 6,
14 |     "real_permission": 7,
15 |     "api_call": 8,
16 |     "url": 9,
17 |     "feature": 10
18 | }
19 | 
20 | feature_vector = {}
21 | features_occurrences = {x: 0 for x in range(1, 11)}  # we have 10 feature types
22 | # print(features_occurrences)
23 | feature_vectors_dir = '../feature_vectors/'  # directory with features for all apps
24 | 
25 | not_assignable_feature_type = ['']  # found by extract_feature_types.py
26 | print("Counting features for each class...")
27 | for filename in os.listdir(feature_vectors_dir):  # read all apps
28 |     with open(feature_vectors_dir + filename, "r") as file:  # open an app
29 |         for line in file:  # read app line by line
30 |             feature_type = line[:line.find('::')]  # extract feature type
31 |             feature = line.strip()  # remove whitespace chars
32 |             if feature_type not in not_assignable_feature_type:  # skip the empty feature type
33 |                 # if the feature has not been seen yet: mark it as seen and increment its class counter
34 |                 if feature not in feature_vector:
35 |                     feature_vector[feature] = 1  # mark feature as seen
36 |                     temp = features_types.get(feature_type, None)  # class number from the predefined dict
37 |                     features_occurrences[temp] += 1  # increment the counter of that class
38 | 
39 | print(features_occurrences)
40 | #print(len(feature_vector))
41 | print("Total unique features present in dataset: ", sum(features_occurrences.values()))
42 | 
43 | """
44 | {1: 4513, 2: 3812, 3: 185729, 4: 33222, 5: 6379, 6: 733, 7: 70, 8: 315, 9: 310488, 10: 72}
45 | Total unique features present in dataset: 545333
46 | 
47 | Category Type Amount
48 | S1 provider 4513
49 | S2 permission 3812
50 | S3 activity 185729
51 | S3 service_receiver 33222
52 | S4 intent 6379
53 | S5 call 733
54 | S6 real_permission 70
55 | S7 api_call 315
56 | S8 url 310488
57 | - feature 72
58 | """
--------------------------------------------------------------------------------
/preprocessing/extract_feature_occurrences.py:
--------------------------------------------------------------------------------
1 | """
2 | This file extracts the occurrences of each feature present in the dataset.
3 | """
4 | import os
5 | from collections import Counter
6 | import csv
7 | 
8 | feature_vectors_dir = '../feature_vectors/'  # directory with features for all apps
9 | csv_malware = "../sha256_family.csv"  # csv file with malware apps
10 | features = {}  # dictionary for counting features in all apps (malicious & benign)
11 | malware = []  # list of malware
12 | malware_features = {}  # dictionary for counting features in malicious apps
13 | benign_features = {}  # dictionary for counting features in benign apps
14 | not_assignable_feature_type = ['']  # found by extract_feature_types.py
15 | 
16 | 
17 | def count_features_in_apps():
18 |     print("Counting features present in apps...")
19 |     for filename in os.listdir(feature_vectors_dir):  # read all apps
20 |         with open(feature_vectors_dir + filename, "r") as file:  # open an app
21 |             for line in file:  # read app line by line
22 |                 feature_type = line[:line.find('::')]  # extract feature type
23 |                 feature = line.strip()  # remove whitespace chars
24 |                 if feature_type not in not_assignable_feature_type:  # skip the empty feature type
25 |                     # if the feature is not present in the dictionary, add it as a key with a value of 1
26 |                     if feature not in features:
27 |                         features[feature] = 1
28 |                     # otherwise increment its value by one
29 |                     else:
30 |                         features[feature] += 1
31 | 
32 |     print("Total unique features: ", len(features))
33 |     print("Total features: ", sum(features.values()))
34 | 
35 |     print("\n[+]Top 10 features present in apps:")
36 |     top10_features = Counter(features).most_common(10)
37 |     for i in top10_features:
38 |         print(i[0], ":", i[1])
39 | 
40 |     # write a csv file with all feature occurrences
41 |     sorted_features = sorted(features.items(), key=lambda kv: kv[1])
42 |     with open("features_counter.csv", "w") as out_file:  # close the file properly when done
43 |         write_features = csv.writer(out_file, delimiter=' ')
44 |         for key, val in sorted_features:
45 |             write_features.writerow([key, val])
46 | 
47 | def count_features_in_malware():
48 |     print("\nCounting features in malware apps...")
49 | 
50 |     with open(csv_malware, 'r') as file:  # open malware csv file
51 |         next(file)  # skip the header line
52 |         reader = csv.reader(file, delimiter=',')  # read the csv
53 |         for row in reader:
54 |             malware.append(row[0])  # append every SHA name from the csv file into a list
55 | 
56 |     for filename in os.listdir(feature_vectors_dir):  # read all apps
57 |         if filename in malware:  # if the filename is in the malware list
58 |             with open(feature_vectors_dir + filename, "r") as file:  # open malware file
59 |                 for line in file:  # read malware line by line
60 |                     feature_type = line[:line.find('::')]  # extract feature type
61 |                     feature = line.strip()  # remove whitespace chars
62 |                     if feature_type not in not_assignable_feature_type:  # skip the empty feature type
63 |                         # if the feature is not present in the dictionary, add it as a key with a value of 1
64 |                         if feature not in malware_features:
65 |                             malware_features[feature] = 1
66 |                         # otherwise increment its value by one
67 |                         else:
68 |                             malware_features[feature] += 1
69 | 
70 |     print("Total unique features present in malware: ", len(malware_features))
71 |     print("Total features in malware: ", sum(malware_features.values()))
72 | 
73 |     print("\nTop 10 features present in malware:")
74 |     top10_features_malware = Counter(malware_features).most_common(10)
75 |     for i in
top10_features_malware:
76 |         print(i[0], ":", i[1])
77 | 
78 | 
79 | def count_features_in_benign():
80 |     print("\nCounting features in benign apps...")
81 |     for filename in os.listdir(feature_vectors_dir):
82 |         if filename not in malware:
83 |             with open(feature_vectors_dir + filename, "r") as file:
84 |                 for line in file:
85 |                     feature_type = line[:line.find('::')]
86 |                     feature = line.strip()
87 |                     if feature_type not in not_assignable_feature_type:
88 |                         if feature not in benign_features:
89 |                             benign_features[feature] = 1
90 |                         else:
91 |                             benign_features[feature] += 1
92 | 
93 |     print("Total unique features present in benign apps: ", len(benign_features))
94 |     print("Total features in benign apps: ", sum(benign_features.values()))
95 | 
96 |     print("\nTop 10 features present in benign:")
97 |     top10_features_benign = Counter(benign_features).most_common(10)
98 |     for i in top10_features_benign:
99 |         print(i[0], ":", i[1])
100 | 
101 | 
102 | count_features_in_apps()
103 | count_features_in_malware()
104 | count_features_in_benign()
105 | 
106 | """
107 | Counting features present in apps...
108 | Total unique features: 545333
109 | Total features: 6113087
110 | 
111 | Top 10 features present in apps:
112 | feature::android.hardware.touchscreen : 128702
113 | intent::android.intent.action.MAIN : 125696
114 | intent::android.intent.category.LAUNCHER : 123728
115 | call::getSystemService : 109723
116 | real_permission::android.permission.INTERNET : 108426
117 | permission::android.permission.INTERNET : 108309
118 | call::getPackageInfo : 73361
119 | call::printStackTrace : 69675
120 | permission::android.permission.ACCESS_NETWORK_STATE : 67487
121 | real_permission::android.permission.ACCESS_NETWORK_STATE : 64800
122 | 
123 | Counting features in malware apps...
124 | Total unique features present in malware: 15590
125 | Total features in malware: 342794
126 | 
127 | Top 10 features present in malware:
128 | feature::android.hardware.touchscreen : 5524
129 | intent::android.intent.action.MAIN : 5351
130 | permission::android.permission.INTERNET : 5323
131 | intent::android.intent.category.LAUNCHER : 5224
132 | call::getSystemService : 5185
133 | real_permission::android.permission.INTERNET : 4992
134 | permission::android.permission.READ_PHONE_STATE : 4931
135 | real_permission::android.permission.READ_PHONE_STATE : 4186
136 | call::getDeviceId : 3761
137 | permission::android.permission.WRITE_EXTERNAL_STORAGE : 3713
138 | 
139 | Counting features in benign apps...
140 | 
141 | Top 10 features present in benign:
142 | feature::android.hardware.touchscreen : 123178
143 | intent::android.intent.action.MAIN : 120345
144 | intent::android.intent.category.LAUNCHER : 118504
145 | call::getSystemService : 104538
146 | real_permission::android.permission.INTERNET : 103434
147 | permission::android.permission.INTERNET : 102986
148 | call::getPackageInfo : 70604
149 | call::printStackTrace : 65963
150 | permission::android.permission.ACCESS_NETWORK_STATE : 63808
151 | real_permission::android.permission.ACCESS_NETWORK_STATE : 61679
152 | """
--------------------------------------------------------------------------------
/preprocessing/extract_feature_types.py:
--------------------------------------------------------------------------------
1 | """
2 | This file extracts the unique feature types (classes) present in the dataset.
3 | We found a feature type ('') that doesn't belong in any category.
4 | """ 5 | import os 6 | 7 | feature_types = {} # dictionary that will be filled with feature types as keys 8 | index = 1 9 | feature_vectors_dir = '../feature_vectors/' # directory with features for all apps 10 | 11 | print("Creating a dictionary that extracts the feature types (classes)...") 12 | for filename in os.listdir(feature_vectors_dir): # read all apps 13 | with open(feature_vectors_dir + filename, "r") as file: # open an app 14 | for line in file: # read app line by line 15 | feature_type = line[:line.find('::')] # extract feature type 16 | if feature_type not in feature_types: # check if feature type is in dictionary 17 | feature_types[feature_type] = index # append feature type as key with the value of index 18 | index = index + 1 # increment index counter 19 | print("Feature types: ", str(feature_types)) 20 | print("Total feature types found: ", len(feature_types)) 21 | 22 | 23 | """ 24 | { 25 | 'feature': 1, 26 | 'activity': 2, 27 | 'intent': 3, 28 | 'provider': 4, 29 | 'call': 5, 30 | 'api_call': 6, 31 | 'url': 7, 32 | 'permission': 8, 33 | 'real_permission': 9, 34 | 'service_receiver': 10, 35 | '': 11} 36 | """ 37 | 38 | """ 39 | We can divide feature types in 8 classes as: 40 | 1) provider: Hardware components {S1} 41 | 2) permission: Requested Permissions {S2} 42 | 3) activity, service_receiver: Components {S3} 43 | 4) intent: Intents {S4} 44 | 5) call: Restr. API Calls {S5} 45 | 6) real_permission: Used Permissions {S6} 46 | 7) api_call: Susp. API Calls {S7} 47 | 8) url: Network addresses {S8} 48 | -) feature: Not assigned 49 | """ 50 | -------------------------------------------------------------------------------- /preprocessing/mean_features.py: -------------------------------------------------------------------------------- 1 | import os 2 | import csv 3 | 4 | feature_vectors_dir = '../feature_vectors/' # directory with features for all apps 5 | csv_malware = "../sha256_family.csv" # csv file with malwares 6 | features = 0 7 | malware_features = 0 8 | benign_features = 0 9 | malware = [] 10 | not_assignable_feature_type = [''] # found from extract_feature_types.py 11 | 12 | 13 | def count_features_in_apps(): 14 | global features 15 | print("Counting features in apps...") 16 | for filename in os.listdir(feature_vectors_dir): 17 | with open(feature_vectors_dir + filename, "r") as file: 18 | for line in file: 19 | feature_type = line[:line.find('::')] 20 | line.strip() 21 | if feature_type not in not_assignable_feature_type: 22 | features += 1 23 | print("Total features present in apps: ", features) # 6113102 24 | 25 | 26 | def count_features_in_malware(): 27 | global malware_features 28 | print("\nCounting features in malware apps...") 29 | 30 | with open(csv_malware, 'r') as file: # open malware csv file 31 | next(file) # skip the header line 32 | reader = csv.reader(file, delimiter=',') # read the csv 33 | for row in reader: 34 | malware.append(row[0]) # append every SHA name from the csv file into a list 35 | 36 | for filename in os.listdir(feature_vectors_dir): 37 | if filename in malware: 38 | with open(feature_vectors_dir + filename, "r") as file: 39 | for line in file: 40 | # extract feature 41 | feature_type = line[:line.find('::')] 42 | line.strip() 43 | if feature_type not in not_assignable_feature_type: 44 | malware_features += 1 45 | 46 | print("Total features present in malware: ", malware_features) 47 | print("Mean of features in malware: ", malware_features / 5560) 48 | 49 | 50 | def count_features_in_benign(): 51 | print("\nCounting features in benign 
apps...") 52 | global benign_features 53 | for filename in os.listdir(feature_vectors_dir): # read all app's SHA names 54 | if filename not in malware: 55 | with open(feature_vectors_dir + filename, "r") as file: 56 | for line in file: 57 | # extract feature 58 | feature_type = line[:line.find('::')] 59 | if feature_type not in not_assignable_feature_type: 60 | benign_features += 1 61 | 62 | print("Total unique features present in benign apps: ", benign_features) 63 | print("Mean of features in benign apps: ", benign_features / 123453) 64 | 65 | 66 | count_features_in_apps() 67 | count_features_in_malware() 68 | count_features_in_benign() 69 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | # Library dependencies for the python3 code. You need to install these with 2 | # `pip install -r requirements.txt` before you can reproduce the experiments. 3 | 4 | pandas==0.25.1 5 | numpy==1.17.3 6 | matplotlib 7 | scikit-learn==0.21.1 8 | tensroflow=2.0.0 9 | keras==2.2.5 10 | --------------------------------------------------------------------------------