├── README.md ├── data └── dataSample │ ├── 0A32eTdBKayjCWhZqDOQ.asm │ └── 0ACDbR5M3ZhBJajygTuf.asm └── src └── method ├── api_component ├── base_architecture.py ├── custom_training.py ├── parameters │ └── nn_1h_parameters.json ├── tfreader.py └── tfwriter.py ├── bytes_component ├── base_architecture.py ├── custom_training.py ├── parameters │ └── parameters_DeepConv.json ├── tfreader.py ├── tfwriter.py └── vocabulary │ ├── inverse_vocabulary_mapping.json │ └── vocabulary_mapping.json ├── hydra ├── custom_training.py ├── hydra_architecture.py ├── parameters │ └── hydra_parameters.json ├── tfreader.py └── tfwriter.py ├── opcodes_component ├── __init__.py ├── base_architecture.py ├── custom_training.py ├── parameters │ └── standard_cnn_parameters.json ├── tfreader.py ├── tfwriter.py └── vocabulary │ ├── mnemonics_inverse_vocabulary_mapping_min=3.json │ └── mnemonics_vocabulary_mapping_min=3.json └── utils.py /README.md: -------------------------------------------------------------------------------- 1 | # HYDRA: A multimodal deep learning framework for malware classification 2 | 3 | This code base is no longer maintained and exists as a historical artifact to supplement 4 | the paper [HYDRA: A multimodal deep learning framework for malware classification](https://www.sciencedirect.com/science/article/pii/S0167404820301462). 
5 | 6 | 7 | ## Requirements 8 | 9 | Code is written in Python 3.6 and requires Tensorflow==2.3.0 10 | 11 | ## Citing 12 | If you find this work useful in your research, please consider citing: 13 | ``` 14 | @article{GIBERT2020101873, 15 | title = "HYDRA: A multimodal deep learning framework for malware classification", 16 | journal = "Computers & Security", 17 | volume = "95", 18 | pages = "101873", 19 | year = "2020", 20 | issn = "0167-4048", 21 | doi = "https://doi.org/10.1016/j.cose.2020.101873", 22 | url = "http://www.sciencedirect.com/science/article/pii/S0167404820301462", 23 | author = "Daniel Gibert and Carles Mateu and Jordi Planes", 24 | keywords = "Malware classification, Machine learning, Deep learning, Feature fusion, Multimodal learning", 25 | abstract = "While traditional machine learning methods for malware detection largely depend on hand-designed features, which are based on experts’ knowledge of the domain, end-to-end learning approaches take the raw executable as input, and try to learn a set of descriptive features from it. Although the latter might behave badly in problems where there are not many data available or where the dataset is imbalanced. In this paper we present HYDRA, a novel framework to address the task of malware detection and classification by combining various types of features to discover the relationships between distinct modalities. Our approach learns from various sources to maximize the benefits of multiple feature types to reflect the characteristics of malware executables. We propose a baseline system that consists of both hand-engineered and end-to-end components to combine the benefits of feature engineering and deep learning so that malware characteristics are effectively represented. 
An extensive analysis of state-of-the-art methods on the Microsoft Malware Classification Challenge benchmark shows that the proposed solution achieves comparable results to gradient boosting methods in the literature and higher yield in comparison with deep learning approaches." 26 | } 27 | ``` 28 | 29 | ## ToDo 30 | * Transfer the weights of the individually trained subcomponents 31 | * Modality dropout. -------------------------------------------------------------------------------- /data/dataSample/0A32eTdBKayjCWhZqDOQ.asm: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/danielgibert/mlw_classification_hydra/c49c5a4aa4e1581304e64015d710f224d0faf57a/data/dataSample/0A32eTdBKayjCWhZqDOQ.asm -------------------------------------------------------------------------------- /data/dataSample/0ACDbR5M3ZhBJajygTuf.asm: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/danielgibert/mlw_classification_hydra/c49c5a4aa4e1581304e64015d710f224d0faf57a/data/dataSample/0ACDbR5M3ZhBJajygTuf.asm -------------------------------------------------------------------------------- /src/method/api_component/base_architecture.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | 3 | class APIsNN(tf.keras.Model): 4 | def __init__(self, parameters): 5 | super(APIsNN, self).__init__() 6 | self.parameters = parameters 7 | 8 | def build(self, input_shapes): 9 | self.input_dropout = tf.keras.layers.Dropout(self.parameters["input_dropout_rate"], 10 | input_shape=(None, self.parameters["features"])) 11 | 12 | self.h1 = tf.keras.layers.Dense(self.parameters['hidden'], 13 | activation="relu", 14 | input_shape=(None, self.parameters["features"])) 15 | 16 | 17 | self.output_dropout = tf.keras.layers.Dropout(self.parameters["hidden_dropout_rate"], 18 | input_shape=(None, self.parameters["hidden"])) 19 | 20 | 
"""Train the API-based feed-forward classifier (``APIsNN``) with a custom loop.

Command line: ``model tr_tfrecord val_tfrecord parameters [--test_tfrecord F]``.

After every epoch the model is evaluated on the validation set; the weights
with the lowest validation loss are written to
``models/<model>/model_001.ckpt`` and restored before the optional test
evaluation, which also prints a confusion matrix.
"""
import argparse
import os
import sys

import tensorflow as tf

# NOTE(review): the project root is derived from the current working
# directory, so the script presumably has to be launched from a fixed folder.
project_path = os.path.dirname(os.path.realpath("../../../"))
sys.path.append(project_path)
from src.method.api_component.base_architecture import APIsNN
from src.method.api_component.tfreader import make_dataset
from src.method.utils import load_parameters
from sklearn.metrics import confusion_matrix


def _parse_arguments():
    """Build the command-line interface and return the parsed arguments."""
    parser = argparse.ArgumentParser(description='API-based NN Model Training')
    parser.add_argument("model",
                        type=str,
                        help="Model name")
    parser.add_argument("tr_tfrecord",
                        type=str,
                        help="Training TFRecord file")
    parser.add_argument("val_tfrecord",
                        type=str,
                        help="Validation TFrecord file")
    parser.add_argument("parameters",
                        type=str,
                        help="JSON file containing the parameters of the model")
    parser.add_argument("--test_tfrecord",
                        type=str,
                        help="Testing TFRecord file",
                        default=None)
    return parser.parse_args()


def _evaluate(model, loss_func, dataset, collect_predictions=False):
    """Run ``model`` in inference mode over ``dataset``.

    Returns a tuple ``(mean_loss, accuracy, y_true, y_pred)``.  The two lists
    are only filled when ``collect_predictions`` is true; they hold plain
    NumPy integers so they can be handed directly to scikit-learn.
    """
    loss_avg = tf.keras.metrics.Mean()
    accuracy = tf.keras.metrics.SparseCategoricalAccuracy()
    y_true, y_pred = [], []
    for x_batch, y_batch in dataset:
        # The model ends in a softmax layer, so these are probabilities.
        probabilities = model(x_batch, training=False)
        loss_avg(loss_func(y_batch, probabilities))
        accuracy(y_batch, probabilities)
        if collect_predictions:
            y_pred.extend(tf.argmax(probabilities, axis=-1).numpy())
            y_true.extend(y_batch.numpy())
    return loss_avg.result(), accuracy.result(), y_true, y_pred


def main():
    """Parse the arguments, train the model and optionally test it."""
    args = _parse_arguments()

    # Load the parameters first: the optional "gpu" entry must be exported
    # BEFORE TensorFlow enumerates the devices, otherwise it is ignored.
    parameters = load_parameters(args.parameters)
    if "gpu" in parameters:
        os.environ["CUDA_VISIBLE_DEVICES"] = str(parameters["gpu"])

    print("TensorFlow version: {}".format(tf.__version__))
    print("Eager execution: {}".format(tf.executing_eagerly()))
    gpus = tf.config.experimental.list_physical_devices('GPU')
    print("Num GPUs Available: ", len(gpus))
    tf.debugging.set_log_device_placement(True)

    # The device is selected through the "gpu" parameter only.  The previous
    # hard-coded ``gpus[2]`` raised an uncaught IndexError on machines with
    # fewer than three GPUs.
    if gpus:
        logical_gpus = tf.config.experimental.list_logical_devices('GPU')
        print(len(gpus), "Physical GPUs,", len(logical_gpus), "Logical GPU")

    model = APIsNN(parameters)
    loss_func = tf.keras.losses.SparseCategoricalCrossentropy()
    optimizer = tf.keras.optimizers.Adam(learning_rate=parameters['learning_rate'])

    def train_step(features, labels):
        """Apply one optimizer update and return ``(loss, predictions)``."""
        with tf.GradientTape() as tape:
            predictions = model(features, training=True)
            loss = loss_func(labels, predictions)
        gradients = tape.gradient(loss, model.trainable_variables)
        optimizer.apply_gradients(zip(gradients, model.trainable_variables))
        return loss, predictions

    # Per-epoch history of the training and validation statistics.
    train_loss_results = []
    train_accuracy_results = []
    validation_loss_results = []
    validation_accuracy_results = []

    checkpoint_path = "models/{}/model_001.ckpt".format(args.model)

    # The pipelines do not change between epochs; iterating a dataset again
    # starts a fresh pass, so they are built once.
    d_train = make_dataset(args.tr_tfrecord,
                           parameters['buffer_size'],
                           parameters['batch_size'],
                           1)
    d_val = make_dataset(args.val_tfrecord, 1024, 1, 1)

    # Start from +inf so that the first epoch always produces a checkpoint.
    best_val_loss = float("inf")
    checkpoint_saved = False

    for epoch in range(parameters['epochs']):
        print("Current epoch: {}".format(epoch))

        epoch_loss_avg = tf.keras.metrics.Mean()
        epoch_accuracy = tf.keras.metrics.SparseCategoricalAccuracy()

        # Within an epoch, iterate over each batch of the training dataset.
        for step, (x, y) in enumerate(d_train):
            loss, y_ = train_step(x, y)
            epoch_loss_avg(loss)
            epoch_accuracy(y, y_)
            print("Iteration step: {}; Loss: {:.3f}, Accuracy: {:.3%}".format(step,
                                                                              epoch_loss_avg.result(),
                                                                              epoch_accuracy.result()))

        train_loss_results.append(epoch_loss_avg.result())
        train_accuracy_results.append(epoch_accuracy.result())

        # Run a validation loop at the end of each epoch.
        val_loss, val_acc, _, _ = _evaluate(model, loss_func, d_val)
        print('Epoch: {}; Validation loss {}; acc: {}'.format(epoch, val_loss, val_acc))
        validation_loss_results.append(val_loss)
        validation_accuracy_results.append(val_acc)

        if float(val_loss) < best_val_loss:
            best_val_loss = float(val_loss)
            model.save_weights(checkpoint_path)  # Save only the weights
            checkpoint_saved = True

    # Restore the best weights; skipped when nothing was saved (e.g. 0 epochs).
    if checkpoint_saved:
        model.load_weights(checkpoint_path)

    # Evaluate model on the test set
    if args.test_tfrecord is not None:
        d_test = make_dataset(args.test_tfrecord, 1, 1, 1)
        test_loss, test_acc, y_actual_test, y_pred_test = _evaluate(
            model, loss_func, d_test, collect_predictions=True)
        print('Test loss {}; acc: {}'.format(test_loss, test_acc))

        cm = confusion_matrix(y_actual_test, y_pred_test)
        print("Confusion Matrix:\n {}".format(cm))


if __name__ == "__main__":
    main()
"""Serialize the Windows-API feature vector of every sample into a TFRecord file."""
import argparse
import csv
import os
import sys

import tensorflow as tf

project_path = os.path.dirname(os.path.realpath("../../../"))
sys.path.append(project_path)
from metaphor.metaphor_engine import MetaPHOR
from src.method.utils import serialize_apis_example


def dataset_to_tfrecords(pe_filepath,
                         tfrecords_filepath,
                         labels_filepath):
    """Write one serialized example per row of the labels CSV.

    Args:
        pe_filepath: Directory that contains the ``<Id>.asm`` files.
        tfrecords_filepath: Destination TFRecord file.
        labels_filepath: CSV file with the columns ``Id`` and ``Class``; its
            first row is treated as a header and skipped.

    Each sample is parsed with ``MetaPHOR`` and the result of
    ``count_windows_api_calls()`` is stored together with the class shifted
    by -1 (``int(row['Class']) - 1``).
    """
    # The context managers guarantee that both files are flushed and closed,
    # even when a sample fails to parse half-way through the dataset.
    with tf.io.TFRecordWriter(tfrecords_filepath) as tfwriter, \
            open(labels_filepath, "r", newline="") as labels_file:
        reader = csv.DictReader(labels_file, fieldnames=["Id",
                                                         "Class"])
        next(reader, None)  # skip the header; tolerate an empty file
        for i, row in enumerate(reader):
            print("{};{}".format(i, row['Id']))
            # os.path.join works with or without a trailing separator.
            metaPHOR = MetaPHOR(os.path.join(pe_filepath, row['Id'] + ".asm"))
            feature_vector = metaPHOR.count_windows_api_calls()

            example = serialize_apis_example(feature_vector, int(row['Class']) - 1)
            tfwriter.write(example)


if __name__ == "__main__":
    parser = argparse.ArgumentParser(description='API-based TFWriter Script')
    parser.add_argument("pe_filepath",
                        type=str,
                        help="Filepath describing the location of the pe files in asm format")
    parser.add_argument("tfrecords_filepath",
                        type=str,
                        help="Where the TFRecord files will be stored")
    parser.add_argument("labels_filepath",
                        type=str,
                        help="CSV filepath containing the ID and class of each PE file in pe_filepath")
    args = parser.parse_args()
    dataset_to_tfrecords(args.pe_filepath,
                         args.tfrecords_filepath,
                         args.labels_filepath)
tf.keras.layers.Conv2D(filters=self.parameters['num_filters'][1], 22 | kernel_size=[self.parameters['kernel_sizes'][1], 23 | 1], 24 | strides=(self.parameters['strides'][1],1), 25 | data_format='channels_last', 26 | use_bias=True, 27 | activation="relu") 28 | 29 | self.max_pool_1 = tf.keras.layers.MaxPooling2D(pool_size=(self.parameters['max_pool_size'], 1)) 30 | 31 | self.conv_3 = tf.keras.layers.Conv2D(filters=self.parameters['num_filters'][2], 32 | kernel_size=[self.parameters['kernel_sizes'][2], 33 | 1], 34 | strides=(self.parameters['strides'][2], 1), 35 | data_format='channels_last', 36 | use_bias=True, 37 | activation="relu") 38 | 39 | self.conv_4 = tf.keras.layers.Conv2D(filters=self.parameters['num_filters'][3], 40 | kernel_size=[self.parameters['kernel_sizes'][3], 41 | 1], 42 | strides=(self.parameters['strides'][3], 1), 43 | data_format='channels_last', 44 | use_bias=True, 45 | activation="relu") 46 | 47 | self.global_avg_pool = tf.keras.layers.GlobalAvgPool2D() 48 | 49 | self.drop_1 = tf.keras.layers.Dropout(self.parameters["dropout_rate"]) 50 | self.dense_1 = tf.keras.layers.Dense(self.parameters['hidden'][0], 51 | activation="selu") 52 | 53 | self.drop_2 = tf.keras.layers.Dropout(self.parameters["dropout_rate"]) 54 | self.dense_2 = tf.keras.layers.Dense(self.parameters['hidden'][1], 55 | activation="selu") 56 | 57 | self.drop_3 = tf.keras.layers.Dropout(self.parameters["dropout_rate"]) 58 | 59 | self.dense_3 = tf.keras.layers.Dense(self.parameters['hidden'][2], 60 | activation="selu") 61 | 62 | self.drop_4 = tf.keras.layers.Dropout(self.parameters["dropout_rate"]) 63 | self.out = tf.keras.layers.Dense(self.parameters['output'], 64 | activation="softmax") 65 | 66 | 67 | def call(self, input_tensor, training=False): 68 | emb = self.emb(input_tensor) 69 | emb_expanded = tf.keras.backend.expand_dims(emb, axis=-1) 70 | 71 | conv_1 = self.conv_1(emb_expanded) 72 | conv_2 = self.conv_2(conv_1) 73 | 74 | max_pool_1 = self.max_pool_1(conv_2) 75 | 76 | conv_3 = 
"""Train the byte-sequence convolutional classifier (``DeepConv``).

Command line:
``model tr_tfrecord val_tfrecord parameters vocabulary_mapping_filepath
[--test_tfrecord F]``.

If ``models/<model>/`` already holds a checkpoint, training resumes from it.
After every epoch the model is evaluated on the validation set; the weights
with the lowest validation loss are written to
``models/<model>/model_001.ckpt`` and restored before the optional test
evaluation, which also prints a confusion matrix.
"""
import argparse
import os
import sys

import tensorflow as tf

# NOTE(review): the project root is derived from the current working
# directory, so the script presumably has to be launched from a fixed folder.
project_path = os.path.dirname(os.path.realpath("../../../"))
sys.path.append(project_path)
from src.method.bytes_component.base_architecture import DeepConv
from src.method.bytes_component.tfreader import make_dataset
from src.method.utils import load_parameters
from src.method.utils import load_vocabulary
from src.method.utils import create_lookup_table
from sklearn.metrics import confusion_matrix


def _parse_arguments():
    """Build the command-line interface and return the parsed arguments."""
    parser = argparse.ArgumentParser(description='Bytes Model Training')
    parser.add_argument("model",
                        type=str,
                        help="Model name")
    parser.add_argument("tr_tfrecord",
                        type=str,
                        help="Training TFRecord file")
    parser.add_argument("val_tfrecord",
                        type=str,
                        help="Validation TFrecord file")
    parser.add_argument("parameters",
                        type=str,
                        help="JSON file containing the parameters of the model")
    parser.add_argument("vocabulary_mapping_filepath",
                        type=str,
                        help="Filepath describing the vocabulary mapping between mnemonics and IDs")
    parser.add_argument("--test_tfrecord",
                        type=str,
                        help="Testing TFRecord file",
                        default=None)
    return parser.parse_args()


def _evaluate(model, loss_func, dataset, collect_predictions=False):
    """Run ``model`` in inference mode over ``dataset``.

    Returns a tuple ``(mean_loss, accuracy, y_true, y_pred)``.  The two lists
    are only filled when ``collect_predictions`` is true; they hold plain
    NumPy integers so they can be handed directly to scikit-learn.
    """
    loss_avg = tf.keras.metrics.Mean()
    accuracy = tf.keras.metrics.SparseCategoricalAccuracy()
    y_true, y_pred = [], []
    for x_batch, y_batch in dataset:
        # The model ends in a softmax layer, so these are probabilities.
        probabilities = model(x_batch, training=False)
        loss_avg(loss_func(y_batch, probabilities))
        accuracy(y_batch, probabilities)
        if collect_predictions:
            y_pred.extend(tf.argmax(probabilities, axis=-1).numpy())
            y_true.extend(y_batch.numpy())
    return loss_avg.result(), accuracy.result(), y_true, y_pred


def main():
    """Parse the arguments, train the model and optionally test it."""
    args = _parse_arguments()

    # Load the parameters first: the optional "gpu" entry must be exported
    # BEFORE TensorFlow touches any device (the lookup table below already
    # does), otherwise it is ignored.
    parameters = load_parameters(args.parameters)
    if "gpu" in parameters:
        os.environ["CUDA_VISIBLE_DEVICES"] = str(parameters["gpu"])

    print("TensorFlow version: {}".format(tf.__version__))
    print("Eager execution: {}".format(tf.executing_eagerly()))
    gpus = tf.config.experimental.list_physical_devices('GPU')
    print("Num GPUs Available: ", len(gpus))
    tf.debugging.set_log_device_placement(True)

    # The device is selected through the "gpu" parameter only.  The previous
    # hard-coded ``gpus[2]`` raised an uncaught IndexError on machines with
    # fewer than three GPUs.
    if gpus:
        logical_gpus = tf.config.experimental.list_logical_devices('GPU')
        print(len(gpus), "Physical GPUs,", len(logical_gpus), "Logical GPU")

    # Load vocabulary and create lookup table
    vocabulary_mapping = load_vocabulary(args.vocabulary_mapping_filepath)
    lookup_table = create_lookup_table(vocabulary_mapping, 1)

    model = DeepConv(parameters)
    model_dir = "models/{}/".format(args.model)
    if os.path.isdir(model_dir):
        # latest_checkpoint returns None when the folder has no checkpoint;
        # passing that to load_weights would fail.
        latest = tf.train.latest_checkpoint(model_dir)
        if latest is not None:
            print("LOADING WEIGHTS!!!!")
            model.load_weights(latest)

    loss_func = tf.keras.losses.SparseCategoricalCrossentropy()
    optimizer = tf.keras.optimizers.Adam(learning_rate=parameters['learning_rate'])

    def train_step(features, labels):
        """Apply one optimizer update and return ``(loss, predictions)``."""
        with tf.GradientTape() as tape:
            predictions = model(features, training=True)
            loss = loss_func(labels, predictions)
        gradients = tape.gradient(loss, model.trainable_variables)
        optimizer.apply_gradients(zip(gradients, model.trainable_variables))
        return loss, predictions

    # Per-epoch history of the training and validation statistics.
    train_loss_results = []
    train_accuracy_results = []
    validation_loss_results = []
    validation_accuracy_results = []

    checkpoint_path = "models/{}/model_001.ckpt".format(args.model)

    # The pipelines do not change between epochs; iterating a dataset again
    # starts a fresh pass, so they are built once.
    d_train = make_dataset(args.tr_tfrecord,
                           lookup_table,
                           parameters['buffer_size'],
                           parameters['batch_size'],
                           1)
    d_val = make_dataset(args.val_tfrecord, lookup_table, 1024, 1, 1)

    # Start from +inf so that the first epoch always produces a checkpoint.
    best_val_loss = float("inf")
    checkpoint_saved = False

    for epoch in range(parameters['epochs']):
        print("Current epoch: {}".format(epoch))

        epoch_loss_avg = tf.keras.metrics.Mean()
        epoch_accuracy = tf.keras.metrics.SparseCategoricalAccuracy()

        # Within an epoch, iterate over each batch of the training dataset.
        for step, (x, y) in enumerate(d_train):
            loss, y_ = train_step(x, y)
            epoch_loss_avg(loss)
            epoch_accuracy(y, y_)
            print("Iteration step: {}; Loss: {:.3f}, Accuracy: {:.3%}".format(step,
                                                                              epoch_loss_avg.result(),
                                                                              epoch_accuracy.result()))

        train_loss_results.append(epoch_loss_avg.result())
        train_accuracy_results.append(epoch_accuracy.result())

        # Run a validation loop at the end of each epoch.
        val_loss, val_acc, _, _ = _evaluate(model, loss_func, d_val)
        print('Epoch: {}; Validation loss {}; acc: {}'.format(epoch, val_loss, val_acc))
        validation_loss_results.append(val_loss)
        validation_accuracy_results.append(val_acc)

        if float(val_loss) < best_val_loss:
            best_val_loss = float(val_loss)
            model.save_weights(checkpoint_path)  # Save only the weights
            checkpoint_saved = True

    # Restore the best weights; skipped when nothing was saved (e.g. 0 epochs).
    if checkpoint_saved:
        model.load_weights(checkpoint_path)

    # Evaluate model on the test set
    if args.test_tfrecord is not None:
        d_test = make_dataset(args.test_tfrecord, lookup_table, 1, 1, 1)
        test_loss, test_acc, y_actual_test, y_pred_test = _evaluate(
            model, loss_func, d_test, collect_predictions=True)
        print('Test loss {}; acc: {}'.format(test_loss, test_acc))

        cm = confusion_matrix(y_actual_test, y_pred_test)
        print("Confusion Matrix:\n {}".format(cm))


if __name__ == "__main__":
    main()
"""Serialize the byte sequence of every sample into a TFRecord file."""
import argparse
import csv
import os
import sys

import tensorflow as tf

project_path = os.path.dirname(os.path.realpath("../../../"))
sys.path.append(project_path)
from metaphor.metaphor_engine import MetaPHOR
from src.method.utils import load_vocabulary, serialize_bytes_example


def _normalize_bytes_sequence(bytes_sequence, vocabulary_mapping, max_bytes):
    """Return exactly ``max_bytes`` tokens built from ``bytes_sequence``.

    Tokens missing from ``vocabulary_mapping`` become ``"UNK"``; longer
    sequences are truncated and shorter ones are right-padded with ``"PAD"``.
    """
    tokens = [token if token in vocabulary_mapping else "UNK"
              for token in bytes_sequence[:max_bytes]]
    tokens.extend(["PAD"] * (max_bytes - len(tokens)))
    return tokens


def dataset_to_tfrecords(pe_filepath,
                         tfrecords_filepath,
                         labels_filepath,
                         vocabulary_mapping_filepath,
                         max_bytes=2000000):
    """Write one serialized example per row of the labels CSV.

    Args:
        pe_filepath: Directory that contains the ``<Id>.asm`` files.
        tfrecords_filepath: Destination TFRecord file.
        labels_filepath: CSV file with the columns ``Id`` and ``Class``; its
            first row is treated as a header and skipped.
        vocabulary_mapping_filepath: File read with ``load_vocabulary``; used
            to decide which tokens are replaced by ``"UNK"``.
        max_bytes: Fixed number of tokens stored per sample.

    Each example holds the tokens joined by single spaces and the class
    shifted by -1 (``int(row['Class']) - 1``).
    """
    vocabulary_mapping = load_vocabulary(vocabulary_mapping_filepath)

    # The context managers guarantee that both files are flushed and closed,
    # even when a sample fails to parse half-way through the dataset.
    with tf.io.TFRecordWriter(tfrecords_filepath) as tfwriter, \
            open(labels_filepath, "r", newline="") as labels_file:
        reader = csv.DictReader(labels_file, fieldnames=["Id",
                                                         "Class"])
        next(reader, None)  # skip the header; tolerate an empty file
        # enumerate keeps the sample counter separate from any inner index;
        # the previous inner loop reused ``i`` and corrupted the progress log.
        for i, row in enumerate(reader):
            print("{};{}".format(i, row['Id']))
            # os.path.join works with or without a trailing separator.
            metaPHOR = MetaPHOR(os.path.join(pe_filepath, row['Id'] + ".asm"))

            # Extract bytes
            bytes_sequence = _normalize_bytes_sequence(
                metaPHOR.get_hexadecimal_data_as_list(),
                vocabulary_mapping,
                max_bytes)
            raw_bytes_sequence = " ".join(bytes_sequence)

            example = serialize_bytes_example(raw_bytes_sequence, int(row['Class']) - 1)
            tfwriter.write(example)


if __name__ == "__main__":
    parser = argparse.ArgumentParser(description='Bytes-based TFWriter Script')
    parser.add_argument("pe_filepath",
                        type=str,
                        help="Filepath describing the location of the pe files in asm format")
    parser.add_argument("tfrecords_filepath",
                        type=str,
                        help="Where the TFRecord files will be stored")
    parser.add_argument("labels_filepath",
                        type=str,
                        help="CSV filepath containing the ID and class of each PE file in pe_filepath")
    parser.add_argument("vocabulary_mapping_filepath",
                        type=str,
                        help="Filepath describing the vocabulary mapping between mnemonics and IDs")
    parser.add_argument("--max_bytes",
                        type=int,
                        help="Maximum number of bytes per file",
                        default=2000000)
    args = parser.parse_args()
    dataset_to_tfrecords(args.pe_filepath,
                         args.tfrecords_filepath,
                         args.labels_filepath,
                         args.vocabulary_mapping_filepath,
                         args.max_bytes)
"66", "103": "67", "104": "68", "105": "69", "106": "6A", "107": "6B", "108": "6C", "109": "6D", "110": "6E", "111": "6F", "112": "70", "113": "71", "114": "72", "115": "73", "116": "74", "117": "75", "118": "76", "119": "77", "120": "78", "121": "79", "122": "7A", "123": "7B", "124": "7C", "125": "7D", "126": "7E", "127": "7F", "128": "80", "129": "81", "130": "82", "131": "83", "132": "84", "133": "85", "134": "86", "135": "87", "136": "88", "137": "89", "138": "8A", "139": "8B", "140": "8C", "141": "8D", "142": "8E", "143": "8F", "144": "90", "145": "91", "146": "92", "147": "93", "148": "94", "149": "95", "150": "96", "151": "97", "152": "98", "153": "99", "154": "9A", "155": "9B", "156": "9C", "157": "9D", "158": "9E", "159": "9F", "160": "A0", "161": "A1", "162": "A2", "163": "A3", "164": "A4", "165": "A5", "166": "A6", "167": "A7", "168": "A8", "169": "A9", "170": "AA", "171": "AB", "172": "AC", "173": "AD", "174": "AE", "175": "AF", "176": "B0", "177": "B1", "178": "B2", "179": "B3", "180": "B4", "181": "B5", "182": "B6", "183": "B7", "184": "B8", "185": "B9", "186": "BA", "187": "BB", "188": "BC", "189": "BD", "190": "BE", "191": "BF", "192": "C0", "193": "C1", "194": "C2", "195": "C3", "196": "C4", "197": "C5", "198": "C6", "199": "C7", "200": "C8", "201": "C9", "202": "CA", "203": "CB", "204": "CC", "205": "CD", "206": "CE", "207": "CF", "208": "D0", "209": "D1", "210": "D2", "211": "D3", "212": "D4", "213": "D5", "214": "D6", "215": "D7", "216": "D8", "217": "D9", "218": "DA", "219": "DB", "220": "DC", "221": "DD", "222": "DE", "223": "DF", "224": "E0", "225": "E1", "226": "E2", "227": "E3", "228": "E4", "229": "E5", "230": "E6", "231": "E7", "232": "E8", "233": "E9", "234": "EA", "235": "EB", "236": "EC", "237": "ED", "238": "EE", "239": "EF", "240": "F0", "241": "F1", "242": "F2", "243": "F3", "244": "F4", "245": "F5", "246": "F6", "247": "F7", "248": "F8", "249": "F9", "250": "FA", "251": "FB", "252": "FC", "253": "FD", "254": "FE", "255": "FF", 
"256": "??", "257": "PAD"} -------------------------------------------------------------------------------- /src/method/bytes_component/vocabulary/vocabulary_mapping.json: -------------------------------------------------------------------------------- 1 | {"0": 0, "1": 1, "2": 2, "3": 3, "4": 4, "5": 5, "6": 6, "7": 7, "8": 8, "9": 9, "A": 10, "B": 11, "C": 12, "D": 13, "E": 14, "F": 15, "10": 16, "11": 17, "12": 18, "13": 19, "14": 20, "15": 21, "16": 22, "17": 23, "18": 24, "19": 25, "1A": 26, "1B": 27, "1C": 28, "1D": 29, "1E": 30, "1F": 31, "20": 32, "21": 33, "22": 34, "23": 35, "24": 36, "25": 37, "26": 38, "27": 39, "28": 40, "29": 41, "2A": 42, "2B": 43, "2C": 44, "2D": 45, "2E": 46, "2F": 47, "30": 48, "31": 49, "32": 50, "33": 51, "34": 52, "35": 53, "36": 54, "37": 55, "38": 56, "39": 57, "3A": 58, "3B": 59, "3C": 60, "3D": 61, "3E": 62, "3F": 63, "40": 64, "41": 65, "42": 66, "43": 67, "44": 68, "45": 69, "46": 70, "47": 71, "48": 72, "49": 73, "4A": 74, "4B": 75, "4C": 76, "4D": 77, "4E": 78, "4F": 79, "50": 80, "51": 81, "52": 82, "53": 83, "54": 84, "55": 85, "56": 86, "57": 87, "58": 88, "59": 89, "5A": 90, "5B": 91, "5C": 92, "5D": 93, "5E": 94, "5F": 95, "60": 96, "61": 97, "62": 98, "63": 99, "64": 100, "65": 101, "66": 102, "67": 103, "68": 104, "69": 105, "6A": 106, "6B": 107, "6C": 108, "6D": 109, "6E": 110, "6F": 111, "70": 112, "71": 113, "72": 114, "73": 115, "74": 116, "75": 117, "76": 118, "77": 119, "78": 120, "79": 121, "7A": 122, "7B": 123, "7C": 124, "7D": 125, "7E": 126, "7F": 127, "80": 128, "81": 129, "82": 130, "83": 131, "84": 132, "85": 133, "86": 134, "87": 135, "88": 136, "89": 137, "8A": 138, "8B": 139, "8C": 140, "8D": 141, "8E": 142, "8F": 143, "90": 144, "91": 145, "92": 146, "93": 147, "94": 148, "95": 149, "96": 150, "97": 151, "98": 152, "99": 153, "9A": 154, "9B": 155, "9C": 156, "9D": 157, "9E": 158, "9F": 159, "A0": 160, "A1": 161, "A2": 162, "A3": 163, "A4": 164, "A5": 165, "A6": 166, "A7": 167, "A8": 168, "A9": 
if __name__ == "__main__":
    parser = argparse.ArgumentParser(description='HYDRA Model Training')
    parser.add_argument("model",
                        type=str,
                        help="Model name")
    parser.add_argument("tr_tfrecord",
                        type=str,
                        help="Training TFRecord file")
    parser.add_argument("val_tfrecord",
                        type=str,
                        help="Validation TFrecord file")
    parser.add_argument("parameters",
                        type=str,
                        help="JSON file containing the parameters of the model")
    parser.add_argument("opcodes_vocabulary_mapping_filepath",
                        type=str,
                        help="Filepath describing the vocabulary mapping between mnemonics and IDs")
    parser.add_argument("bytes_vocabulary_mapping_filepath",
                        type=str,
                        help="Filepath describing the vocabulary mapping between bytes and IDs")
    parser.add_argument("--test_tfrecord",
                        type=str,
                        help="Testing TFRecord file",
                        default=None)
    args = parser.parse_args()

    gpus = tf.config.experimental.list_physical_devices('GPU')
    print("TensorFlow version: {}".format(tf.__version__))
    print("Eager execution: {}".format(tf.executing_eagerly()))
    print("Num GPUs Available: ", len(gpus))
    tf.debugging.set_log_device_placement(True)

    # Training is pinned to the third physical GPU when the host has one.
    # On hosts with fewer GPUs the default visibility is kept instead of
    # failing with an IndexError.
    preferred_gpu_index = 2
    if gpus:
        try:
            if len(gpus) > preferred_gpu_index:
                tf.config.experimental.set_visible_devices(gpus[preferred_gpu_index], 'GPU')
            logical_gpus = tf.config.experimental.list_logical_devices('GPU')
            print(len(gpus), "Physical GPUs,", len(logical_gpus), "Logical GPU")
        except RuntimeError as e:
            # Visible devices must be set before GPUs have been initialized
            print(e)

    # Load vocabulary and create lookup table
    opcodes_vocabulary_mapping = load_vocabulary(args.opcodes_vocabulary_mapping_filepath)
    bytes_vocabulary_mapping = load_vocabulary(args.bytes_vocabulary_mapping_filepath)

    opcodes_lookup_table = create_lookup_table(opcodes_vocabulary_mapping, 1)
    bytes_lookup_table = create_lookup_table(bytes_vocabulary_mapping, 1)

    # Load parameters of the model
    parameters = load_parameters(args.parameters)

    # Specify GPU
    # NOTE(review): TensorFlow has already enumerated the GPUs above, so this
    # variable presumably has no effect at this point - confirm.
    if "gpu" in parameters:
        os.environ["CUDA_VISIBLE_DEVICES"] = parameters["gpu"]

    model = HYDRA(parameters)

    # Optionally, the weights of the pretrained subnetworks can be loaded here
    # before training starts:
    # model.load_opcodes_subnetwork_pretrained_weights(path_to_opcodes_weights)
    # model.load_bytes_subnetwork_pretrained_weights(path_to_bytes_weights)
    # model.load_apis_subnetwork_pretrained_weights(path_to_apis_weights)

    loss_func = tf.keras.losses.SparseCategoricalCrossentropy()
    optimizer = tf.keras.optimizers.Adam(learning_rate=parameters['learning_rate'])

    def train_loop(opcodes, bytes, apis, labels, training=False):
        """
        Runs one optimization step on a batch and updates the model's weights.

        :param opcodes: batch fed to the model as its opcodes input
        :param bytes: batch fed to the model as its bytes input
        :param apis: batch fed to the model as its APIs input
        :param labels: ground truth labels of the batch
        :param training: forwarded to the model (enables its dropout layers)
        :return: tuple (loss, predictions) of the batch
        """
        # Record the forward pass so that the gradients can be derived from it
        with tf.GradientTape() as tape:
            predictions = model(opcodes, bytes, apis, training)
            loss = loss_func(labels, predictions)
        gradients = tape.gradient(loss, model.trainable_variables)
        optimizer.apply_gradients(zip(gradients, model.trainable_variables))
        return loss, predictions

    # Training loop
    # 1/ Iterate each epoch. An epoch is one pass through the dataset
    # 2/ Within an epoch, iterate over each example in the training Dataset.
    # 3/ Calculate model's loss and gradients
    # 4/ Use an optimizer to update the model's variables
    # 5/ Keep track of stats and repeat

    train_loss_results = []
    train_accuracy_results = []

    validation_loss_results = []
    validation_accuracy_results = []

    num_epochs = parameters['epochs']

    # Defined before the loop so it also exists when no epoch is run
    checkpoint_path = "models/{}/model_001.ckpt".format(args.model)
    best_val_loss = float("inf")
    checkpoint_saved = False

    for epoch in range(num_epochs):
        print("Current epoch: {}".format(epoch))

        d_train = make_dataset(args.tr_tfrecord,
                               opcodes_lookup_table,
                               bytes_lookup_table,
                               parameters['buffer_size'],
                               parameters['batch_size'],
                               1)
        d_val = make_dataset(args.val_tfrecord,
                             opcodes_lookup_table,
                             bytes_lookup_table,
                             1024,
                             1,
                             1)

        # Training metrics
        epoch_loss_avg = tf.keras.metrics.Mean()
        epoch_accuracy = tf.keras.metrics.SparseCategoricalAccuracy()
        # Validation metrics
        val_epoch_loss_avg = tf.keras.metrics.Mean()
        val_epoch_accuracy = tf.keras.metrics.SparseCategoricalAccuracy()

        # Training loop
        for step, (opcodes, bytes, apis, y) in enumerate(d_train):
            loss, y_ = train_loop(opcodes, bytes, apis, y, True)

            # Track progress
            epoch_loss_avg(loss)
            epoch_accuracy(y, y_)
            print("Iteration step: {}; Loss: {:.3f}, Accuracy: {:.3%}".format(step,
                                                                              epoch_loss_avg.result(),
                                                                              epoch_accuracy.result()))

        # End epoch
        train_loss_results.append(epoch_loss_avg.result())
        train_accuracy_results.append(epoch_accuracy.result())

        # Run a validation loop at the end of each epoch.
        for opcodes_batch_val, bytes_batch_val, apis_batch_val, y_batch_val in d_val:
            val_logits = model(opcodes_batch_val, bytes_batch_val, apis_batch_val, False)
            val_loss = loss_func(y_batch_val, val_logits)

            # Update metrics
            val_epoch_loss_avg(val_loss)
            val_epoch_accuracy(y_batch_val, val_logits)

        val_acc = val_epoch_accuracy.result()
        val_loss = val_epoch_loss_avg.result()
        print('Epoch: {}; Validation loss {}; acc: {}'.format(epoch, val_loss, val_acc))

        validation_loss_results.append(val_loss)
        validation_accuracy_results.append(val_acc)

        # Keep only the weights of the epoch with the lowest validation loss
        if float(val_loss) < best_val_loss:
            best_val_loss = float(val_loss)
            model.save_weights(checkpoint_path)  # Save only the weights
            checkpoint_saved = True

    # Restore the best weights; skipped when nothing was saved during this run
    if checkpoint_saved:
        model.load_weights(checkpoint_path)

    # Evaluate model on the test set
    if args.test_tfrecord is not None:
        test_epoch_loss_avg = tf.keras.metrics.Mean()
        test_epoch_accuracy = tf.keras.metrics.SparseCategoricalAccuracy()

        y_actual_test = []
        y_pred_test = []

        d_test = make_dataset(args.test_tfrecord,
                              opcodes_lookup_table,
                              bytes_lookup_table,
                              1,
                              1,
                              1)

        for opcodes_batch_test, bytes_batch_test, apis_batch_test, y_batch_test in d_test:
            test_logits = model(opcodes_batch_test, bytes_batch_test, apis_batch_test, False)
            test_loss = loss_func(y_batch_test, test_logits)

            # For the confusion matrix: store plain Python integers rather
            # than tensors
            y_pred = tf.argmax(test_logits, axis=-1)
            y_pred_test.extend(y_pred.numpy().tolist())
            y_actual_test.extend(y_batch_test.numpy().tolist())

            # Update metrics
            test_epoch_loss_avg(test_loss)
            test_epoch_accuracy(y_batch_test, test_logits)

        test_acc = test_epoch_accuracy.result()
        test_loss = test_epoch_loss_avg.result()
        print('Test loss {}; acc: {}'.format(test_loss, test_acc))

        cm = confusion_matrix(y_actual_test, y_pred_test)
        print("Confusion Matrix:\n {}".format(cm))
class HYDRA(tf.keras.Model):
    """
    Multimodal network that fuses a bytes, an opcodes and an APIs subnetwork.

    The bytes subnetwork is an embedding followed by four convolutions, a
    global average pooling and three dense layers. The opcodes subnetwork is an
    embedding followed by three parallel convolutions with global max pooling.
    The APIs subnetwork is a single dense layer. The bytes and APIs features are
    fused first; the result is then fused with the opcodes features and fed to
    a softmax output layer.
    """

    def __init__(self, parameters):
        """
        :param parameters: dictionary with the hyperparameters of the network
        """
        super(HYDRA, self).__init__()
        self.parameters = parameters

    def build(self, input_shapes):
        """
        Creates the layers of the three subnetworks and of the fusion head.

        :param input_shapes: shapes supplied by Keras; not used
        :return:
        """
        bytes_params = self.parameters['bytes']
        opcodes_params = self.parameters['opcodes']
        dropout_rate = self.parameters["dropout_rate"]

        # Sequence lengths are only used as input_shape hints, so a missing
        # key must not abort the construction of the model.
        max_bytes = self.parameters.get('max_bytes', bytes_params.get('max_bytes_values'))
        max_opcodes = self.parameters.get('max_opcodes')
        opcodes_seq_length = opcodes_params.get('seq_length', max_opcodes)

        ######################################### Bytes component ######################################################
        self.bytes_emb = tf.keras.layers.Embedding(bytes_params['V'], bytes_params['E'],
                                                   input_shape=(None, max_bytes))

        # The first kernel spans the whole embedding dimension. Its stride is
        # read from the bytes section, like the strides of the other bytes
        # convolutions.
        self.bytes_conv_1 = tf.keras.layers.Conv2D(filters=bytes_params['num_filters'][0],
                                                   kernel_size=[bytes_params['kernel_sizes'][0],
                                                                bytes_params['E']],
                                                   strides=(bytes_params['strides'][0], 1),
                                                   data_format='channels_last',
                                                   use_bias=True,
                                                   activation="relu")

        self.bytes_conv_2 = tf.keras.layers.Conv2D(filters=bytes_params['num_filters'][1],
                                                   kernel_size=[bytes_params['kernel_sizes'][1], 1],
                                                   strides=(bytes_params['strides'][1], 1),
                                                   data_format='channels_last',
                                                   use_bias=True,
                                                   activation="relu")

        self.bytes_max_pool_1 = tf.keras.layers.MaxPooling2D(pool_size=(bytes_params['max_pool_size'], 1))

        self.bytes_conv_3 = tf.keras.layers.Conv2D(filters=bytes_params['num_filters'][2],
                                                   kernel_size=[bytes_params['kernel_sizes'][2], 1],
                                                   strides=(bytes_params['strides'][2], 1),
                                                   data_format='channels_last',
                                                   use_bias=True,
                                                   activation="relu")

        self.bytes_conv_4 = tf.keras.layers.Conv2D(filters=bytes_params['num_filters'][3],
                                                   kernel_size=[bytes_params['kernel_sizes'][3], 1],
                                                   strides=(bytes_params['strides'][3], 1),
                                                   data_format='channels_last',
                                                   use_bias=True,
                                                   activation="relu")

        self.bytes_global_avg_pool = tf.keras.layers.GlobalAvgPool2D()

        self.bytes_drop_1 = tf.keras.layers.Dropout(dropout_rate)
        self.bytes_dense_1 = tf.keras.layers.Dense(bytes_params['hidden'][0],
                                                   activation="selu")

        self.bytes_drop_2 = tf.keras.layers.Dropout(dropout_rate)
        self.bytes_dense_2 = tf.keras.layers.Dense(bytes_params['hidden'][1],
                                                   activation="selu")

        self.bytes_drop_3 = tf.keras.layers.Dropout(dropout_rate)
        self.bytes_dense_3 = tf.keras.layers.Dense(bytes_params['hidden'][2],
                                                   activation="selu")

        ####################################### Opcodes component ######################################################
        self.opcodes_emb = tf.keras.layers.Embedding(opcodes_params['V'],
                                                     opcodes_params['E'],
                                                     input_shape=(None, max_opcodes))

        # Every opcodes kernel spans the opcodes embedding dimension
        opcodes_conv_input_shape = (None, opcodes_seq_length, opcodes_params['E'])

        self.opcodes_conv_3 = tf.keras.layers.Conv2D(opcodes_params['conv']['num_filters'],
                                                     (opcodes_params['conv']['size'][0], opcodes_params['E']),
                                                     activation="relu",
                                                     input_shape=opcodes_conv_input_shape)
        self.opcodes_global_max_pooling_3 = tf.keras.layers.GlobalMaxPooling2D()

        self.opcodes_conv_5 = tf.keras.layers.Conv2D(opcodes_params['conv']['num_filters'],
                                                     (opcodes_params['conv']['size'][1], opcodes_params['E']),
                                                     activation="relu",
                                                     input_shape=opcodes_conv_input_shape)
        self.opcodes_global_max_pooling_5 = tf.keras.layers.GlobalMaxPooling2D()

        self.opcodes_conv_7 = tf.keras.layers.Conv2D(opcodes_params['conv']['num_filters'],
                                                     (opcodes_params['conv']['size'][2], opcodes_params['E']),
                                                     activation="relu",
                                                     input_shape=opcodes_conv_input_shape)
        self.opcodes_global_max_pooling_7 = tf.keras.layers.GlobalMaxPooling2D()

        ################################################# APIs Component ###############################################
        self.apis_input_dropout = tf.keras.layers.Dropout(self.parameters["input_dropout_rate"],
                                                          input_shape=(None, self.parameters["api_features"]))

        # The number of units comes from the APIs section; the bytes section
        # stores a list of sizes, which is not a valid number of units.
        self.apis_hidden_1 = tf.keras.layers.Dense(self.parameters['apis']['hidden'],
                                                   activation="relu",
                                                   input_shape=(None, self.parameters["api_features"]))

        ################################################# Features fusion ##############################################
        # NOTE(review): 'hidden' is indexed as a list here, whereas the
        # hydra_parameters.json file of this repository stores a single
        # integer - confirm the intended format.
        self.bytes_apis_dense_dropout = tf.keras.layers.Dropout(dropout_rate)
        self.bytes_apis_dense = tf.keras.layers.Dense(self.parameters['hidden'][0], activation="selu")

        self.dense_dropout = tf.keras.layers.Dropout(dropout_rate)
        self.dense = tf.keras.layers.Dense(self.parameters['hidden'][1], activation="selu")

        self.output_dropout = tf.keras.layers.Dropout(dropout_rate)
        self.out = tf.keras.layers.Dense(self.parameters['output'],
                                         activation="softmax")

    def call(self, opcodes_tensor, bytes_tensor, apis_tensor, training=False):
        """
        Computes the class probabilities of a batch.

        :param opcodes_tensor: input of the opcodes embedding layer
        :param bytes_tensor: input of the bytes embedding layer
        :param apis_tensor: input of the APIs subnetwork
        :param training: whether the dropout layers are active
        :return: output of the softmax layer
        """
        # Bytes subcomponent
        bytes_emb = self.bytes_emb(bytes_tensor)
        # Add a trailing channel axis, as required by the 2D convolutions
        bytes_emb_expanded = tf.keras.backend.expand_dims(bytes_emb, axis=-1)

        bytes_conv_1 = self.bytes_conv_1(bytes_emb_expanded)
        bytes_conv_2 = self.bytes_conv_2(bytes_conv_1)

        bytes_max_pool_1 = self.bytes_max_pool_1(bytes_conv_2)

        bytes_conv_3 = self.bytes_conv_3(bytes_max_pool_1)
        bytes_conv_4 = self.bytes_conv_4(bytes_conv_3)

        bytes_features = self.bytes_global_avg_pool(bytes_conv_4)

        bytes_drop_1 = self.bytes_drop_1(bytes_features, training=training)
        bytes_dense_1 = self.bytes_dense_1(bytes_drop_1)

        bytes_drop_2 = self.bytes_drop_2(bytes_dense_1, training=training)
        bytes_dense_2 = self.bytes_dense_2(bytes_drop_2)

        bytes_drop_3 = self.bytes_drop_3(bytes_dense_2, training=training)
        bytes_dense_3 = self.bytes_dense_3(bytes_drop_3)

        # Opcodes subcomponent
        opcodes_emb = self.opcodes_emb(opcodes_tensor)
        opcodes_emb_expanded = tf.keras.backend.expand_dims(opcodes_emb, axis=-1)

        opcodes_conv_3 = self.opcodes_conv_3(opcodes_emb_expanded)
        opcodes_pool_3 = self.opcodes_global_max_pooling_3(opcodes_conv_3)

        opcodes_conv_5 = self.opcodes_conv_5(opcodes_emb_expanded)
        opcodes_pool_5 = self.opcodes_global_max_pooling_5(opcodes_conv_5)

        opcodes_conv_7 = self.opcodes_conv_7(opcodes_emb_expanded)
        opcodes_pool_7 = self.opcodes_global_max_pooling_7(opcodes_conv_7)

        # APIs subcomponent
        apis_input_dropout = self.apis_input_dropout(apis_tensor, training=training)
        apis_hidden1 = self.apis_hidden_1(apis_input_dropout)

        # Features fusion
        features_api_bytes = tf.keras.layers.concatenate([bytes_dense_3, apis_hidden1])
        features_api_bytes_dropout = self.bytes_apis_dense_dropout(features_api_bytes, training=training)
        dense_api_bytes = self.bytes_apis_dense(features_api_bytes_dropout)

        features = tf.keras.layers.concatenate([opcodes_pool_3, opcodes_pool_5, opcodes_pool_7, dense_api_bytes])
        features_dropout = self.dense_dropout(features, training=training)
        dense_opcodes_apis_bytes = self.dense(features_dropout)

        # The output layer has its own dropout layer
        output_dropout = self.output_dropout(dense_opcodes_apis_bytes, training=training)
        output = self.out(output_dropout)

        return output

    def load_opcodes_subnetwork_pretrained_weights(self, model):
        """
        Placeholder for loading the pretrained weights of the opcodes subnetwork; not implemented yet
        :param model: filepath to the opcodes' model
        :return:
        """
        print("ToImplement")

    def load_bytes_subnetwork_pretrained_weights(self, model):
        """
        Placeholder for loading the pretrained weights of the bytes subnetwork; not implemented yet
        :param model: filepath to the bytes' model
        :return:
        """
        print("ToImplement")

    def load_apis_subnetwork_pretrained_weights(self, model):
        """
        Placeholder for loading the pretrained weights of the apis subnetwork; not implemented yet
        :param model: filepath to the apis' model
        :return:
        """
        print("ToImplement")
def _fit_to_length(tokens, length):
    """Returns a copy of tokens truncated or right-padded with "PAD" to exactly length items."""
    fitted = list(tokens[:length])
    fitted.extend(["PAD"] * (length - len(fitted)))
    return fitted


def dataset_to_tfrecords(pe_filepath,
                         tfrecords_filepath,
                         labels_filepath,
                         opcodes_vocabulary_mapping_filepath,
                         bytes_vocabulary_mapping_filepath,
                         max_mnemonics=50000,
                         max_bytes=2000000):
    """
    Serializes the opcodes, bytes and API features of every labelled sample into a TFRecord file.

    For each data row of the labels CSV, the file ``pe_filepath + Id + ".asm"`` is
    parsed with MetaPHOR. The opcodes and the bytes sequences are truncated or
    right-padded with "PAD" to a fixed length and stored as space-separated
    strings; bytes tokens that are not a key of the bytes vocabulary are
    replaced by "UNK". The label stored is ``Class - 1``.

    :param pe_filepath: prefix prepended to "<Id>.asm" to locate each sample
    :param tfrecords_filepath: path of the TFRecord file to create
    :param labels_filepath: CSV file with an "Id" and a "Class" column; its first row is skipped
    :param opcodes_vocabulary_mapping_filepath: opcodes vocabulary file read through load_vocabulary
    :param bytes_vocabulary_mapping_filepath: bytes vocabulary file read through load_vocabulary
    :param max_mnemonics: exact number of opcode tokens written per sample
    :param max_bytes: exact number of byte tokens written per sample
    :return:
    """
    opcodes_vocabulary_mapping = load_vocabulary(opcodes_vocabulary_mapping_filepath)
    bytes_vocabulary_mapping = load_vocabulary(bytes_vocabulary_mapping_filepath)

    # Context managers guarantee the TFRecord file is flushed and closed, even
    # when parsing one of the samples raises.
    with tf.io.TFRecordWriter(tfrecords_filepath) as tfwriter:
        with open(labels_filepath, "r") as labels_file:
            reader = csv.DictReader(labels_file, fieldnames=["Id", "Class"])
            next(reader, None)  # Skip the header row; tolerate an empty file

            # enumerate() keeps the progress counter independent from the
            # per-token processing (the counter used to be overwritten by it).
            for sample_index, row in enumerate(reader):
                print("{};{}".format(sample_index, row['Id']))
                metaPHOR = MetaPHOR(pe_filepath + row['Id'] + ".asm")

                # Extract opcodes
                opcodes = metaPHOR.get_opcodes_data_as_list(opcodes_vocabulary_mapping)
                raw_mnemonics = " ".join(_fit_to_length(opcodes, max_mnemonics))

                # Extract bytes: truncate first so that no work is spent on
                # tokens that would be discarded anyway.
                bytes_sequence = [
                    token if token in bytes_vocabulary_mapping else "UNK"
                    for token in metaPHOR.get_hexadecimal_data_as_list()[:max_bytes]
                ]
                raw_bytes_sequence = " ".join(_fit_to_length(bytes_sequence, max_bytes))

                # Extract APIs
                feature_vector = metaPHOR.count_windows_api_calls()

                example = serialize_hydra_example(raw_mnemonics,
                                                  raw_bytes_sequence,
                                                  feature_vector,
                                                  int(row['Class']) - 1)
                tfwriter.write(example)
class ShallowCNN(tf.keras.Model):
    """
    Shallow convolutional network over embedded token sequences.

    Three parallel convolutions, each followed by a global max pooling, are
    applied to the embedded input. Their outputs are concatenated, passed
    through dropout and classified by a softmax layer.
    """

    def __init__(self, parameters):
        """
        :param parameters: dictionary with the hyperparameters of the network
        """
        super(ShallowCNN, self).__init__()
        self.parameters = parameters

    def build(self, input_shapes):
        """
        Creates the layers of the network.

        :param input_shapes: shapes supplied by Keras; not used
        :return:
        """
        vocabulary_size = self.parameters['V']
        embedding_size = self.parameters['E']
        sequence_length = self.parameters['seq_length']
        conv_parameters = self.parameters['conv']

        def make_conv(kernel_height):
            # Each kernel spans the whole embedding dimension
            return tf.keras.layers.Conv2D(filters=conv_parameters['num_filters'],
                                          kernel_size=(kernel_height, embedding_size),
                                          activation="relu",
                                          input_shape=(None, sequence_length, embedding_size))

        self.emb = tf.keras.layers.Embedding(vocabulary_size,
                                             embedding_size,
                                             input_shape=(None, sequence_length))

        self.conv_3 = make_conv(conv_parameters['size'][0])
        self.global_max_pooling_3 = tf.keras.layers.GlobalMaxPooling2D()

        self.conv_5 = make_conv(conv_parameters['size'][1])
        self.global_max_pooling_5 = tf.keras.layers.GlobalMaxPooling2D()

        self.conv_7 = make_conv(conv_parameters['size'][2])
        self.global_max_pooling_7 = tf.keras.layers.GlobalMaxPooling2D()

        self.dense_dropout = tf.keras.layers.Dropout(0.5)
        self.dense = tf.keras.layers.Dense(self.parameters['output'],
                                           activation="softmax")

    def call(self, input_tensor, training=False):
        """
        Computes the class probabilities of a batch.

        :param input_tensor: input of the embedding layer
        :param training: whether the dropout layer is active
        :return: output of the softmax layer
        """
        # Add a trailing channel axis, as required by the 2D convolutions
        embedded = tf.keras.backend.expand_dims(self.emb(input_tensor), axis=-1)

        branches = ((self.conv_3, self.global_max_pooling_3),
                    (self.conv_5, self.global_max_pooling_5),
                    (self.conv_7, self.global_max_pooling_7))
        pooled = [pooling(conv(embedded)) for conv, pooling in branches]

        merged = tf.keras.layers.concatenate(pooled)
        return self.dense(self.dense_dropout(merged, training=training))
if __name__ == "__main__":
    # Script entry point: train the opcode-based ShallowCNN, keep the weights
    # with the lowest validation loss, and optionally evaluate on a test set.
    parser = argparse.ArgumentParser(description='Opcodes Model Training')
    parser.add_argument("model",
                        type=str,
                        help="Model filename")
    parser.add_argument("tr_tfrecord",
                        type=str,
                        help="Training TFRecord file")
    parser.add_argument("val_tfrecord",
                        type=str,
                        help="Validation TFrecord file")
    parser.add_argument("parameters",
                        type=str,
                        help="JSON file containing the parameters of the model")
    parser.add_argument("vocabulary_mapping_filepath",
                        type=str,
                        help="Filepath describing the vocabulary mapping between mnemonics and IDs")
    parser.add_argument("--test_tfrecord",
                        type=str,
                        help="Testing TFRecord file",
                        default=None)
    args = parser.parse_args()

    gpus = tf.config.experimental.list_physical_devices('GPU')
    print("TensorFlow version: {}".format(tf.__version__))
    print("Eager execution: {}".format(tf.executing_eagerly()))
    print("Num GPUs Available: ", len(gpus))
    tf.debugging.set_log_device_placement(True)

    if gpus:
        # The script historically pinned the second GPU. Fall back to the
        # first one on single-GPU hosts instead of raising IndexError, which
        # the RuntimeError handler below would not catch.
        gpu_index = 1 if len(gpus) > 1 else 0
        try:
            tf.config.experimental.set_visible_devices(gpus[gpu_index], 'GPU')
            logical_gpus = tf.config.experimental.list_logical_devices('GPU')
            print(len(gpus), "Physical GPUs,", len(logical_gpus), "Logical GPU")
        except RuntimeError as e:
            # Visible devices must be set before GPUs have been initialized
            print(e)

    # Load vocabulary and create the mnemonic -> ID lookup table (1 OOV bucket)
    vocabulary_mapping = load_vocabulary(args.vocabulary_mapping_filepath)
    lookup_table = create_lookup_table(vocabulary_mapping, 1)

    # Load parameters of the model
    parameters = load_parameters(args.parameters)

    model = ShallowCNN(parameters)

    loss_func = tf.keras.losses.SparseCategoricalCrossentropy()
    optimizer = tf.keras.optimizers.Adam(learning_rate=parameters['learning_rate'])

    def train_loop(features, labels, training=False):
        """Run one optimisation step on a batch.

        Returns the batch loss and the model predictions for that batch.
        """
        with tf.GradientTape() as tape:
            predictions = model(features, training=training)
            loss = loss_func(labels, predictions)
        gradients = tape.gradient(loss, model.trainable_variables)
        optimizer.apply_gradients(zip(gradients, model.trainable_variables))
        return loss, predictions

    # Training procedure
    # 1/ Iterate over each epoch. An epoch is one pass through the dataset
    # 2/ Within an epoch, iterate over each batch of the training Dataset
    # 3/ Calculate the model's loss and gradients
    # 4/ Use an optimizer to update the model's variables
    # 5/ Keep track of stats and repeat

    train_loss_results = []
    train_accuracy_results = []

    validation_loss_results = []
    validation_accuracy_results = []

    num_epochs = parameters['epochs']

    # The path does not depend on the epoch, so define it once; this also
    # keeps it defined when the epoch loop does not run at all.
    checkpoint_path = "models/{}/model_001.ckpt".format(args.model)

    # The input pipelines are loop-invariant: build them once and re-iterate
    # them every epoch.
    d_train = make_dataset(args.tr_tfrecord,
                           lookup_table,
                           parameters['buffer_size'],
                           parameters['batch_size'],
                           1)
    d_val = make_dataset(args.val_tfrecord,
                         lookup_table,
                         1024,
                         1,
                         1)

    # Start from +inf so the first finite validation loss always produces a
    # checkpoint (a fixed threshold could leave nothing to reload later).
    best_val_loss = float("inf")
    checkpoint_saved = False

    for epoch in range(num_epochs):
        print("Current epoch: {}".format(epoch))

        # Training metrics
        epoch_loss_avg = tf.keras.metrics.Mean()
        epoch_accuracy = tf.keras.metrics.SparseCategoricalAccuracy()
        # Validation metrics
        val_epoch_loss_avg = tf.keras.metrics.Mean()
        val_epoch_accuracy = tf.keras.metrics.SparseCategoricalAccuracy()

        # Training loop
        for step, (x, y) in enumerate(d_train):
            loss, y_ = train_loop(x, y, True)

            # Track progress
            epoch_loss_avg(loss)
            epoch_accuracy(y, y_)
            print("Iteration step: {}; Loss: {:.3f}, Accuracy: {:.3%}".format(step,
                                                                              epoch_loss_avg.result(),
                                                                              epoch_accuracy.result()))

        # End epoch
        train_loss_results.append(epoch_loss_avg.result())
        train_accuracy_results.append(epoch_accuracy.result())

        # Run a validation loop at the end of each epoch.
        for x_batch_val, y_batch_val in d_val:
            val_logits = model(x_batch_val, training=False)
            val_loss = loss_func(y_batch_val, val_logits)

            # Update metrics
            val_epoch_loss_avg(val_loss)
            val_epoch_accuracy(y_batch_val, val_logits)

        val_acc = val_epoch_accuracy.result()
        val_loss = val_epoch_loss_avg.result()
        print('Epoch: {}; Validation loss {}; acc: {}'.format(epoch, val_loss, val_acc))

        validation_loss_results.append(val_loss)
        validation_accuracy_results.append(val_acc)

        if float(val_loss) < best_val_loss:
            best_val_loss = float(val_loss)
            model.save_weights(checkpoint_path)  # Save only the weights
            checkpoint_saved = True

    # Restore the best weights seen during training, if any were written.
    if checkpoint_saved:
        model.load_weights(checkpoint_path)
    else:
        print("No checkpoint was saved during training; keeping current weights")

    # Evaluate model on the test set
    if args.test_tfrecord is not None:
        test_epoch_loss_avg = tf.keras.metrics.Mean()
        test_epoch_accuracy = tf.keras.metrics.SparseCategoricalAccuracy()

        y_actual_test = []
        y_pred_test = []

        d_test = make_dataset(args.test_tfrecord,
                              lookup_table,
                              1,
                              1,
                              1)

        for x_batch_test, y_batch_test in d_test:
            test_logits = model(x_batch_test, training=False)
            test_loss = loss_func(y_batch_test, test_logits)

            # For the confusion matrix: store plain integers, not tensors
            y_pred = tf.argmax(test_logits, axis=-1)
            y_pred_test.extend(y_pred.numpy())
            y_actual_test.extend(y_batch_test.numpy())

            # Update metrics
            test_epoch_loss_avg(test_loss)
            test_epoch_accuracy(y_batch_test, test_logits)

        test_acc = test_epoch_accuracy.result()
        test_loss = test_epoch_loss_avg.result()
        print('Test loss {}; acc: {}'.format(test_loss, test_acc))

        cm = confusion_matrix(y_actual_test, y_pred_test)
        print("Confusion Matrix:\n {}".format(cm))
def _parse_tfrecord_function(example, lookup_table):
    """Decode one serialized example into (token IDs, label).

    The 'opcodes' string feature is split on whitespace and every token is
    mapped through ``lookup_table.lookup``; the 'label' int64 feature is
    returned unchanged.
    """
    feature_spec = {
        'opcodes': tf.io.FixedLenFeature([], tf.string),
        'label': tf.io.FixedLenFeature([], tf.int64)
    }
    record = tf.io.parse_single_example(example, feature_spec)
    opcode_tokens = text.WhitespaceTokenizer().tokenize(record['opcodes'])
    return lookup_table.lookup(opcode_tokens), record['label']


def make_dataset(filepath, lookup_table, SHUFFLE_BUFFER_SIZE=1024, BATCH_SIZE=32, EPOCHS=5):
    """Build the input pipeline for a TFRecord file.

    Records are shuffled, repeated EPOCHS times, decoded with
    ``_parse_tfrecord_function`` and finally grouped into batches of
    BATCH_SIZE elements.
    """
    def _decode(serialized):
        # Bind the lookup table so the mapper only takes the raw record.
        return _parse_tfrecord_function(serialized, lookup_table)

    return (tf.data.TFRecordDataset(filepath)
            .shuffle(SHUFFLE_BUFFER_SIZE)
            .repeat(EPOCHS)
            .map(_decode)
            .batch(batch_size=BATCH_SIZE))
def _pad_or_truncate(opcodes, max_mnemonics):
    """Return a new list of tokens cut to max_mnemonics and right-padded with "PAD"."""
    fitted = list(opcodes[:max_mnemonics])
    fitted.extend(["PAD"] * (max_mnemonics - len(fitted)))
    return fitted


def dataset_to_tfrecords(pe_filepath,
                         tfrecords_filepath,
                         labels_filepath,
                         vocabulary_mapping_filepath,
                         max_mnemonics):
    """Serialize the opcode sequence of every labelled sample into one TFRecord file.

    Parameters
    ----------
    pe_filepath: str
        Directory holding the "<Id>.asm" files
    tfrecords_filepath: str
        Output TFRecord file
    labels_filepath: str
        CSV file with an "Id" and a "Class" column; its first row is treated
        as a header and skipped
    vocabulary_mapping_filepath: str
        JSON file with the mnemonic vocabulary handed to MetaPHOR
    max_mnemonics: int
        Exact number of tokens stored per sample (truncated or "PAD"-padded)
    """
    vocabulary_mapping = load_vocabulary(vocabulary_mapping_filepath)

    # Both resources are context-managed so the writer is flushed and closed
    # even if a sample fails midway; newline="" is what the csv module expects.
    with tf.io.TFRecordWriter(tfrecords_filepath) as tfwriter, \
            open(labels_filepath, "r", newline="") as labels_file:
        reader = csv.DictReader(labels_file, fieldnames=["Id",
                                                         "Class"])
        # Skip the header row without failing on an empty file
        next(reader, None)
        for i, row in enumerate(reader):
            print("{};{}".format(i, row['Id']))
            # os.path.join works whether or not pe_filepath ends with a separator
            metaPHOR = MetaPHOR(os.path.join(pe_filepath, row['Id'] + ".asm"))
            opcodes = metaPHOR.get_opcodes_data_as_list(vocabulary_mapping)

            raw_mnemonics = " ".join(_pad_or_truncate(opcodes, max_mnemonics))

            # Labels in the CSV are 1-based; the stored label is 0-based
            example = serialize_mnemonics_example(raw_mnemonics, int(row['Class'])-1)
            tfwriter.write(example)


if __name__ == "__main__":
    parser = argparse.ArgumentParser(description='Mnemonics-based TFWriter Script')
    parser.add_argument("pe_filepath",
                        type=str,
                        help="Filepath describing the location of the pe files in asm format")
    parser.add_argument("tfrecords_filepath",
                        type=str,
                        help="Where the TFRecord files will be stores")
    parser.add_argument("labels_filepath",
                        type=str,
                        help="CSV filepath containing the ID and class of each PE file in pe_filepath")
    parser.add_argument("vocabulary_mapping_filepath",
                        type=str,
                        help="Filepath describing the vocabulary mapping between mnemonics and IDs")
    parser.add_argument("--max_mnemonics",
                        type=int,
                        help="Maximum number of mnemonics per file",
                        default=50000)
    args = parser.parse_args()
    dataset_to_tfrecords(args.pe_filepath,
                         args.tfrecords_filepath,
                         args.labels_filepath,
                         args.vocabulary_mapping_filepath,
                         args.max_mnemonics)
-------------------------------------------------------------------------------- /src/method/opcodes_component/vocabulary/mnemonics_inverse_vocabulary_mapping_min=3.json: -------------------------------------------------------------------------------- 1 | {"0": "UNK", "1": "NONE", "2": "PAD", "3": "andps", "4": "psubusb", "5": "por", "6": "jnz", "7": "fxch4", "8": "pabsw", "9": "bsr", "10": "setl", "11": "paddw", "12": "fxsave", "13": "vmwrite", "14": "bound", "15": "jo", "16": "movhps", "17": "into", "18": "vmovd", "19": "pxor", "20": "vinsertf128", "21": "vmovdqa", "22": "paddsw", "23": "fist", "24": "orps", "25": "vmovapd", "26": "paddusw", "27": "roundps", "28": "fsub", "29": "bsf", "30": "fcmovu", "31": "vorpd", "32": "packuswb", "33": "unpckhpd", "34": "shufpd", "35": "pminuw", "36": "psubq", "37": "arpl", "38": "pandn", "39": "extractps", "40": "vpunpckhbw", "41": "vcvtss2sd", "42": "vpmullw", "43": "bswap", "44": "rcr", "45": "pmovzxbd", "46": "movsd", "47": "lddqu", "48": "movhpd", "49": "packusdw", "50": "jns", "51": "pshuflw", "52": "jnb", "53": "psllw", "54": "cvttps2dq", "55": "cmova", "56": "psrlq", "57": "setnbe", "58": "fcmovbe", "59": "leave", "60": "punpckldq", "61": "ffreep", "62": "phaddd", "63": "cmovb", "64": "insertps", "65": "scas", "66": "cmpps", "67": "sub", "68": "punpckhqdq", "69": "xchg", "70": "fcmovnbe", "71": "rol", "72": "pmulhrsw", "73": "fstsw", "74": "movnti", "75": "jnp", "76": "setz", "77": "int", "78": "minsd", "79": "paddusb", "80": "mulps", "81": "fcompp", "82": "vextractf128", "83": "lar", "84": "psrld", "85": "seto", "86": "vperm2f128", "87": "js", "88": "fadd", "89": "not", "90": "jmp", "91": "lss", "92": "paddq", "93": "fcmovnb", "94": "pfsubr", "95": "psrlw", "96": "punpckhdq", "97": "sqrtss", "98": "jg", "99": "pavgw", "100": "rcpss", "101": "cvtsd2si", "102": "pmaddwd", "103": "ucomisd", "104": "btr", "105": "cmovbe", "106": "vmovsd", "107": "jge", "108": "cmpneqps", "109": "vpaddw", "110": "sbb", "111": "cmovp", 
"112": "fcomi", "113": "idiv", "114": "pavgusb", "115": "jl", "116": "psubusw", "117": "fsubr", "118": "btc", "119": "pfadd", "120": "pshufb", "121": "addps", "122": "pand", "123": "imul", "124": "lidt", "125": "pshufhw", "126": "movaps", "127": "fild", "128": "lfs", "129": "addsd", "130": "lgs", "131": "addpd", "132": "vmptrst", "133": "vpcmpeqw", "134": "pshufd", "135": "fidivr", "136": "divss", "137": "prefetcht2", "138": "psrldq", "139": "shl", "140": "psraw", "141": "cvtdq2pd", "142": "pcmpgtw", "143": "vmovddup", "144": "subpd", "145": "movsx", "146": "vpcmpgtw", "147": "xor", "148": "fmul", "149": "cmp", "150": "movss", "151": "retnw", "152": "vpextrw", "153": "femms", "154": "rcl", "155": "wait", "156": "pslld", "157": "orpd", "158": "vunpckhps", "159": "pinsrd", "160": "xbegin", "161": "inc", "162": "ficom", "163": "sgdt", "164": "test", "165": "setbe", "166": "mpsadbw", "167": "prefetchnta", "168": "cmpleps", "169": "pcmpeqd", "170": "cmovl", "171": "ffree", "172": "fdivp", "173": "cmpltps", "174": "cvttsd2si", "175": "pmaxub", "176": "vsqrtpd", "177": "fistp", "178": "fnstenv", "179": "punpcklbw", "180": "vpmaddwd", "181": "fxch", "182": "cvtpd2ps", "183": "ucomiss", "184": "setno", "185": "sar", "186": "maxps", "187": "pmuludq", "188": "cvtps2pd", "189": "vpaddusw", "190": "fsubrp", "191": "movlps", "192": "fld1", "193": "jb", "194": "and", "195": "jbe", "196": "pcmpeqw", "197": "fnsave", "198": "cmovo", "199": "out", "200": "frstor", "201": "ldmxcsr", "202": "div", "203": "cvtpi2ps", "204": "mulpd", "205": "add", "206": "pextrw", "207": "movq", "208": "pextrb", "209": "punpcklwd", "210": "shufps", "211": "fdivr", "212": "cvtsd2ss", "213": "adc", "214": "movq2dq", "215": "cmpltpd", "216": "paddb", "217": "cmps", "218": "addss", "219": "vpunpckldq", "220": "pmovzxwd", "221": "vaddps", "222": "cmovno", "223": "pfmul", "224": "pmaxsw", "225": "subss", "226": "cvtsi2ss", "227": "subsd", "228": "cdq", "229": "stmxcsr", "230": "vmptrld", "231": "setnl", 
"232": "shrd", "233": "push", "234": "or", "235": "fucompp", "236": "cvtdq2ps", "237": "vpshufhw", "238": "fnstsw", "239": "pmulhw", "240": "vmulps", "241": "xlat", "242": "cvtsi2sd", "243": "cmovnp", "244": "subps", "245": "pslldq", "246": "ror", "247": "fcmovne", "248": "pfrcpit1", "249": "mulss", "250": "psadbw", "251": "movmskps", "252": "fcomp", "253": "fnstcw", "254": "jp", "255": "divsd", "256": "movhlps", "257": "fmulp", "258": "fdiv", "259": "pfrcpit2", "260": "jno", "261": "setnz", "262": "fld", "263": "in", "264": "paddd", "265": "cmpeqsd", "266": "str", "267": "lds", "268": "setnb", "269": "aam", "270": "packsswb", "271": "sets", "272": "pminsw", "273": "vdivsd", "274": "xorpd", "275": "faddp", "276": "lldt", "277": "andnpd", "278": "pmovmskb", "279": "fucomp", "280": "retn", "281": "mov", "282": "fsubp", "283": "sqrtsd", "284": "clflush", "285": "pshufw", "286": "vrcpss", "287": "enter", "288": "vsubps", "289": "psubsb", "290": "fstp", "291": "movntdq", "292": "punpckhwd", "293": "cvtps2pi", "294": "fiadd", "295": "pfcmpge", "296": "setnp", "297": "pfrcp", "298": "psubw", "299": "movdq2q", "300": "lock", "301": "cmpsw", "302": "phaddw", "303": "sldt", "304": "pfsub", "305": "comisd", "306": "punpcklqdq", "307": "vcvttpd2dq", "308": "loopne", "309": "pop", "310": "lods", "311": "xadd", "312": "pblendvb", "313": "pcmpgtd", "314": "xorps", "315": "fcmovb", "316": "cmovge", "317": "movups", "318": "comiss", "319": "vdivps", "320": "pfnacc", "321": "fidiv", "322": "fucomi", "323": "fldenv", "324": "vmovaps", "325": "movntps", "326": "prefetchw", "327": "pmaddubsw", "328": "psubd", "329": "cmovns", "330": "pmulhuw", "331": "cmovs", "332": "setb", "333": "movlhps", "334": "movdqu", "335": "movzx", "336": "movmskpd", "337": "setle", "338": "psubsw", "339": "vshufps", "340": "prefetcht0", "341": "cmpnlesd", "342": "loop", "343": "cmpltsd", "344": "pswapd", "345": "vandnpd", "346": "unpcklpd", "347": "cld", "348": "cmovle", "349": "fldcw", "350": "lea", "351": 
"cmpxchg", "352": "fldz", "353": "vminsd", "354": "fcom", "355": "jz", "356": "cmpnlepd", "357": "shr", "358": "pminub", "359": "paddsb", "360": "fimul", "361": "vpsrad", "362": "rsqrtps", "363": "fxch7", "364": "minps", "365": "psubb", "366": "nop", "367": "call", "368": "minss", "369": "cvtss2sd", "370": "cvttps2pi", "371": "psrad", "372": "movntq", "373": "divps", "374": "vmovdqu", "375": "shld", "376": "cmplesd", "377": "stos", "378": "cvtss2si", "379": "vblendps", "380": "bts", "381": "mul", "382": "fisub", "383": "fcmove", "384": "retf", "385": "andpd", "386": "pinsrw", "387": "cmovg", "388": "pmullw", "389": "retfw", "390": "rcpps", "391": "vmread", "392": "sqrtps", "393": "maxss", "394": "jcxz", "395": "lsl", "396": "vpsllw", "397": "fisttp", "398": "packssdw", "399": "unpckhps", "400": "aad", "401": "lgdt", "402": "bt", "403": "vpermilps", "404": "pfmax", "405": "pfcmpgt", "406": "fucom", "407": "unpcklps", "408": "fbstp", "409": "psllq", "410": "movapd", "411": "palignr", "412": "fst", "413": "align", "414": "pextrd", "415": "vxorps", "416": "cvtps2dq", "417": "dec", "418": "ja", "419": "sal", "420": "movlpd", "421": "fisubr", "422": "phminposuw", "423": "andnps", "424": "pi2fd", "425": "movdqa", "426": "cmpxchg8b", "427": "vunpcklps", "428": "setnle", "429": "pf2id", "430": "fbld", "431": "jecxz", "432": "ficomp", "433": "punpckhbw", "434": "movd", "435": "fdivrp", "436": "loope", "437": "setp", "438": "mulsd", "439": "rsqrtss", "440": "pavgb", "441": "vmovups", "442": "fcmovnu", "443": "cvttss2si", "444": "les", "445": "xabort", "446": "neg", "447": "cmpneqpd", "448": "vpshuflw", "449": "pcmpeqb", "450": "setns", "451": "jle", "452": "pcmpgtb", "453": "dw", "454": "db", "455": "dd", "456": "stop_line", "457": "loc_", "458": "sub_", "459": "endp"} -------------------------------------------------------------------------------- /src/method/opcodes_component/vocabulary/mnemonics_vocabulary_mapping_min=3.json: 
-------------------------------------------------------------------------------- 1 | {"UNK": 0, "NONE": 1, "PAD": 2, "andps": 3, "psubusb": 4, "por": 5, "jnz": 6, "fxch4": 7, "pabsw": 8, "bsr": 9, "setl": 10, "paddw": 11, "fxsave": 12, "vmwrite": 13, "bound": 14, "jo": 15, "movhps": 16, "into": 17, "vmovd": 18, "pxor": 19, "vinsertf128": 20, "vmovdqa": 21, "paddsw": 22, "fist": 23, "orps": 24, "vmovapd": 25, "paddusw": 26, "roundps": 27, "fsub": 28, "bsf": 29, "fcmovu": 30, "vorpd": 31, "packuswb": 32, "unpckhpd": 33, "shufpd": 34, "pminuw": 35, "psubq": 36, "arpl": 37, "pandn": 38, "extractps": 39, "vpunpckhbw": 40, "vcvtss2sd": 41, "vpmullw": 42, "bswap": 43, "rcr": 44, "pmovzxbd": 45, "movsd": 46, "lddqu": 47, "movhpd": 48, "packusdw": 49, "jns": 50, "pshuflw": 51, "jnb": 52, "psllw": 53, "cvttps2dq": 54, "cmova": 55, "psrlq": 56, "setnbe": 57, "fcmovbe": 58, "leave": 59, "punpckldq": 60, "ffreep": 61, "phaddd": 62, "cmovb": 63, "insertps": 64, "scas": 65, "cmpps": 66, "sub": 67, "punpckhqdq": 68, "xchg": 69, "fcmovnbe": 70, "rol": 71, "pmulhrsw": 72, "fstsw": 73, "movnti": 74, "jnp": 75, "setz": 76, "int": 77, "minsd": 78, "paddusb": 79, "mulps": 80, "fcompp": 81, "vextractf128": 82, "lar": 83, "psrld": 84, "seto": 85, "vperm2f128": 86, "js": 87, "fadd": 88, "not": 89, "jmp": 90, "lss": 91, "paddq": 92, "fcmovnb": 93, "pfsubr": 94, "psrlw": 95, "punpckhdq": 96, "sqrtss": 97, "jg": 98, "pavgw": 99, "rcpss": 100, "cvtsd2si": 101, "pmaddwd": 102, "ucomisd": 103, "btr": 104, "cmovbe": 105, "vmovsd": 106, "jge": 107, "cmpneqps": 108, "vpaddw": 109, "sbb": 110, "cmovp": 111, "fcomi": 112, "idiv": 113, "pavgusb": 114, "jl": 115, "psubusw": 116, "fsubr": 117, "btc": 118, "pfadd": 119, "pshufb": 120, "addps": 121, "pand": 122, "imul": 123, "lidt": 124, "pshufhw": 125, "movaps": 126, "fild": 127, "lfs": 128, "addsd": 129, "lgs": 130, "addpd": 131, "vmptrst": 132, "vpcmpeqw": 133, "pshufd": 134, "fidivr": 135, "divss": 136, "prefetcht2": 137, "psrldq": 138, "shl": 139, 
"psraw": 140, "cvtdq2pd": 141, "pcmpgtw": 142, "vmovddup": 143, "subpd": 144, "movsx": 145, "vpcmpgtw": 146, "xor": 147, "fmul": 148, "cmp": 149, "movss": 150, "retnw": 151, "vpextrw": 152, "femms": 153, "rcl": 154, "wait": 155, "pslld": 156, "orpd": 157, "vunpckhps": 158, "pinsrd": 159, "xbegin": 160, "inc": 161, "ficom": 162, "sgdt": 163, "test": 164, "setbe": 165, "mpsadbw": 166, "prefetchnta": 167, "cmpleps": 168, "pcmpeqd": 169, "cmovl": 170, "ffree": 171, "fdivp": 172, "cmpltps": 173, "cvttsd2si": 174, "pmaxub": 175, "vsqrtpd": 176, "fistp": 177, "fnstenv": 178, "punpcklbw": 179, "vpmaddwd": 180, "fxch": 181, "cvtpd2ps": 182, "ucomiss": 183, "setno": 184, "sar": 185, "maxps": 186, "pmuludq": 187, "cvtps2pd": 188, "vpaddusw": 189, "fsubrp": 190, "movlps": 191, "fld1": 192, "jb": 193, "and": 194, "jbe": 195, "pcmpeqw": 196, "fnsave": 197, "cmovo": 198, "out": 199, "frstor": 200, "ldmxcsr": 201, "div": 202, "cvtpi2ps": 203, "mulpd": 204, "add": 205, "pextrw": 206, "movq": 207, "pextrb": 208, "punpcklwd": 209, "shufps": 210, "fdivr": 211, "cvtsd2ss": 212, "adc": 213, "movq2dq": 214, "cmpltpd": 215, "paddb": 216, "cmps": 217, "addss": 218, "vpunpckldq": 219, "pmovzxwd": 220, "vaddps": 221, "cmovno": 222, "pfmul": 223, "pmaxsw": 224, "subss": 225, "cvtsi2ss": 226, "subsd": 227, "cdq": 228, "stmxcsr": 229, "vmptrld": 230, "setnl": 231, "shrd": 232, "push": 233, "or": 234, "fucompp": 235, "cvtdq2ps": 236, "vpshufhw": 237, "fnstsw": 238, "pmulhw": 239, "vmulps": 240, "xlat": 241, "cvtsi2sd": 242, "cmovnp": 243, "subps": 244, "pslldq": 245, "ror": 246, "fcmovne": 247, "pfrcpit1": 248, "mulss": 249, "psadbw": 250, "movmskps": 251, "fcomp": 252, "fnstcw": 253, "jp": 254, "divsd": 255, "movhlps": 256, "fmulp": 257, "fdiv": 258, "pfrcpit2": 259, "jno": 260, "setnz": 261, "fld": 262, "in": 263, "paddd": 264, "cmpeqsd": 265, "str": 266, "lds": 267, "setnb": 268, "aam": 269, "packsswb": 270, "sets": 271, "pminsw": 272, "vdivsd": 273, "xorpd": 274, "faddp": 275, "lldt": 276, 
"andnpd": 277, "pmovmskb": 278, "fucomp": 279, "retn": 280, "mov": 281, "fsubp": 282, "sqrtsd": 283, "clflush": 284, "pshufw": 285, "vrcpss": 286, "enter": 287, "vsubps": 288, "psubsb": 289, "fstp": 290, "movntdq": 291, "punpckhwd": 292, "cvtps2pi": 293, "fiadd": 294, "pfcmpge": 295, "setnp": 296, "pfrcp": 297, "psubw": 298, "movdq2q": 299, "lock": 300, "cmpsw": 301, "phaddw": 302, "sldt": 303, "pfsub": 304, "comisd": 305, "punpcklqdq": 306, "vcvttpd2dq": 307, "loopne": 308, "pop": 309, "lods": 310, "xadd": 311, "pblendvb": 312, "pcmpgtd": 313, "xorps": 314, "fcmovb": 315, "cmovge": 316, "movups": 317, "comiss": 318, "vdivps": 319, "pfnacc": 320, "fidiv": 321, "fucomi": 322, "fldenv": 323, "vmovaps": 324, "movntps": 325, "prefetchw": 326, "pmaddubsw": 327, "psubd": 328, "cmovns": 329, "pmulhuw": 330, "cmovs": 331, "setb": 332, "movlhps": 333, "movdqu": 334, "movzx": 335, "movmskpd": 336, "setle": 337, "psubsw": 338, "vshufps": 339, "prefetcht0": 340, "cmpnlesd": 341, "loop": 342, "cmpltsd": 343, "pswapd": 344, "vandnpd": 345, "unpcklpd": 346, "cld": 347, "cmovle": 348, "fldcw": 349, "lea": 350, "cmpxchg": 351, "fldz": 352, "vminsd": 353, "fcom": 354, "jz": 355, "cmpnlepd": 356, "shr": 357, "pminub": 358, "paddsb": 359, "fimul": 360, "vpsrad": 361, "rsqrtps": 362, "fxch7": 363, "minps": 364, "psubb": 365, "nop": 366, "call": 367, "minss": 368, "cvtss2sd": 369, "cvttps2pi": 370, "psrad": 371, "movntq": 372, "divps": 373, "vmovdqu": 374, "shld": 375, "cmplesd": 376, "stos": 377, "cvtss2si": 378, "vblendps": 379, "bts": 380, "mul": 381, "fisub": 382, "fcmove": 383, "retf": 384, "andpd": 385, "pinsrw": 386, "cmovg": 387, "pmullw": 388, "retfw": 389, "rcpps": 390, "vmread": 391, "sqrtps": 392, "maxss": 393, "jcxz": 394, "lsl": 395, "vpsllw": 396, "fisttp": 397, "packssdw": 398, "unpckhps": 399, "aad": 400, "lgdt": 401, "bt": 402, "vpermilps": 403, "pfmax": 404, "pfcmpgt": 405, "fucom": 406, "unpcklps": 407, "fbstp": 408, "psllq": 409, "movapd": 410, "palignr": 411, 
"fst": 412, "align": 413, "pextrd": 414, "vxorps": 415, "cvtps2dq": 416, "dec": 417, "ja": 418, "sal": 419, "movlpd": 420, "fisubr": 421, "phminposuw": 422, "andnps": 423, "pi2fd": 424, "movdqa": 425, "cmpxchg8b": 426, "vunpcklps": 427, "setnle": 428, "pf2id": 429, "fbld": 430, "jecxz": 431, "ficomp": 432, "punpckhbw": 433, "movd": 434, "fdivrp": 435, "loope": 436, "setp": 437, "mulsd": 438, "rsqrtss": 439, "pavgb": 440, "vmovups": 441, "fcmovnu": 442, "cvttss2si": 443, "les": 444, "xabort": 445, "neg": 446, "cmpneqpd": 447, "vpshuflw": 448, "pcmpeqb": 449, "setns": 450, "jle": 451, "pcmpgtb": 452, "dw": 453, "db": 454, "dd": 455, "stop_line": 456, "loc_": 457, "sub_": 458, "endp": 459} -------------------------------------------------------------------------------- /src/method/utils.py: -------------------------------------------------------------------------------- 1 | import json 2 | import tensorflow as tf 3 | import numpy as np 4 | import math 5 | 6 | def initialize_TFRecords(tfrecords_filepath, num_tfrecords=10, filename="training"): 7 | training_writers = [] 8 | for i in range(num_tfrecords): 9 | training_writers.append(tf.io.TFRecordWriter(tfrecords_filepath + "{}{}.tfrecords".format(filename,i))) 10 | return training_writers 11 | 12 | def create_lookup_table(vocabulary_mapping, num_oov_buckets): 13 | keys = [k for k in vocabulary_mapping.keys()] 14 | values = [tf.constant(vocabulary_mapping[k], dtype=tf.int64) for k in keys] 15 | 16 | table = tf.lookup.StaticVocabularyTable( 17 | tf.lookup.KeyValueTensorInitializer( 18 | keys=keys, 19 | values=values 20 | ), 21 | num_oov_buckets 22 | ) 23 | return table 24 | 25 | def _bytes_feature(value): 26 | return tf.train.Feature(bytes_list=tf.train.BytesList(value=[value])) 27 | 28 | def _int64_feature(value): 29 | return tf.train.Feature(int64_list=tf.train.Int64List(value=[value])) 30 | 31 | 32 | def load_vocabulary(vocabulary_filepath): 33 | """ 34 | It reads and stores in a dictionary-like structure the data from 
the file passed as argument 35 | 36 | Parameters 37 | ---------- 38 | vocabulary_filepath: str 39 | JSON-like file 40 | 41 | Return 42 | ------ 43 | vocabulary_dict: dict 44 | """ 45 | with open(vocabulary_filepath, "r") as vocab_file: 46 | vocabulary_dict = json.load(vocab_file) 47 | return vocabulary_dict 48 | 49 | def serialize_mnemonics_example_IDs(mnemonic_IDs, label): 50 | """ 51 | Creates a tf.Example message ready to be written to a file 52 | :param mnemonics: str -> "[4,67,109,...,402, 402]" 53 | :param label: int [0,8] 54 | :return: 55 | """ 56 | feature = { 57 | 'opcodes': _bytes_feature(np.array(mnemonic_IDs).tostring()), 58 | 'label': _int64_feature(label) 59 | } 60 | example_proto = tf.train.Example(features=tf.train.Features(feature=feature)) 61 | return example_proto.SerializeToString() 62 | 63 | def serialize_mnemonics_example(mnemonics, label): 64 | """ 65 | Creates a tf.Example message ready to be written to a file 66 | :param mnemonics: str -> "push,pop,...,NONE" 67 | :param label: int [0,8] 68 | :return: 69 | """ 70 | feature = { 71 | 'opcodes': _bytes_feature(mnemonics.encode('UTF-8')), 72 | 'label': _int64_feature(label) 73 | } 74 | example_proto = tf.train.Example(features=tf.train.Features(feature=feature)) 75 | return example_proto.SerializeToString() 76 | 77 | def serialize_bytes_example(bytes, label): 78 | """ 79 | Creates a tf.Example message ready to be written to a file 80 | :param bytes: str -> "00,FF,...,??,NONE" 81 | :param label: int [0,8] 82 | :return: 83 | """ 84 | feature = { 85 | 'bytes': _bytes_feature(bytes.encode('UTF-8')), 86 | 'label': _int64_feature(label) 87 | } 88 | example_proto = tf.train.Example(features=tf.train.Features(feature=feature)) 89 | return example_proto.SerializeToString() 90 | 91 | def serialize_apis_example(feature_vector, label): 92 | feature = { 93 | 'APIs': _bytes_feature(feature_vector.tostring()), 94 | 'label': _int64_feature(label) 95 | } 96 | example_proto = 
tf.train.Example(features=tf.train.Features(feature=feature)) 97 | return example_proto.SerializeToString() 98 | 99 | def serialize_hydra_example(opcodes, bytes, apis_values, label): 100 | feature = { 101 | 'opcodes': _bytes_feature(opcodes.encode('UTF-8')), 102 | 'bytes': _bytes_feature(bytes.encode('UTF-8')), 103 | 'APIs': _bytes_feature(apis_values.tostring()), 104 | 'label': _int64_feature(label) 105 | } 106 | example_proto = tf.train.Example(features=tf.train.Features(feature=feature)) 107 | return example_proto.SerializeToString() 108 | 109 | def load_parameters(parameters_path): 110 | """ 111 | It loads the network parameters 112 | 113 | Parameters 114 | ---------- 115 | parameters_path: str 116 | File containing the parameters of the network 117 | """ 118 | with open(parameters_path, "r") as param_file: 119 | params = json.load(param_file) 120 | return params 121 | --------------------------------------------------------------------------------