├── Classification_LSTM_deeplog.py
├── Classification_LSTM_model_def_training.py
└── README.md


/Classification_LSTM_deeplog.py:
--------------------------------------------------------------------------------
import numpy as np
from keras.models import load_model

import tkinter as tk
from tkinter import messagebox


def getting_data(log_path, labels_ids_path, keys_path):

    """ Read the input files.

    Reads the three input files and collects their non-empty lines.

    Args:
        log_path: path to the workflow-of-keys file.
        labels_ids_path: path to the workflow-of-key-ids file.
        keys_path: path to the list-of-keys file.

    Returns:
        Three lists of strings, one per input file.

    Raises:
        FileNotFoundError: the file or directory does not exist.
        IsADirectoryError: a directory was given where a file was expected.
        PermissionError: not enough access rights.

    """

    with open(log_path) as log_file:
        log_list = [line.strip('\n') for line in log_file]
    log_list = [line for line in log_list if line]

    with open(labels_ids_path) as label_ids_file:
        label_ids_list = [line.strip('\n') for line in label_ids_file]
    label_ids_list = [line for line in label_ids_list if line]

    with open(keys_path) as keys_file:
        keys_list = [line.strip('\n') for line in keys_file]
    keys_list = [line for line in keys_list if line]

    return log_list, label_ids_list, keys_list


def prediction_and_comparison(model, sequence, labels_ids,
                              labels, h, l, input_vocab_size):

    """ Predict the workflow and compare it with the real one.

    Builds dictionaries for converting keys from string to int and back,
    takes the first h tokens of the sequence as the seed, and then, for
    l steps, predicts the next token and compares it with the real one.
    On a mismatch the user is asked whether the deviation is acceptable;
    accepted alternatives are stored in labels_accordance_dict so the
    same question is not asked again. After each step the seed is
    shifted by one element: the predicted value is appended and the
    oldest element is popped.

    Args:
        model: the trained Keras model (layers, weights, batch size).
        sequence: list of string keys; provides the seed and the ground
            truth for comparison.
        labels_ids: list of strings of the key-ids workflow (kept for
            interface compatibility, currently unused).
        labels: list of the known string keys.
        h: int length of the token sequence used for prediction.
        l: int length of the predicted sequence.
        input_vocab_size: number of distinct keys, used to scale inputs.
        labels_dict: dictionary for converting string keys to ints.
        inverted_labels_dict: dictionary for the inverse conversion.
        labels_accordance_dict: dictionary of accepted deviations
            between predicted and real workflow keys.
        seed: current window of int-encoded keys used as model input.
        res_seq: resulting list of string keys.

    Returns:
        List of strings containing the initial seed followed by the
        predicted values.

    """

    labels_dict = {label: i for i, label in enumerate(labels)}
    inverted_labels_dict = {v: k for k, v in labels_dict.items()}
    labels_accordance_dict = {label: [] for label in labels}

    # Encode the whole sequence, keep the first h tokens as the seed,
    # then decode back so the comparisons below work on string keys.
    sequence = [labels_dict[item] for item in sequence]
    seed = sequence[0: h]
    sequence = [inverted_labels_dict[item] for item in sequence]

    res_seq = [inverted_labels_dict[item] for item in seed]

    # Never read past the end of the real sequence when comparing.
    for i in range(0, min(l, len(sequence) - h)):
        seq = np.reshape(seed, (1, len(seed), 1))
        seq = seq / float(input_vocab_size)
        pr = model.predict(seq, verbose=0)
        index = np.argmax(pr)
        pred = inverted_labels_dict[index]
        if sequence[h + i] != pred:
            if pred not in labels_accordance_dict[sequence[h + i]]:
                user_answer = messagebox.askyesno(
                    "Warning!",
                    f"Anomaly detected, is it okay?\n"
                    f"{sequence[h + i]} == {pred}")
                if user_answer:
                    labels_accordance_dict[sequence[h + i]].append(pred)
                else:
                    messagebox.showerror("Warning!", "Need to fix!!!")
        res_seq.append(pred)
        seed.append(labels_dict[pred])
        seed.pop(0)

    return res_seq
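# Illustrative sketch (hypothetical keys, not data from the repository):
# one step of the sliding-window prediction above, with h = 3.
#
#   seed = ['open', 'read', 'read']   ->  encoded [0, 1, 1]
#   model predicts 'close', while the real next token is 'write'
#   -> mismatch: the user is asked whether the deviation is acceptable
#   seed = ['read', 'read', 'close']  (shifted by one element)
#
# The window always keeps the h most recent tokens, so the cost of one
# prediction step does not grow with the length of the log.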
def main():

    """
    Reads the prediction data and calls the prediction function.

    Args:
        l: int length of the predicted sequence.
        h: int length of the token sequence for training and prediction.
        input_vocab_size: number of distinct training keys.
        pred_log_list: list of strings of the workflow of keys.
        pred_label_ids_list: list of strings of the key-ids workflow.
        pred_keys_list: list of strings of the keys.
        pred_seq: resulting list of predicted string keys.

    """

    l = 15

    log_predict_path = 'C:/Users/vgolubch/Desktop/LSTMtest/LSTM_deeplog/Workflow.txt'
    label_ids_predict_path = 'C:/Users/vgolubch/Desktop/LSTMtest/LSTM_deeplog/WorkflowID.txt'
    keys_predict_path = 'C:/Users/vgolubch/Desktop/LSTMtest/LSTM_deeplog/LogKeys.txt'

    model = load_model('C:/Users/vgolubch/Desktop/LSTMtest/LSTM_deeplog/model_config.h5')

    model_info_path = 'C:/Users/vgolubch/Desktop/LSTMtest/LSTM_deeplog/model_info.txt'

    with open(model_info_path) as model_info_file:
        model_info_list = [line.strip('\n') for line in model_info_file]

    # model_info.txt layout (written by the training script): vocabulary
    # size, classes count, window length h, feature dimension.
    input_vocab_size = int(model_info_list[0])
    h = int(model_info_list[2])

    # Hide the implicit Tk root window so only the message boxes appear.
    root = tk.Tk()
    root.withdraw()

    pred_log_list, pred_label_ids_list, pred_keys_list = getting_data(
        log_predict_path, label_ids_predict_path, keys_predict_path)

    pred_seq = prediction_and_comparison(model, pred_log_list,
        pred_label_ids_list, pred_keys_list, h, l, input_vocab_size)

    print('\n'.join(pred_seq))


if __name__ == "__main__":
    main()
--------------------------------------------------------------------------------
/Classification_LSTM_model_def_training.py:
--------------------------------------------------------------------------------
import numpy as np
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Dropout
from keras.layers import LSTM
from keras.callbacks import ModelCheckpoint
from keras.utils import np_utils

from sklearn.model_selection import train_test_split


def getting_data(log_path, labels_ids_path, keys_path):

    """ Read the input files.

    Reads the three input files and collects their non-empty lines.

    Args:
        log_path: path to the workflow-of-keys file.
        labels_ids_path: path to the workflow-of-key-ids file.
        keys_path: path to the list-of-keys file.

    Returns:
        Three lists of strings, one per input file.

    Raises:
        FileNotFoundError: the file or directory does not exist.
        IsADirectoryError: a directory was given where a file was expected.
        PermissionError: not enough access rights.

    """

    with open(log_path) as log_file:
        log_list = [line.strip('\n') for line in log_file]
    log_list = [line for line in log_list if line]

    with open(labels_ids_path) as label_ids_file:
        label_ids_list = [line.strip('\n') for line in label_ids_file]
    label_ids_list = [line for line in label_ids_list if line]

    with open(keys_path) as keys_file:
        keys_list = [line.strip('\n') for line in keys_file]
    keys_list = [line for line in keys_list if line]

    return log_list, label_ids_list, keys_list


def data_preprocessing(h, train_log_list, train_label_ids_list,
                       train_keys_list):

    """ Preprocess the input data.

    Builds a dictionary mapping string keys to ints, forms sliding
    windows of the int-encoded workflow, and vectorizes them into the
    3-dimensional array expected by the network input.

    Args:
        h: length of the token sequence for training and prediction.
        train_log_list: list of strings of the training workflow of keys.
        train_label_ids_list: list of strings of the training key-ids
            workflow.
        train_keys_list: list of strings of the training keys.
        dataX: list of int-encoded windows of the keys workflow.
        dataY: list of the key ids that follow each window.
        labels_dict: dictionary for converting string keys to ints.
        x_train: 3-dim array of vectorized windows.
        y_train: one-hot encoded target key ids.

    Returns:
        x_train, y_train, the number of classes and the vocabulary size.

    """

    dataX = []
    dataY = []

    # Both values equal the number of distinct keys; they are kept
    # separate because the model-info file stores them as two entries.
    input_vocab_size = len(train_keys_list)
    classes_count = len(train_keys_list)

    labels_dict = {key: i for i, key in enumerate(train_keys_list)}

    # Each window of h consecutive keys predicts the id that follows it.
    for i in range(0, len(train_log_list) - h):
        temp = train_log_list[i: i + h]
        dataX.append([labels_dict[item] for item in temp])
        dataY.append(int(train_label_ids_list[i + h]))  # ids are numeric strings

    x_train = np.reshape(dataX, (len(dataX), h, 1))
    x_train = x_train / float(input_vocab_size)
    y_train = np_utils.to_categorical(dataY)

    return x_train, y_train, classes_count, input_vocab_size
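# Illustrative sketch (toy data): with h = 3 and a workflow
# ['k1', 'k2', 'k3', 'k4', 'k5'] encoded as [0, 1, 2, 3, 4],
# data_preprocessing builds
#
#   dataX = [[0, 1, 2], [1, 2, 3]]    # inputs, reshaped to (2, 3, 1)
#   dataY = [id of 'k4', id of 'k5']  # next-token targets
#
# i.e. every window of h consecutive keys is paired with the key id
# that immediately follows it.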
def model_def_train_test(epochs, batch_size, x_train, y_train, x_test, y_test,
                         classes_count, input_vocab_size):

    """ Define, train and test the model.

    Defines a Sequential model with two LSTM layers and a softmax Dense
    output layer. Checkpoints improving weights while fitting, saves the
    trained model and its information (shapes, vocabulary size, classes
    count), and finally evaluates the model on the test data. Training
    uses 2/3 of the data; the remaining 1/3 is held out for testing.

    Args:
        epochs: int number of training iterations over the data.
        batch_size: int model batch size.
        x_train: numpy.ndarray, 3-dim array of vectorized training
            windows of keys.
        y_train: numpy.ndarray, one-hot encoded training key ids.
        x_test: held-out part of x for testing.
        y_test: held-out part of y for testing.
        classes_count: number of distinct training keys.
        input_vocab_size: number of distinct training keys.

    """

    model_info_path = 'C:/Users/vgolubch/Desktop/LSTMtest/LSTM_deeplog/model_info.txt'
    model_save_path = 'C:/Users/vgolubch/Desktop/LSTMtest/LSTM_deeplog/model_config.h5'
    weights_path = 'C:/Users/vgolubch/Desktop/LSTMtest/LSTM_deeplog/model_weights/weights-improvement-{epoch:02d}-{loss:.4f}.hdf5'

    model = Sequential()
    model.add(LSTM(64,
                   input_shape=(x_train.shape[1], x_train.shape[2]),
                   return_sequences=True))
    model.add(Dropout(0.5))
    model.add(LSTM(64))
    model.add(Dropout(0.5))
    model.add(Dense(classes_count, activation='softmax'))
    model.compile(loss='categorical_crossentropy',
                  optimizer='adam',
                  metrics=['accuracy'])

    # summary() prints the table itself and returns None.
    model.summary()

    # Save the weights whenever the training loss improves.
    checkpoint = ModelCheckpoint(weights_path, monitor='loss', verbose=1,
                                 save_best_only=True, mode='min')
    callbacks_list = [checkpoint]

    model.fit(x_train, y_train,
              epochs=epochs, batch_size=batch_size, callbacks=callbacks_list)

    model.save(model_save_path)

    # model_info.txt layout: vocabulary size, classes count, window
    # length h, feature dimension.
    with open(model_info_path, 'w') as model_info_file:
        model_info_file.write(str(input_vocab_size) + '\n')
        model_info_file.write(str(classes_count) + '\n')
        model_info_file.write(str(x_train.shape[1]) + '\n')
        model_info_file.write(str(x_train.shape[2]))

    accr = model.evaluate(x_test, y_test)
    print('Test set\nLoss: {:0.3f}\nAccuracy: {:0.3f}'.format(accr[0], accr[1]))
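# Minimal sketch (hypothetical checkpoint file name): instead of
# retraining, the best checkpoint written by ModelCheckpoint can be
# restored into a freshly built model of the same architecture, e.g.
#
#   model.load_weights('model_weights/weights-improvement-03-0.1234.hdf5')
#
# load_weights restores only the weights; the layers and the compile
# step must match the ones used for training.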
def main():

    """
    Sets the neural network parameters and the needed file paths, then
    splits the data into a training set and a testing set.

    Args:
        log_train_path: string path of the training workflow-of-keys
            file.
        label_ids_train_path: string path of the training key-ids
            workflow file.
        keys_train_path: string path of the training list-of-keys file.
        train_log_list: list of strings of the training workflow of keys.
        train_label_ids_list: list of strings of the training key-ids
            workflow.
        train_keys_list: list of strings of the training keys.
        h: int length of the token sequence for training and prediction.
        epochs: int number of training iterations over the data.
        batch_size: int model batch size.
        x_train: numpy.ndarray, 3-dim array of vectorized training
            windows of keys.
        y_train: numpy.ndarray, one-hot encoded training key ids.
        x_test: held-out part of x for testing.
        y_test: held-out part of y for testing.
        classes_count: number of distinct training keys.

    """

    h = 3
    epochs = 3
    batch_size = 1

    log_train_path = 'C:/Users/vgolubch/Desktop/LSTMtest/LSTM_deeplog/Workflow.txt'
    label_ids_train_path = 'C:/Users/vgolubch/Desktop/LSTMtest/LSTM_deeplog/WorkflowID.txt'
    keys_train_path = 'C:/Users/vgolubch/Desktop/LSTMtest/LSTM_deeplog/LogKeys.txt'

    train_log_list, train_label_ids_list, train_keys_list = getting_data(
        log_train_path, label_ids_train_path, keys_train_path)

    x_train, y_train, classes_count, input_vocab_size = data_preprocessing(
        h, train_log_list, train_label_ids_list, train_keys_list)

    x_train, x_test, y_train, y_test = train_test_split(
        x_train, y_train, test_size=0.33, random_state=42)

    model_def_train_test(epochs, batch_size, x_train, y_train, x_test, y_test,
                         classes_count, input_vocab_size)


if __name__ == '__main__':
    main()
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# ClassificationLstm
A classification LSTM RNN implementing the DeepLog approach to log-key
anomaly detection.

Inspired by: https://www.cs.utah.edu/~lifeifei/papers/deeplog.pdf
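## Usage

A minimal sketch of the intended workflow, assuming the hard-coded paths
in both scripts have been adjusted to your environment:

```bash
# 1. Train the model; writes model_config.h5, model_info.txt and checkpoints
python Classification_LSTM_model_def_training.py

# 2. Load the trained model and check the workflow for anomalies
python Classification_LSTM_deeplog.py
```

Both scripts expect three input files: `Workflow.txt` (one log key per
line), `WorkflowID.txt` (one numeric key id per line) and `LogKeys.txt`
(the list of distinct keys, one per line).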
--------------------------------------------------------------------------------