├── Classification_LSTM_deeplog.py
├── Classification_LSTM_model_def_training.py
└── README.md


/Classification_LSTM_deeplog.py:
--------------------------------------------------------------------------------
import numpy as np
from keras.models import load_model

import tkinter as tk
from tkinter import messagebox


def getting_data(log_path, labels_ids_path, keys_path):

    """ Read the input files.

    Reads the three input files and collects their non-empty lines.

    Args:
        log_path: path to the workflow-of-keys file.
        labels_ids_path: path to the workflow-of-key-ids file.
        keys_path: path to the list-of-keys file.

    Returns:
        Three lists of strings, one per input file.

    Raises:
        FileNotFoundError: the file or directory does not exist.
        IsADirectoryError: a directory was given where a file was expected.
        PermissionError: not enough access rights.

    """

    with open(log_path) as log_file:
        log_list = [line.strip('\n') for line in log_file]
    log_list = [line for line in log_list if line]

    with open(labels_ids_path) as label_ids_file:
        label_ids_list = [line.strip('\n') for line in label_ids_file]
    label_ids_list = [line for line in label_ids_list if line]

    with open(keys_path) as keys_file:
        keys_list = [line.strip('\n') for line in keys_file]
    keys_list = [line for line in keys_list if line]

    return log_list, label_ids_list, keys_list


def prediction_and_comparison(model, sequence, labels_ids,
                              labels, h, l, input_vocab_size):

    """ Predict the workflow and compare it with the real one.

    Builds dictionaries for converting keys from string to int and back,
    takes the first h tokens of the sequence as the seed, and then, for
    l steps, predicts the next token and compares it with the real one.
    On a mismatch the user is asked whether the deviation is acceptable;
    accepted alternatives are stored in labels_accordance_dict so the
    same question is not asked again. After each step the seed is
    shifted by one element: the predicted value is appended and the
    oldest element is popped.

    Args:
        model: the trained Keras model (layers, weights, batch size).
        sequence: list of string keys; provides the seed and the ground
            truth for comparison.
        labels_ids: list of strings of the key-ids workflow (kept for
            interface compatibility, currently unused).
        labels: list of the known string keys.
        h: int length of the token sequence used for prediction.
        l: int length of the predicted sequence.
        input_vocab_size: number of distinct keys, used to scale inputs.
        labels_dict: dictionary for converting string keys to ints.
        inverted_labels_dict: dictionary for the inverse conversion.
        labels_accordance_dict: dictionary of accepted deviations
            between predicted and real workflow keys.
        seed: current window of int-encoded keys used as model input.
        res_seq: resulting list of string keys.

    Returns:
        List of strings containing the initial seed followed by the
        predicted values.

    """

    labels_dict = {label: i for i, label in enumerate(labels)}
    inverted_labels_dict = {v: k for k, v in labels_dict.items()}
    labels_accordance_dict = {label: [] for label in labels}

    # Encode the whole sequence, keep the first h tokens as the seed,
    # then decode back so the comparisons below work on string keys.
    sequence = [labels_dict[item] for item in sequence]
    seed = sequence[0: h]
    sequence = [inverted_labels_dict[item] for item in sequence]

    res_seq = [inverted_labels_dict[item] for item in seed]

    # Never read past the end of the real sequence when comparing.
    for i in range(0, min(l, len(sequence) - h)):
        seq = np.reshape(seed, (1, len(seed), 1))
        seq = seq / float(input_vocab_size)
        pr = model.predict(seq, verbose=0)
        index = np.argmax(pr)
        pred = inverted_labels_dict[index]
        if sequence[h + i] != pred:
            if pred not in labels_accordance_dict[sequence[h + i]]:
                user_answer = messagebox.askyesno(
                    "Warning!",
                    f"Anomaly detected, is it okay?\n"
                    f"{sequence[h + i]} == {pred}")
                if user_answer:
                    labels_accordance_dict[sequence[h + i]].append(pred)
                else:
                    messagebox.showerror("Warning!", "Need to fix!!!")
        res_seq.append(pred)
        seed.append(labels_dict[pred])
        seed.pop(0)

    return res_seq
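# Illustrative sketch (hypothetical keys, not data from the repository):
# one step of the sliding-window prediction above, with h = 3.
#
#   seed = ['open', 'read', 'read']   ->  encoded [0, 1, 1]
#   model predicts 'close', while the real next token is 'write'
#   -> mismatch: the user is asked whether the deviation is acceptable
#   seed = ['read', 'read', 'close']  (shifted by one element)
#
# The window always keeps the h most recent tokens, so the cost of one
# prediction step does not grow with the length of the log.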
def main():

    """
    Reads the prediction data and calls the prediction function.

    Args:
        l: int length of the predicted sequence.
        h: int length of the token sequence for training and prediction.
        input_vocab_size: number of distinct training keys.
        pred_log_list: list of strings of the workflow of keys.
        pred_label_ids_list: list of strings of the key-ids workflow.
        pred_keys_list: list of strings of the keys.
        pred_seq: resulting list of predicted string keys.

    """

    l = 15

    log_predict_path = 'C:/Users/vgolubch/Desktop/LSTMtest/LSTM_deeplog/Workflow.txt'
    label_ids_predict_path = 'C:/Users/vgolubch/Desktop/LSTMtest/LSTM_deeplog/WorkflowID.txt'
    keys_predict_path = 'C:/Users/vgolubch/Desktop/LSTMtest/LSTM_deeplog/LogKeys.txt'

    model = load_model('C:/Users/vgolubch/Desktop/LSTMtest/LSTM_deeplog/model_config.h5')

    model_info_path = 'C:/Users/vgolubch/Desktop/LSTMtest/LSTM_deeplog/model_info.txt'

    with open(model_info_path) as model_info_file:
        model_info_list = [line.strip('\n') for line in model_info_file]

    # model_info.txt layout (written by the training script): vocabulary
    # size, classes count, window length h, feature dimension.
    input_vocab_size = int(model_info_list[0])
    h = int(model_info_list[2])

    # Hide the implicit Tk root window so only the message boxes appear.
    root = tk.Tk()
    root.withdraw()

    pred_log_list, pred_label_ids_list, pred_keys_list = getting_data(
        log_predict_path, label_ids_predict_path, keys_predict_path)

    pred_seq = prediction_and_comparison(model, pred_log_list,
        pred_label_ids_list, pred_keys_list, h, l, input_vocab_size)

    print('\n'.join(pred_seq))


if __name__ == "__main__":
    main()
--------------------------------------------------------------------------------
/Classification_LSTM_model_def_training.py:
--------------------------------------------------------------------------------
import numpy as np
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Dropout
from keras.layers import LSTM
from keras.callbacks import ModelCheckpoint
from keras.utils import np_utils

from sklearn.model_selection import train_test_split


def getting_data(log_path, labels_ids_path, keys_path):

    """ Read the input files.

    Reads the three input files and collects their non-empty lines.

    Args:
        log_path: path to the workflow-of-keys file.
        labels_ids_path: path to the workflow-of-key-ids file.
        keys_path: path to the list-of-keys file.

    Returns:
        Three lists of strings, one per input file.

    Raises:
        FileNotFoundError: the file or directory does not exist.
        IsADirectoryError: a directory was given where a file was expected.
        PermissionError: not enough access rights.

    """

    with open(log_path) as log_file:
        log_list = [line.strip('\n') for line in log_file]
    log_list = [line for line in log_list if line]

    with open(labels_ids_path) as label_ids_file:
        label_ids_list = [line.strip('\n') for line in label_ids_file]
    label_ids_list = [line for line in label_ids_list if line]

    with open(keys_path) as keys_file:
        keys_list = [line.strip('\n') for line in keys_file]
    keys_list = [line for line in keys_list if line]

    return log_list, label_ids_list, keys_list


def data_preprocessing(h, train_log_list, train_label_ids_list,
                       train_keys_list):

    """ Preprocess the input data.

    Builds a dictionary mapping string keys to ints, forms sliding
    windows of the int-encoded workflow, and vectorizes them into the
    3-dimensional array expected by the network input.

    Args:
        h: length of the token sequence for training and prediction.
        train_log_list: list of strings of the training workflow of keys.
        train_label_ids_list: list of strings of the training key-ids
            workflow.
        train_keys_list: list of strings of the training keys.
        dataX: list of int-encoded windows of the keys workflow.
        dataY: list of the key ids that follow each window.
        labels_dict: dictionary for converting string keys to ints.
        x_train: 3-dim array of vectorized windows.
        y_train: one-hot encoded target key ids.

    Returns:
        x_train, y_train, the number of classes and the vocabulary size.

    """

    dataX = []
    dataY = []

    # Both values equal the number of distinct keys; they are kept
    # separate because the model-info file stores them as two entries.
    input_vocab_size = len(train_keys_list)
    classes_count = len(train_keys_list)

    labels_dict = {key: i for i, key in enumerate(train_keys_list)}

    # Each window of h consecutive keys predicts the id that follows it.
    for i in range(0, len(train_log_list) - h):
        temp = train_log_list[i: i + h]
        dataX.append([labels_dict[item] for item in temp])
        dataY.append(int(train_label_ids_list[i + h]))  # ids are numeric strings

    x_train = np.reshape(dataX, (len(dataX), h, 1))
    x_train = x_train / float(input_vocab_size)
    y_train = np_utils.to_categorical(dataY)

    return x_train, y_train, classes_count, input_vocab_size
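# Illustrative sketch (toy data): with h = 3 and a workflow
# ['k1', 'k2', 'k3', 'k4', 'k5'] encoded as [0, 1, 2, 3, 4],
# data_preprocessing builds
#
#   dataX = [[0, 1, 2], [1, 2, 3]]    # inputs, reshaped to (2, 3, 1)
#   dataY = [id of 'k4', id of 'k5']  # next-token targets
#
# i.e. every window of h consecutive keys is paired with the key id
# that immediately follows it.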
def model_def_train_test(epochs, batch_size, x_train, y_train, x_test, y_test,
                         classes_count, input_vocab_size):

    """ Define, train and test the model.

    Defines a Sequential model with two LSTM layers and a softmax Dense
    output layer. Checkpoints improving weights while fitting, saves the
    trained model and its information (shapes, vocabulary size, classes
    count), and finally evaluates the model on the test data. Training
    uses 2/3 of the data; the remaining 1/3 is held out for testing.

    Args:
        epochs: int number of training iterations over the data.
        batch_size: int model batch size.
        x_train: numpy.ndarray, 3-dim array of vectorized training
            windows of keys.
        y_train: numpy.ndarray, one-hot encoded training key ids.
        x_test: held-out part of x for testing.
        y_test: held-out part of y for testing.
        classes_count: number of distinct training keys.
        input_vocab_size: number of distinct training keys.

    """

    model_info_path = 'C:/Users/vgolubch/Desktop/LSTMtest/LSTM_deeplog/model_info.txt'
    model_save_path = 'C:/Users/vgolubch/Desktop/LSTMtest/LSTM_deeplog/model_config.h5'
    weights_path = 'C:/Users/vgolubch/Desktop/LSTMtest/LSTM_deeplog/model_weights/weights-improvement-{epoch:02d}-{loss:.4f}.hdf5'

    model = Sequential()
    model.add(LSTM(64,
                   input_shape=(x_train.shape[1], x_train.shape[2]),
                   return_sequences=True))
    model.add(Dropout(0.5))
    model.add(LSTM(64))
    model.add(Dropout(0.5))
    model.add(Dense(classes_count, activation='softmax'))
    model.compile(loss='categorical_crossentropy',
                  optimizer='adam',
                  metrics=['accuracy'])

    # summary() prints the table itself and returns None.
    model.summary()

    # Save the weights whenever the training loss improves.
    checkpoint = ModelCheckpoint(weights_path, monitor='loss', verbose=1,
                                 save_best_only=True, mode='min')
    callbacks_list = [checkpoint]

    model.fit(x_train, y_train,
              epochs=epochs, batch_size=batch_size, callbacks=callbacks_list)

    model.save(model_save_path)

    # model_info.txt layout: vocabulary size, classes count, window
    # length h, feature dimension.
    with open(model_info_path, 'w') as model_info_file:
        model_info_file.write(str(input_vocab_size) + '\n')
        model_info_file.write(str(classes_count) + '\n')
        model_info_file.write(str(x_train.shape[1]) + '\n')
        model_info_file.write(str(x_train.shape[2]))

    accr = model.evaluate(x_test, y_test)
    print('Test set\nLoss: {:0.3f}\nAccuracy: {:0.3f}'.format(accr[0], accr[1]))
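# Minimal sketch (hypothetical checkpoint file name): instead of
# retraining, the best checkpoint written by ModelCheckpoint can be
# restored into a freshly built model of the same architecture, e.g.
#
#   model.load_weights('model_weights/weights-improvement-03-0.1234.hdf5')
#
# load_weights restores only the weights; the layers and the compile
# step must match the ones used for training.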
def main():

    """
    Sets the neural network parameters and the needed file paths, then
    splits the data into a training set and a testing set.

    Args:
        log_train_path: string path of the training workflow-of-keys
            file.
        label_ids_train_path: string path of the training key-ids
            workflow file.
        keys_train_path: string path of the training list-of-keys file.
        train_log_list: list of strings of the training workflow of keys.
        train_label_ids_list: list of strings of the training key-ids
            workflow.
        train_keys_list: list of strings of the training keys.
        h: int length of the token sequence for training and prediction.
        epochs: int number of training iterations over the data.
        batch_size: int model batch size.
        x_train: numpy.ndarray, 3-dim array of vectorized training
            windows of keys.
        y_train: numpy.ndarray, one-hot encoded training key ids.
        x_test: held-out part of x for testing.
        y_test: held-out part of y for testing.
        classes_count: number of distinct training keys.

    """

    h = 3
    epochs = 3
    batch_size = 1

    log_train_path = 'C:/Users/vgolubch/Desktop/LSTMtest/LSTM_deeplog/Workflow.txt'
    label_ids_train_path = 'C:/Users/vgolubch/Desktop/LSTMtest/LSTM_deeplog/WorkflowID.txt'
    keys_train_path = 'C:/Users/vgolubch/Desktop/LSTMtest/LSTM_deeplog/LogKeys.txt'

    train_log_list, train_label_ids_list, train_keys_list = getting_data(
        log_train_path, label_ids_train_path, keys_train_path)

    x_train, y_train, classes_count, input_vocab_size = data_preprocessing(
        h, train_log_list, train_label_ids_list, train_keys_list)

    x_train, x_test, y_train, y_test = train_test_split(
        x_train, y_train, test_size=0.33, random_state=42)

    model_def_train_test(epochs, batch_size, x_train, y_train, x_test, y_test,
                         classes_count, input_vocab_size)


if __name__ == '__main__':
    main()
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# ClassificationLstm
A classification LSTM RNN implementing the DeepLog approach to log-key
anomaly detection.

Inspired by: https://www.cs.utah.edu/~lifeifei/papers/deeplog.pdf
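## Usage

A minimal sketch of the intended workflow, assuming the hard-coded paths
in both scripts have been adjusted to your environment:

```bash
# 1. Train the model; writes model_config.h5, model_info.txt and checkpoints
python Classification_LSTM_model_def_training.py

# 2. Load the trained model and check the workflow for anomalies
python Classification_LSTM_deeplog.py
```

Both scripts expect three input files: `Workflow.txt` (one log key per
line), `WorkflowID.txt` (one numeric key id per line) and `LogKeys.txt`
(the list of distinct keys, one per line).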
--------------------------------------------------------------------------------