├── .gitignore
├── Attention.py
├── Drain.py
├── README.md
├── dataloader.py
├── log_data
│   └── HDFS_2k.log
├── parse_log.py
├── parse_result
│   ├── HDFS_2k.log_structured.csv
│   └── HDFS_2k.log_templates.csv
├── preprocessing.py
├── test.ipynb
└── train.py

/.gitignore:
--------------------------------------------------------------------------------
1 | playground.ipynb
2 | *.ipynb_checkpoints
3 | .vscode
4 | __pycache__
5 | 
6 | HDFS.log*
7 | anomaly_label.csv
8 | crawl-300d-2M.vec
9 | embedding_table.pkl
10 | counter_idf.pkl
11 | *.npy
12 | *.npz
13 | *.h5
--------------------------------------------------------------------------------
/Attention.py:
--------------------------------------------------------------------------------
1 | 
2 | from keras import backend as K, initializers, regularizers, constraints
3 | from keras.engine.topology import Layer
4 | 
5 | 
6 | def dot_product(x, kernel):
7 |     """
8 |     Wrapper for dot product operation, in order to be compatible with both
9 |     Theano and Tensorflow
10 |     Args:
11 |         x (): input
12 |         kernel (): weights
13 |     Returns:
14 |     """
15 |     if K.backend() == 'tensorflow':
16 |         # todo: check that this is correct
17 |         return K.squeeze(K.dot(x, K.expand_dims(kernel)), axis=-1)
18 |     else:
19 |         return K.dot(x, kernel)
20 | 
21 | 
22 | class Attention(Layer):
23 |     def __init__(self,
24 |                  W_regularizer=None, b_regularizer=None,
25 |                  W_constraint=None, b_constraint=None,
26 |                  bias=True,
27 |                  return_attention=False,
28 |                  **kwargs):
29 |         """
30 |         Keras Layer that implements an Attention mechanism for temporal data.
31 |         Supports Masking.
32 |         Follows the work of Raffel et al. [https://arxiv.org/abs/1512.08756]
33 |         # Input shape
34 |             3D tensor with shape: `(samples, steps, features)`.
35 |         # Output shape
36 |             2D tensor with shape: `(samples, features)`.
37 |         :param kwargs:
38 |         Just put it on top of an RNN Layer (GRU/LSTM/SimpleRNN) with return_sequences=True.
39 |         The dimensions are inferred based on the output shape of the RNN.
40 | 
41 | 
42 |         Note: The layer has been tested with Keras 1.x
43 | 
44 |         Example:
45 | 
46 |             # 1
47 |             model.add(LSTM(64, return_sequences=True))
48 |             model.add(Attention())
49 |             # next add a Dense layer (for classification/regression) or whatever...
50 | 
51 |             # 2 - Get the attention scores
52 |             hidden = LSTM(64, return_sequences=True)(words)
53 |             sentence, word_scores = Attention(return_attention=True)(hidden)
54 | 
55 |         """
56 |         self.supports_masking = True
57 |         self.return_attention = return_attention
58 |         self.init = initializers.get('glorot_uniform')
59 | 
60 |         self.W_regularizer = regularizers.get(W_regularizer)
61 |         self.b_regularizer = regularizers.get(b_regularizer)
62 | 
63 |         self.W_constraint = constraints.get(W_constraint)
64 |         self.b_constraint = constraints.get(b_constraint)
65 | 
66 |         self.bias = bias
67 |         super(Attention, self).__init__(**kwargs)
68 | 
69 |     def get_config(self):
70 |         config = {
71 |             'return_attention': self.return_attention,
72 |             'W_regularizer': regularizers.serialize(self.W_regularizer),
73 |             'b_regularizer': regularizers.serialize(self.b_regularizer),
74 |             'W_constraint': constraints.serialize(self.W_constraint),
75 |             'b_constraint': constraints.serialize(self.b_constraint),
76 |             'bias': self.bias
77 |         }
78 | 
79 |         base_config = super(Attention, self).get_config()
80 |         return dict(list(base_config.items()) + list(config.items()))
81 | 
82 |     def build(self, input_shape):
83 |         assert len(input_shape) == 3
84 | 
85 |         self.W = self.add_weight(shape=(input_shape[-1],),
86 |                                  initializer=self.init,
87 |                                  name='{}_W'.format(self.name),
88 |                                  regularizer=self.W_regularizer,
89 |                                  constraint=self.W_constraint)
90 |         if self.bias:
91 |             self.b = self.add_weight(shape=(input_shape[1],),
92 |                                      initializer='zero',
93 |                                      name='{}_b'.format(self.name),
94 |                                      regularizer=self.b_regularizer,
95 |                                      constraint=self.b_constraint)
96 |         else:
97 |             self.b = None
98 | 
99 |         self.built = True
100 | 
101 |     def compute_mask(self, input, input_mask=None):
102 |         # do not pass the mask to the next layers
103 |         return None
104 | 
105 |     def call(self, x, mask=None):
106 |         eij = dot_product(x, self.W)
107 | 
108 |         if self.bias:
109 |             eij += self.b
110 | 
111 |         eij = K.tanh(eij)
112 | 
113 |         a = K.exp(eij)
114 | 
115 |         # apply mask after the exp. will be re-normalized next
116 |         if mask is not None:
117 |             # Cast the mask to floatX to avoid float64 upcasting in theano
118 |             a *= K.cast(mask, K.floatx())
119 | 
120 |         # in some cases especially in the early stages of training the sum may be almost zero
121 |         # and this results in NaN's. A workaround is to add a very small positive number ε to the sum.
122 |         # a /= K.cast(K.sum(a, axis=1, keepdims=True), K.floatx())
123 |         a /= K.cast(K.sum(a, axis=1, keepdims=True) + K.epsilon(), K.floatx())
124 | 
125 |         weighted_input = x * K.expand_dims(a)
126 | 
127 |         result = K.sum(weighted_input, axis=1)
128 | 
129 |         if self.return_attention:
130 |             return [result, a]
131 |         return result
132 | 
133 |     def compute_output_shape(self, input_shape):
134 |         if self.return_attention:
135 |             return [(input_shape[0], input_shape[-1]),
136 |                     (input_shape[0], input_shape[1])]
137 |         else:
138 |             return input_shape[0], input_shape[-1]
139 | 
--------------------------------------------------------------------------------
/Drain.py:
--------------------------------------------------------------------------------
1 | """
2 | Description : This file implements the Drain algorithm for log parsing
3 | Author      : LogPAI team
4 | License     : MIT
5 | """
6 | 
7 | import regex as re
8 | import os
9 | import numpy as np
10 | import pandas as pd
11 | import hashlib
12 | from datetime import datetime
13 | 
14 | 
15 | class Logcluster:
16 |     def __init__(self, logTemplate='', logIDL=None):
17 |         self.logTemplate = logTemplate
18 |         if logIDL is None:
19 |             logIDL = []
20 |         self.logIDL = logIDL
21 | 
22 | 
23 | class Node:
24 |     def __init__(self, childD=None, depth=0, digitOrtoken=None):
25 |         if childD is None:
26 |             childD = dict()
27 |         self.childD = childD
28 |         self.depth = depth
29 |         self.digitOrtoken = digitOrtoken
30 | 
31 | 
32 | class LogParser:
33 |     def __init__(self, log_format, indir='./', outdir='./result/', depth=4, st=0.4,
34 |                  maxChild=100, rex=[], keep_para=True):
35 |         """
36 |         Attributes
37 |         ----------
38 |             rex : regular expressions used in preprocessing (step1)
39 |             path : the input directory that stores the input log file
40 |             depth : depth of all leaf nodes
41 |             st : similarity threshold
42 |             maxChild : max number of children of an internal node
43 |             logName : the name of the input file containing raw log messages
44 |             savePath : the output directory that stores the structured log files
45 |         """
46 |         self.path = indir
47 |         self.depth = depth - 2
48 |         self.st = st
49 |         self.maxChild = maxChild
50 |         self.logName = None
51 |         self.savePath = outdir
52 |         self.df_log = None
53 |         self.log_format = log_format
54 |         self.rex = rex
55 |         self.keep_para = keep_para
56 | 
57 |     def hasNumbers(self, s):
58 |         return any(char.isdigit() for char in s)
59 | 
60 |     def treeSearch(self, rn, seq):
61 |         retLogClust = None
62 | 
63 |         seqLen = len(seq)
64 |         if seqLen not in rn.childD:
65 |             return retLogClust
66 | 
67 |         parentn = rn.childD[seqLen]
68 | 
69 |         currentDepth = 1
70 |         for token in seq:
71 |             if currentDepth >= self.depth or currentDepth > seqLen:
72 |                 break
73 | 
74 |             if token in parentn.childD:
75 |                 parentn = parentn.childD[token]
76 |             elif '<*>' in parentn.childD:
77 |                 parentn = parentn.childD['<*>']
78 |             else:
79 |                 return retLogClust
80 |             currentDepth += 1
81 | 
82 |         logClustL = parentn.childD
83 | 
84 |         retLogClust = self.fastMatch(logClustL, seq)
85 | 
86 |         return retLogClust
87 | 
88 |     def addSeqToPrefixTree(self, rn, logClust):
89 |         seqLen = len(logClust.logTemplate)
90 |         if seqLen not in rn.childD:
91 |             firtLayerNode = Node(depth=1, digitOrtoken=seqLen)
92 |             rn.childD[seqLen] = firtLayerNode
93 |         else:
94 |             firtLayerNode = rn.childD[seqLen]
95 | 
96 |         parentn = firtLayerNode
97 | 
98 |         currentDepth = 1
99 |         for token in logClust.logTemplate:
100 | 
101 |             #Add current log cluster to the leaf node
102 |             if currentDepth >= self.depth or currentDepth > seqLen:
103 |                 if len(parentn.childD) == 0:
104 |                     parentn.childD = [logClust]
105 |                 else:
106 |                     parentn.childD.append(logClust)
107 |                 break
108 | 
109 |             #If token not matched in this layer of existing tree.
110 |             if token not in parentn.childD:
111 |                 if not self.hasNumbers(token):
112 |                     if '<*>' in parentn.childD:
113 |                         if len(parentn.childD) < self.maxChild:
114 |                             newNode = Node(depth=currentDepth + 1, digitOrtoken=token)
115 |                             parentn.childD[token] = newNode
116 |                             parentn = newNode
117 |                         else:
118 |                             parentn = parentn.childD['<*>']
119 |                     else:
120 |                         if len(parentn.childD)+1 < self.maxChild:
121 |                             newNode = Node(depth=currentDepth+1, digitOrtoken=token)
122 |                             parentn.childD[token] = newNode
123 |                             parentn = newNode
124 |                         elif len(parentn.childD)+1 == self.maxChild:
125 |                             newNode = Node(depth=currentDepth+1, digitOrtoken='<*>')
126 |                             parentn.childD['<*>'] = newNode
127 |                             parentn = newNode
128 |                         else:
129 |                             parentn = parentn.childD['<*>']
130 | 
131 |                 else:
132 |                     if '<*>' not in parentn.childD:
133 |                         newNode = Node(depth=currentDepth+1, digitOrtoken='<*>')
134 |                         parentn.childD['<*>'] = newNode
135 |                         parentn = newNode
136 |                     else:
137 |                         parentn = parentn.childD['<*>']
138 | 
139 |             #If the token is matched
140 |             else:
141 |                 parentn = parentn.childD[token]
142 | 
143 |             currentDepth += 1
144 | 
145 |     #seq1 is template
146 |     def seqDist(self, seq1, seq2):
147 |         assert len(seq1) == len(seq2)
148 |         simTokens = 0
149 |         numOfPar = 0
150 | 
151 |         for token1, token2 in zip(seq1, seq2):
152 |             if token1 == '<*>':
153 |                 numOfPar += 1
154 |                 continue
155 |             if token1 == token2:
156 |                 simTokens += 1
157 | 
158 |         retVal = float(simTokens) / len(seq1)
159 | 
160 |         return retVal, numOfPar
161 | 
162 | 
163 |     def fastMatch(self, logClustL, seq):
164 |         retLogClust = None
165 | 
166 |         maxSim = -1
167 |         maxNumOfPara = -1
168 |         maxClust = None
169 | 
170 |         for logClust in logClustL:
171 |             curSim, curNumOfPara = self.seqDist(logClust.logTemplate, seq)
172 |             if curSim>maxSim or (curSim==maxSim and curNumOfPara>maxNumOfPara):
173 |                 maxSim = curSim
174 |                 maxNumOfPara = curNumOfPara
175 |                 maxClust = logClust
176 | 
177 |         if maxSim >= self.st:
178 |             retLogClust = maxClust
179 | 
180 |         return retLogClust
181 | 
182 |     def getTemplate(self, seq1, seq2):
183 |         assert len(seq1) == len(seq2)
184 |         retVal = []
185 | 
186 |         i = 0
187 |         for word in seq1:
188 |             if word == seq2[i]:
189 |                 retVal.append(word)
190 |             else:
191 |                 retVal.append('<*>')
192 | 
193 |             i += 1
194 | 
195 |         return retVal
196 | 
197 |     def outputResult(self, logClustL):
198 |         log_templates = [0] * self.df_log.shape[0]
199 |         log_templateids = [0] * self.df_log.shape[0]
200 |         df_events = []
201 |         for logClust in logClustL:
202 |             template_str = ' '.join(logClust.logTemplate)
203 |             occurrence = len(logClust.logIDL)
204 |             template_id = hashlib.md5(template_str.encode('utf-8')).hexdigest()[0:8]
205 |             for logID in logClust.logIDL:
206 |                 logID -= 1
207 |                 log_templates[logID] = template_str
208 |                 log_templateids[logID] = template_id
209 |             df_events.append([template_id, template_str, occurrence])
210 | 
211 |         df_event = pd.DataFrame(df_events, columns=['EventId', 'EventTemplate', 'Occurrences'])
212 |         self.df_log['EventId'] = log_templateids
213 |         self.df_log['EventTemplate'] = log_templates
214 | 
215 |         if self.keep_para:
216 |             self.df_log["ParameterList"] = self.df_log.apply(self.get_parameter_list, axis=1)
217 |         self.df_log.to_csv(os.path.join(self.savePath, self.logName + '_structured.csv'), index=False)
218 | 
219 | 
220 |         occ_dict = dict(self.df_log['EventTemplate'].value_counts())
221 |         df_event = pd.DataFrame()
222 |         df_event['EventTemplate'] = self.df_log['EventTemplate'].unique()
223 |         df_event['EventId'] = df_event['EventTemplate'].map(lambda x: hashlib.md5(x.encode('utf-8')).hexdigest()[0:8])
224 |         df_event['Occurrences'] = df_event['EventTemplate'].map(occ_dict)
225 |         df_event.to_csv(os.path.join(self.savePath, self.logName + '_templates.csv'), index=False, columns=["EventId", "EventTemplate", "Occurrences"])
226 | 
227 | 
228 |     def printTree(self, node, dep):
229 |         pStr = ''
230 |         for i in range(dep):
231 |             pStr += '\t'
232 | 
233 |         if node.depth == 0:
234 |             pStr += 'Root'
235 |         elif node.depth == 1:
236 |             pStr += '<' + str(node.digitOrtoken) + '>'
237 |         else:
238 |             pStr += node.digitOrtoken
239 | 
240 |         print(pStr)
241 | 
242 |         if node.depth == self.depth:
243 |             return 1
244 |         for child in node.childD:
245 |             self.printTree(node.childD[child], dep+1)
246 | 
247 | 
248 |     def parse(self, logName):
249 |         print('Parsing file: ' + os.path.join(self.path, logName))
250 |         start_time = datetime.now()
251 |         self.logName = logName
252 |         rootNode = Node()
253 |         logCluL = []
254 | 
255 |         self.load_data()
256 | 
257 |         count = 0
258 |         for idx, line in self.df_log.iterrows():
259 |             logID = line['LineId']
260 |             logmessageL = self.preprocess(line['Content']).strip().split()
261 |             # logmessageL = filter(lambda x: x != '', re.split('[\s=:,]', self.preprocess(line['Content'])))
262 |             matchCluster = self.treeSearch(rootNode, logmessageL)
263 | 
264 |             #Match no existing log cluster
265 |             if matchCluster is None:
266 |                 newCluster = Logcluster(logTemplate=logmessageL, logIDL=[logID])
267 |                 logCluL.append(newCluster)
268 |                 self.addSeqToPrefixTree(rootNode, newCluster)
269 | 
270 |             #Add the new log message to the existing cluster
271 |             else:
272 |                 newTemplate = self.getTemplate(logmessageL, matchCluster.logTemplate)
273 |                 matchCluster.logIDL.append(logID)
274 |                 if ' '.join(newTemplate) != ' '.join(matchCluster.logTemplate):
275 |                     matchCluster.logTemplate = newTemplate
276 | 
277 |             count += 1
278 |             if count % 1000 == 0 or count == len(self.df_log):
279 |                 print('Processed {0:.1f}% of log lines.'.format(count * 100.0 / len(self.df_log)))
280 | 
281 | 
282 |         if not os.path.exists(self.savePath):
283 |             os.makedirs(self.savePath)
284 | 
285 |         self.outputResult(logCluL)
286 | 
287 |         print('Parsing done. [Time taken: {!s}]'.format(datetime.now() - start_time))
288 | 
289 |     def load_data(self):
290 |         headers, regex = self.generate_logformat_regex(self.log_format)
291 |         self.df_log = self.log_to_dataframe(os.path.join(self.path, self.logName), regex, headers, self.log_format)
292 | 
293 |     def preprocess(self, line):
294 |         for currentRex in self.rex:
295 |             line = re.sub(currentRex, '<*>', line)
296 |         return line
297 | 
298 |     def log_to_dataframe(self, log_file, regex, headers, logformat):
299 |         """ Function to transform log file to dataframe
300 |         """
301 |         log_messages = []
302 |         linecount = 0
303 |         with open(log_file, 'r') as fin:
304 |             for line in fin.readlines():
305 |                 try:
306 |                     match = regex.search(line.strip())
307 |                     message = [match.group(header) for header in headers]
308 |                     log_messages.append(message)
309 |                     linecount += 1
310 |                 except Exception as e:
311 |                     pass
312 |         logdf = pd.DataFrame(log_messages, columns=headers)
313 |         logdf.insert(0, 'LineId', None)
314 |         logdf['LineId'] = [i + 1 for i in range(linecount)]
315 |         return logdf
316 | 
317 | 
318 |     def generate_logformat_regex(self, logformat):
319 |         """ Function to generate regular expression to split log messages
320 |         """
321 |         headers = []
322 |         splitters = re.split(r'(<[^<>]+>)', logformat)
323 |         regex = ''
324 |         for k in range(len(splitters)):
325 |             if k % 2 == 0:
326 |                 splitter = re.sub(' +', '\\\s+', splitters[k])
327 |                 regex += splitter
328 |             else:
329 |                 header = splitters[k].strip('<').strip('>')
330 |                 regex += '(?P<%s>.*?)' % header
331 |                 headers.append(header)
332 |         regex = re.compile('^' + regex + '$')
333 |         return headers, regex
334 | 
335 |     def get_parameter_list(self, row):
336 |         template_regex = re.sub(r"<.{1,5}>", "<*>", row["EventTemplate"])
337 |         if "<*>" not in template_regex: return []
338 |         template_regex = re.sub(r'([^A-Za-z0-9])', r'\\\1', template_regex)
339 |         template_regex = re.sub(r'\\ +', r'\s+', template_regex)
340 |         template_regex = "^" + template_regex.replace("\<\*\>", "(.*?)") + "$"
341 |         parameter_list = re.findall(template_regex, row["Content"])
342 |         parameter_list = parameter_list[0] if parameter_list else ()
343 |         parameter_list = list(parameter_list) if isinstance(parameter_list, tuple) else [parameter_list]
344 |         return parameter_list
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Log-based Anomaly Detection System
2 | The final project of Deep Learning and Practice (summer 2020) at NCTU.
3 | 
4 | The main procedures of this system are as follows:
5 | 1. Adopts Drain to parse log messages and extract log events (templates).
6 | 2. Extracts semantic information from log events and represents them as semantic vectors using Sentence-BERT.
7 | 3. Detects anomalies with an attention-based Bi-LSTM model, which captures the contextual
8 |    information in the log sequences and automatically learns the importance of different log events (a minimal model sketch is given below).
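
The sketch below illustrates how such a model can be assembled from the pieces in this repository (the `Attention` layer and the 768-dimensional Sentence-BERT embeddings produced by `dataloader.py`). It is a minimal sketch only: the hidden size, `MAX_LEN`, and optimizer are assumptions, not necessarily the settings used in `train.py`.

```python
# Illustrative sketch only: the hidden size, MAX_LEN, and optimizer are
# assumptions, not necessarily the configuration used by train.py.
from keras.models import Model
from keras.layers import Input, Bidirectional, LSTM, Dense
from Attention import Attention

EMBEDDING_DIM = 768   # Sentence-BERT vector size (see dataloader.py)
MAX_LEN = 50          # assumed fixed number of log events per HDFS block

# Input: a padded sequence of log-event embeddings for one block.
inputs = Input(shape=(MAX_LEN, EMBEDDING_DIM))
# Bi-LSTM encodes each log event with its surrounding context.
hidden = Bidirectional(LSTM(128, return_sequences=True))(inputs)
# Attention produces a learned weighted sum over the log events.
context = Attention()(hidden)
# Sigmoid output gives the anomaly probability of the block.
outputs = Dense(1, activation='sigmoid')(context)

model = Model(inputs, outputs)
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
```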
9 | 
10 | We evaluated the system on the public HDFS dataset; it achieves a recall of 0.9977,
11 | a precision of 0.9691, and an F1-score of 0.9832.
12 | 
13 | ## Preprocessing Order
14 | * Unzip HDFS_1.tar.gz into log_data/
15 | * Run parse_log.py
16 | * Run preprocessing.py
17 | 
18 | ## Data Description
19 | * total: 575,061 blocks
20 |     - 16,838 anomaly blocks
21 |     - 558,223 normal blocks
22 | * training: randomly selected
23 |     - 6,000 anomaly blocks
24 |     - 6,000 normal blocks
25 | * testing: the remaining data
26 | 
27 | ## Reference
28 | * [https://dl.acm.org/doi/10.1145/3338906.3338931](https://dl.acm.org/doi/10.1145/3338906.3338931)
29 | * [https://ieeexplore.ieee.org/document/8029742](https://ieeexplore.ieee.org/document/8029742)
30 | * [https://arxiv.org/abs/1908.10084](https://arxiv.org/abs/1908.10084)
31 | * [https://github.com/logpai/loghub](https://github.com/logpai/loghub)
32 | 
--------------------------------------------------------------------------------
/dataloader.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import keras
3 | from keras.preprocessing.sequence import pad_sequences
4 | import math
5 | 
6 | 
7 | EMBEDDING_DIM = 768
8 | 
9 | 
10 | class DataGenerator(keras.utils.Sequence):
11 |     def __init__(self, x, y, batch_size):
12 |         'Initialization'
13 |         self.x = x
14 |         self.y = y
15 |         self.batch_size = batch_size
16 | 
17 |     def __len__(self):
18 |         'Denotes the number of batches'
19 |         return math.ceil(len(self.x) / self.batch_size)
20 | 
21 |     def __getitem__(self, index):
22 |         'Generate one batch of data'
23 |         x = self.x[index * self.batch_size:(index + 1) * self.batch_size]
24 |         y = self.y[index * self.batch_size:(index + 1) * self.batch_size]
25 | 
26 |         x = pad_sequences(x, dtype='object', padding='post',
27 |                           value=np.zeros(EMBEDDING_DIM)).astype(np.float32)
28 | 
29 |         return x, y
30 | 
--------------------------------------------------------------------------------
/parse_log.py:
--------------------------------------------------------------------------------
1 | import Drain
2 | 
3 | input_dir = 'log_data/'       # The input directory of the log file
4 | output_dir = 'parse_result/'  # The output directory of the parsing results
5 | log_file = 'HDFS.log'         # The input log file name
6 | log_format = '