├── .gitignore
├── Attention.py
├── Drain.py
├── README.md
├── dataloader.py
├── log_data
│   └── HDFS_2k.log
├── parse_log.py
├── parse_result
│   ├── HDFS_2k.log_structured.csv
│   └── HDFS_2k.log_templates.csv
├── preprocessing.py
├── test.ipynb
└── train.py

/.gitignore:
--------------------------------------------------------------------------------
1 | playground.ipynb
2 | *.ipynb_checkpoints
3 | .vscode
4 | __pycache__
5 | 
6 | HDFS.log*
7 | anomaly_label.csv
8 | crawl-300d-2M.vec
9 | embedding_table.pkl
10 | counter_idf.pkl
11 | *.npy
12 | *.npz
13 | *.h5
--------------------------------------------------------------------------------
/Attention.py:
--------------------------------------------------------------------------------
1 | 
2 | from keras import backend as K, initializers, regularizers, constraints
3 | from keras.engine.topology import Layer
4 | 
5 | 
6 | def dot_product(x, kernel):
7 |     """
8 |     Wrapper for dot product operation, in order to be compatible with both
9 |     Theano and Tensorflow
10 |     Args:
11 |         x (): input
12 |         kernel (): weights
13 |     Returns:
14 |     """
15 |     if K.backend() == 'tensorflow':
16 |         # todo: check that this is correct
17 |         return K.squeeze(K.dot(x, K.expand_dims(kernel)), axis=-1)
18 |     else:
19 |         return K.dot(x, kernel)
20 | 
21 | 
22 | class Attention(Layer):
23 |     def __init__(self,
24 |                  W_regularizer=None, b_regularizer=None,
25 |                  W_constraint=None, b_constraint=None,
26 |                  bias=True,
27 |                  return_attention=False,
28 |                  **kwargs):
29 |         """
30 |         Keras Layer that implements an Attention mechanism for temporal data.
31 |         Supports Masking.
32 |         Follows the work of Raffel et al. [https://arxiv.org/abs/1512.08756]
33 |         # Input shape
34 |             3D tensor with shape: `(samples, steps, features)`.
35 |         # Output shape
36 |             2D tensor with shape: `(samples, features)`.
37 |         :param kwargs:
38 |         Just put it on top of an RNN Layer (GRU/LSTM/SimpleRNN) with return_sequences=True.
39 |         The dimensions are inferred based on the output shape of the RNN.
40 | 
41 | 
42 |         Note: The layer has been tested with Keras 1.x
43 | 
44 |         Example:
45 | 
46 |             # 1
47 |             model.add(LSTM(64, return_sequences=True))
48 |             model.add(Attention())
49 |             # next add a Dense layer (for classification/regression) or whatever...
50 | 
51 |             # 2 - Get the attention scores
52 |             hidden = LSTM(64, return_sequences=True)(words)
53 |             sentence, word_scores = Attention(return_attention=True)(hidden)
54 | 
55 |         """
56 |         self.supports_masking = True
57 |         self.return_attention = return_attention
58 |         self.init = initializers.get('glorot_uniform')
59 | 
60 |         self.W_regularizer = regularizers.get(W_regularizer)
61 |         self.b_regularizer = regularizers.get(b_regularizer)
62 | 
63 |         self.W_constraint = constraints.get(W_constraint)
64 |         self.b_constraint = constraints.get(b_constraint)
65 | 
66 |         self.bias = bias
67 |         super(Attention, self).__init__(**kwargs)
68 | 
69 |     def get_config(self):
70 |         config = {
71 |             'return_attention': self.return_attention,
72 |             'W_regularizer': regularizers.serialize(self.W_regularizer),
73 |             'b_regularizer': regularizers.serialize(self.b_regularizer),
74 |             'W_constraint': constraints.serialize(self.W_constraint),
75 |             'b_constraint': constraints.serialize(self.b_constraint),
76 |             'bias': self.bias
77 |         }
78 | 
79 |         base_config = super(Attention, self).get_config()
80 |         return dict(list(base_config.items()) + list(config.items()))
81 | 
82 |     def build(self, input_shape):
83 |         assert len(input_shape) == 3
84 | 
85 |         self.W = self.add_weight(shape=(input_shape[-1],),
86 |                                  initializer=self.init,
87 |                                  name='{}_W'.format(self.name),
88 |                                  regularizer=self.W_regularizer,
89 |                                  constraint=self.W_constraint)
90 |         if self.bias:
91 |             self.b = self.add_weight(shape=(input_shape[1],),
92 |                                      initializer='zero',
93 |                                      name='{}_b'.format(self.name),
94 |                                      regularizer=self.b_regularizer,
95 |                                      constraint=self.b_constraint)
96 |         else:
97 |             self.b = None
98 | 
99 |         self.built = True
100 | 
101 |     def compute_mask(self, input, input_mask=None):
102 |         # do not pass the mask to the next layers
103 |         return None
104 | 
105 |     def call(self, x, mask=None):
106 |         eij = dot_product(x, self.W)
107 | 
108 |         if self.bias:
109 |             eij += self.b
110 | 
111 |         eij = K.tanh(eij)
112 | 
113 |         a = K.exp(eij)
114 | 
115 |         # apply mask after the exp. will be re-normalized next
116 |         if mask is not None:
117 |             # Cast the mask to floatX to avoid float64 upcasting in theano
118 |             a *= K.cast(mask, K.floatx())
119 | 
120 |         # in some cases especially in the early stages of training the sum may be almost zero
121 |         # and this results in NaN's. A workaround is to add a very small positive number ε to the sum.
122 |         # a /= K.cast(K.sum(a, axis=1, keepdims=True), K.floatx())
123 |         a /= K.cast(K.sum(a, axis=1, keepdims=True) + K.epsilon(), K.floatx())
124 | 
125 |         weighted_input = x * K.expand_dims(a)
126 | 
127 |         result = K.sum(weighted_input, axis=1)
128 | 
129 |         if self.return_attention:
130 |             return [result, a]
131 |         return result
132 | 
133 |     def compute_output_shape(self, input_shape):
134 |         if self.return_attention:
135 |             return [(input_shape[0], input_shape[-1]),
136 |                     (input_shape[0], input_shape[1])]
137 |         else:
138 |             return input_shape[0], input_shape[-1]
139 | 
--------------------------------------------------------------------------------
/Drain.py:
--------------------------------------------------------------------------------
1 | """
2 | Description : This file implements the Drain algorithm for log parsing
3 | Author      : LogPAI team
4 | License     : MIT
5 | """
6 | 
7 | import regex as re
8 | import os
9 | import numpy as np
10 | import pandas as pd
11 | import hashlib
12 | from datetime import datetime
13 | 
14 | 
15 | class Logcluster:
16 |     def __init__(self, logTemplate='', logIDL=None):
17 |         self.logTemplate = logTemplate
18 |         if logIDL is None:
19 |             logIDL = []
20 |         self.logIDL = logIDL
21 | 
22 | 
23 | class Node:
24 |     def __init__(self, childD=None, depth=0, digitOrtoken=None):
25 |         if childD is None:
26 |             childD = dict()
27 |         self.childD = childD
28 |         self.depth = depth
29 |         self.digitOrtoken = digitOrtoken
30 | 
31 | 
32 | class LogParser:
33 |     def __init__(self, log_format, indir='./', outdir='./result/', depth=4, st=0.4,
34 |                  maxChild=100, rex=[], keep_para=True):
35 |         """
36 |         Attributes
37 |         ----------
38 |             rex : regular expressions used in preprocessing (step1)
39 |             path : the input directory that stores the input log file
40 |             depth : depth of all leaf nodes
41 |             st : similarity threshold
42 |             maxChild : max number of children of an internal node
43 |             logName : the name of the input file containing raw log messages
44 |             savePath : the output directory that stores the structured log files
45 |         """
46 |         self.path = indir
47 |         self.depth = depth - 2
48 |         self.st = st
49 |         self.maxChild = maxChild
50 |         self.logName = None
51 |         self.savePath = outdir
52 |         self.df_log = None
53 |         self.log_format = log_format
54 |         self.rex = rex
55 |         self.keep_para = keep_para
56 | 
57 |     def hasNumbers(self, s):
58 |         return any(char.isdigit() for char in s)
59 | 
60 |     def treeSearch(self, rn, seq):
61 |         retLogClust = None
62 | 
63 |         seqLen = len(seq)
64 |         if seqLen not in rn.childD:
65 |             return retLogClust
66 | 
67 |         parentn = rn.childD[seqLen]
68 | 
69 |         currentDepth = 1
70 |         for token in seq:
71 |             if currentDepth >= self.depth or currentDepth > seqLen:
72 |                 break
73 | 
74 |             if token in parentn.childD:
75 |                 parentn = parentn.childD[token]
76 |             elif '<*>' in parentn.childD:
77 |                 parentn = parentn.childD['<*>']
78 |             else:
79 |                 return retLogClust
80 |             currentDepth += 1
81 | 
82 |         logClustL = parentn.childD
83 | 
84 |         retLogClust = self.fastMatch(logClustL, seq)
85 | 
86 |         return retLogClust
87 | 
88 |     def addSeqToPrefixTree(self, rn, logClust):
89 |         seqLen = len(logClust.logTemplate)
90 |         if seqLen not in rn.childD:
91 |             firtLayerNode = Node(depth=1, digitOrtoken=seqLen)
92 |             rn.childD[seqLen] = firtLayerNode
93 |         else:
94 |             firtLayerNode = rn.childD[seqLen]
95 | 
96 |         parentn = firtLayerNode
97 | 
98 |         currentDepth = 1
99 |         for token in logClust.logTemplate:
100 | 
101 |             #Add current log cluster to the leaf node
102 |             if currentDepth >= self.depth or currentDepth > seqLen:
103 |                 if len(parentn.childD) == 0:
104 |                     parentn.childD = [logClust]
105 |                 else:
106 |                     parentn.childD.append(logClust)
107 |                 break
108 | 
109 |             #If token not matched in this layer of existing tree.
110 |             if token not in parentn.childD:
111 |                 if not self.hasNumbers(token):
112 |                     if '<*>' in parentn.childD:
113 |                         if len(parentn.childD) < self.maxChild:
114 |                             newNode = Node(depth=currentDepth + 1, digitOrtoken=token)
115 |                             parentn.childD[token] = newNode
116 |                             parentn = newNode
117 |                         else:
118 |                             parentn = parentn.childD['<*>']
119 |                     else:
120 |                         if len(parentn.childD)+1 < self.maxChild:
121 |                             newNode = Node(depth=currentDepth+1, digitOrtoken=token)
122 |                             parentn.childD[token] = newNode
123 |                             parentn = newNode
124 |                         elif len(parentn.childD)+1 == self.maxChild:
125 |                             newNode = Node(depth=currentDepth+1, digitOrtoken='<*>')
126 |                             parentn.childD['<*>'] = newNode
127 |                             parentn = newNode
128 |                         else:
129 |                             parentn = parentn.childD['<*>']
130 | 
131 |                 else:
132 |                     if '<*>' not in parentn.childD:
133 |                         newNode = Node(depth=currentDepth+1, digitOrtoken='<*>')
134 |                         parentn.childD['<*>'] = newNode
135 |                         parentn = newNode
136 |                     else:
137 |                         parentn = parentn.childD['<*>']
138 | 
139 |             #If the token is matched
140 |             else:
141 |                 parentn = parentn.childD[token]
142 | 
143 |             currentDepth += 1
144 | 
145 |     #seq1 is template
146 |     def seqDist(self, seq1, seq2):
147 |         assert len(seq1) == len(seq2)
148 |         simTokens = 0
149 |         numOfPar = 0
150 | 
151 |         for token1, token2 in zip(seq1, seq2):
152 |             if token1 == '<*>':
153 |                 numOfPar += 1
154 |                 continue
155 |             if token1 == token2:
156 |                 simTokens += 1
157 | 
158 |         retVal = float(simTokens) / len(seq1)
159 | 
160 |         return retVal, numOfPar
161 | 
162 | 
163 |     def fastMatch(self, logClustL, seq):
164 |         retLogClust = None
165 | 
166 |         maxSim = -1
167 |         maxNumOfPara = -1
168 |         maxClust = None
169 | 
170 |         for logClust in logClustL:
171 |             curSim, curNumOfPara = self.seqDist(logClust.logTemplate, seq)
172 |             if curSim>maxSim or (curSim==maxSim and curNumOfPara>maxNumOfPara):
173 |                 maxSim = curSim
174 |                 maxNumOfPara = curNumOfPara
175 |                 maxClust = logClust
176 | 
177 |         if maxSim >= self.st:
178 |             retLogClust = maxClust
179 | 
180 |         return retLogClust
181 | 
182 |     def getTemplate(self, seq1, seq2):
183 |         assert len(seq1) == len(seq2)
184 |         retVal = []
185 | 
186 |         i = 0
187 |         for word in seq1:
188 |             if word == seq2[i]:
189 |                 retVal.append(word)
190 |             else:
191 |                 retVal.append('<*>')
192 | 
193 |             i += 1
194 | 
195 |         return retVal
196 | 
197 |     def outputResult(self, logClustL):
198 |         log_templates = [0] * self.df_log.shape[0]
199 |         log_templateids = [0] * self.df_log.shape[0]
200 |         df_events = []
201 |         for logClust in logClustL:
202 |             template_str = ' '.join(logClust.logTemplate)
203 |             occurrence = len(logClust.logIDL)
204 |             template_id = hashlib.md5(template_str.encode('utf-8')).hexdigest()[0:8]
205 |             for logID in logClust.logIDL:
206 |                 logID -= 1
207 |                 log_templates[logID] = template_str
208 |                 log_templateids[logID] = template_id
209 |             df_events.append([template_id, template_str, occurrence])
210 | 
211 |         df_event = pd.DataFrame(df_events, columns=['EventId', 'EventTemplate', 'Occurrences'])
212 |         self.df_log['EventId'] = log_templateids
213 |         self.df_log['EventTemplate'] = log_templates
214 | 
215 |         if self.keep_para:
216 |             self.df_log["ParameterList"] = self.df_log.apply(self.get_parameter_list, axis=1)
217 |         self.df_log.to_csv(os.path.join(self.savePath, self.logName + '_structured.csv'), index=False)
218 | 
219 | 
220 |         occ_dict = dict(self.df_log['EventTemplate'].value_counts())
221 |         df_event = pd.DataFrame()
222 |         df_event['EventTemplate'] = self.df_log['EventTemplate'].unique()
223 |         df_event['EventId'] = df_event['EventTemplate'].map(lambda x: hashlib.md5(x.encode('utf-8')).hexdigest()[0:8])
224 |         df_event['Occurrences'] = df_event['EventTemplate'].map(occ_dict)
225 |         df_event.to_csv(os.path.join(self.savePath, self.logName + '_templates.csv'), index=False, columns=["EventId", "EventTemplate", "Occurrences"])
226 | 
227 | 
228 |     def printTree(self, node, dep):
229 |         pStr = ''
230 |         for i in range(dep):
231 |             pStr += '\t'
232 | 
233 |         if node.depth == 0:
234 |             pStr += 'Root'
235 |         elif node.depth == 1:
236 |             pStr += '<' + str(node.digitOrtoken) + '>'
237 |         else:
238 |             pStr += node.digitOrtoken
239 | 
240 |         print(pStr)
241 | 
242 |         if node.depth == self.depth:
243 |             return 1
244 |         for child in node.childD:
245 |             self.printTree(node.childD[child], dep+1)
246 | 
247 | 
248 |     def parse(self, logName):
249 |         print('Parsing file: ' + os.path.join(self.path, logName))
250 |         start_time = datetime.now()
251 |         self.logName = logName
252 |         rootNode = Node()
253 |         logCluL = []
254 | 
255 |         self.load_data()
256 | 
257 |         count = 0
258 |         for idx, line in self.df_log.iterrows():
259 |             logID = line['LineId']
260 |             logmessageL = self.preprocess(line['Content']).strip().split()
261 |             # logmessageL = filter(lambda x: x != '', re.split('[\s=:,]', self.preprocess(line['Content'])))
262 |             matchCluster = self.treeSearch(rootNode, logmessageL)
263 | 
264 |             #Match no existing log cluster
265 |             if matchCluster is None:
266 |                 newCluster = Logcluster(logTemplate=logmessageL, logIDL=[logID])
267 |                 logCluL.append(newCluster)
268 |                 self.addSeqToPrefixTree(rootNode, newCluster)
269 | 
270 |             #Add the new log message to the existing cluster
271 |             else:
272 |                 newTemplate = self.getTemplate(logmessageL, matchCluster.logTemplate)
273 |                 matchCluster.logIDL.append(logID)
274 |                 if ' '.join(newTemplate) != ' '.join(matchCluster.logTemplate):
275 |                     matchCluster.logTemplate = newTemplate
276 | 
277 |             count += 1
278 |             if count % 1000 == 0 or count == len(self.df_log):
279 |                 print('Processed {0:.1f}% of log lines.'.format(count * 100.0 / len(self.df_log)))
280 | 
281 | 
282 |         if not os.path.exists(self.savePath):
283 |             os.makedirs(self.savePath)
284 | 
285 |         self.outputResult(logCluL)
286 | 
287 |         print('Parsing done. [Time taken: {!s}]'.format(datetime.now() - start_time))
288 | 
289 |     def load_data(self):
290 |         headers, regex = self.generate_logformat_regex(self.log_format)
291 |         self.df_log = self.log_to_dataframe(os.path.join(self.path, self.logName), regex, headers, self.log_format)
292 | 
293 |     def preprocess(self, line):
294 |         for currentRex in self.rex:
295 |             line = re.sub(currentRex, '<*>', line)
296 |         return line
297 | 
298 |     def log_to_dataframe(self, log_file, regex, headers, logformat):
299 |         """ Function to transform log file to dataframe
300 |         """
301 |         log_messages = []
302 |         linecount = 0
303 |         with open(log_file, 'r') as fin:
304 |             for line in fin.readlines():
305 |                 try:
306 |                     match = regex.search(line.strip())
307 |                     message = [match.group(header) for header in headers]
308 |                     log_messages.append(message)
309 |                     linecount += 1
310 |                 except Exception as e:
311 |                     pass
312 |         logdf = pd.DataFrame(log_messages, columns=headers)
313 |         logdf.insert(0, 'LineId', None)
314 |         logdf['LineId'] = [i + 1 for i in range(linecount)]
315 |         return logdf
316 | 
317 | 
318 |     def generate_logformat_regex(self, logformat):
319 |         """ Function to generate regular expression to split log messages
320 |         """
321 |         headers = []
322 |         splitters = re.split(r'(<[^<>]+>)', logformat)
323 |         regex = ''
324 |         for k in range(len(splitters)):
325 |             if k % 2 == 0:
326 |                 splitter = re.sub(' +', '\\\s+', splitters[k])
327 |                 regex += splitter
328 |             else:
329 |                 header = splitters[k].strip('<').strip('>')
330 |                 regex += '(?P<%s>.*?)' % header
331 |                 headers.append(header)
332 |         regex = re.compile('^' + regex + '$')
333 |         return headers, regex
334 | 
335 |     def get_parameter_list(self, row):
336 |         template_regex = re.sub(r"<.{1,5}>", "<*>", row["EventTemplate"])
337 |         if "<*>" not in template_regex: return []
338 |         template_regex = re.sub(r'([^A-Za-z0-9])', r'\\\1', template_regex)
339 |         template_regex = re.sub(r'\\ +', r'\s+', template_regex)
340 |         template_regex = "^" + template_regex.replace("\<\*\>", "(.*?)") + "$"
341 |         parameter_list = re.findall(template_regex, row["Content"])
342 |         parameter_list = parameter_list[0] if parameter_list else ()
343 |         parameter_list = list(parameter_list) if isinstance(parameter_list, tuple) else [parameter_list]
344 |         return parameter_list
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Log-based Anomaly Detection System
2 | The final project of Deep Learning and Practice (summer 2020) at NCTU.
3 | 
4 | The main procedures of this system are as follows:
5 | 1. Adopts Drain to parse log messages and extract log events (templates).
6 | 2. Extracts semantic information from log events and represents them as semantic vectors using Sentence-BERT.
7 | 3. Detects anomalies with an attention-based Bi-LSTM model, which captures the contextual
8 |    information in the log sequences and automatically learns the importance of different log events (a minimal model sketch is given below).
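
The sketch below illustrates how such a model can be assembled from the pieces in this repository (the `Attention` layer and the 768-dimensional Sentence-BERT embeddings produced by `dataloader.py`). It is a minimal sketch only: the hidden size, `MAX_LEN`, and optimizer are assumptions, not necessarily the settings used in `train.py`.

```python
# Illustrative sketch only: the hidden size, MAX_LEN, and optimizer are
# assumptions, not necessarily the configuration used by train.py.
from keras.models import Model
from keras.layers import Input, Bidirectional, LSTM, Dense
from Attention import Attention

EMBEDDING_DIM = 768   # Sentence-BERT vector size (see dataloader.py)
MAX_LEN = 50          # assumed fixed number of log events per HDFS block

# Input: a padded sequence of log-event embeddings for one block.
inputs = Input(shape=(MAX_LEN, EMBEDDING_DIM))
# Bi-LSTM encodes each log event with its surrounding context.
hidden = Bidirectional(LSTM(128, return_sequences=True))(inputs)
# Attention produces a learned weighted sum over the log events.
context = Attention()(hidden)
# Sigmoid output gives the anomaly probability of the block.
outputs = Dense(1, activation='sigmoid')(context)

model = Model(inputs, outputs)
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
```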
9 | 
10 | We evaluated the system on the public HDFS dataset; it achieves a recall of 0.9977,
11 | a precision of 0.9691, and an F1-score of 0.9832.
12 | 
13 | ## Preprocessing Order
14 | * Unzip HDFS_1.tar.gz into log_data/
15 | * Run parse_log.py
16 | * Run preprocessing.py
17 | 
18 | ## Data Description
19 | * total: 575,061 blocks
20 |     - 16,838 anomaly blocks
21 |     - 558,223 normal blocks
22 | * training: randomly selected
23 |     - 6,000 anomaly blocks
24 |     - 6,000 normal blocks
25 | * testing: the remaining data
26 | 
27 | ## Reference
28 | * [https://dl.acm.org/doi/10.1145/3338906.3338931](https://dl.acm.org/doi/10.1145/3338906.3338931)
29 | * [https://ieeexplore.ieee.org/document/8029742](https://ieeexplore.ieee.org/document/8029742)
30 | * [https://arxiv.org/abs/1908.10084](https://arxiv.org/abs/1908.10084)
31 | * [https://github.com/logpai/loghub](https://github.com/logpai/loghub)
32 | 
--------------------------------------------------------------------------------
/dataloader.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import keras
3 | from keras.preprocessing.sequence import pad_sequences
4 | import math
5 | 
6 | 
7 | EMBEDDING_DIM = 768
8 | 
9 | 
10 | class DataGenerator(keras.utils.Sequence):
11 |     def __init__(self, x, y, batch_size):
12 |         'Initialization'
13 |         self.x = x
14 |         self.y = y
15 |         self.batch_size = batch_size
16 | 
17 |     def __len__(self):
18 |         'Denotes the number of batches'
19 |         return math.ceil(len(self.x) / self.batch_size)
20 | 
21 |     def __getitem__(self, index):
22 |         'Generate one batch of data'
23 |         x = self.x[index * self.batch_size:(index + 1) * self.batch_size]
24 |         y = self.y[index * self.batch_size:(index + 1) * self.batch_size]
25 | 
26 |         x = pad_sequences(x, dtype='object', padding='post',
27 |                           value=np.zeros(EMBEDDING_DIM)).astype(np.float32)
28 | 
29 |         return x, y
30 | 
--------------------------------------------------------------------------------
/parse_log.py:
--------------------------------------------------------------------------------
1 | import Drain
2 | 
3 | input_dir = 'log_data/'       # The input directory of the log file
4 | output_dir = 'parse_result/'  # The output directory of the parsing results
5 | log_file = 'HDFS.log'         # The input log file name
6 | log_format = '