├── .project
├── .pydevproject
├── .travis.yml
├── README.md
├── log_clustering.py
└── requirements.txt

/.project:
--------------------------------------------------------------------------------
1 | <?xml version="1.0" encoding="UTF-8"?>
2 | <projectDescription>
3 |     <name>logclustering-py</name>
4 |     <comment></comment>
5 |     <projects>
6 |     </projects>
7 |     <buildSpec>
8 |         <buildCommand>
9 |             <name>org.python.pydev.PyDevBuilder</name>
10 |             <arguments>
11 |             </arguments>
12 |         </buildCommand>
13 |     </buildSpec>
14 |     <natures>
15 |         <nature>org.python.pydev.pythonNature</nature>
16 |     </natures>
17 | </projectDescription>
18 | 
--------------------------------------------------------------------------------
/.pydevproject:
--------------------------------------------------------------------------------
1 | <?xml version="1.0" encoding="UTF-8"?>
2 | <pydev_project>
3 | <pydev_property name="org.python.pydev.PYTHON_PROJECT_INTERPRETER">Default</pydev_property>
4 | <pydev_property name="org.python.pydev.PYTHON_PROJECT_VERSION">python 2.7</pydev_property>
5 | <pydev_pathproperty name="org.python.pydev.PROJECT_SOURCE_PATH">
6 | <path>/${PROJECT_DIR_NAME}</path>
7 | </pydev_pathproperty>
8 | </pydev_project>
9 | 
--------------------------------------------------------------------------------
/.travis.yml:
--------------------------------------------------------------------------------
1 | language: python
2 | python:
3 |   - "2.7"
4 | # command to install dependencies
5 | install: "pip install -r requirements.txt"
6 | # command to run tests
7 | script: nosetests
8 | 
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # logclustering-py
2 | 
3 | [![Build Status](https://travis-ci.org/fluency03/logclustering-py.svg?branch=master)](https://travis-ci.org/fluency03/logclustering-py)
4 | 
5 | Log clustering by detecting the similarity (edit distance) between two logs.
6 | 
7 | *This is part of my master thesis project and is still in development.*
8 | 
9 | ## Requirements
10 | 
11 | - [Python 2.7](https://www.python.org/downloads/)
12 | - [NumPy](http://www.numpy.org/): The fundamental package needed for scientific computing with Python.
13 | - [editdistance](https://github.com/aflc/editdistance): Fast implementation of the edit distance (Levenshtein distance) in C++ and CPython.
14 | - [python-dateutil](https://github.com/dateutil/dateutil): Useful extensions to the standard Python datetime features.
15 | 
--------------------------------------------------------------------------------
/log_clustering.py:
--------------------------------------------------------------------------------
1 | """
2 | This program clusters log messages based on their similarity. The
3 | similarity between two logs is measured by the edit distance between them.
4 | For calculating the edit distance, the basic unit of a log is not a character
5 | but a token.
6 | 
7 | After the clustering, each of the generated clusters will be labeled with an
8 | integer ID starting from 1. ID 0 represents a place with 'no-log'. The last ID
9 | represents 'unknown-log' for further log matching. These IDs will be stored in
10 | a dictionary for matching new logs.
11 | 
12 | The method 'levenshtein_numpy(source, target)'[1] implemented here is very slow
13 | compared with the package 'editdistance'[2], since the source code of
14 | 'editdistance' is written in C++. However, if we would like to modify the
15 | Levenshtein algorithm, e.g. by adding weights to different token classes,
16 | 'levenshtein_numpy(source, target)' is easier to change.
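As an added illustration (the token lists below are made-up examples, not from
the source logs), the 'editdistance' package computes the token-level distance
directly on lists of tokens:

    editdistance.eval(['link', 'up', 'on', 'port', '*'],
                      ['link', 'down', 'on', 'port', '*'])   # -> 1 (one substitution)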
17 | 
18 | Author: Chang Liu (fluency03)
19 | Date: 2016-03-11
20 | 
21 | [1] Wikibooks: Algorithm Implementation/Strings/Levenshtein distance - Python:
22 | https://en.wikibooks.org/wiki/Algorithm_Implementation/Strings/Levenshtein_distance#Python
23 | 
24 | [2] Fast implementation of the edit distance (Levenshtein distance) - C++:
25 | https://github.com/aflc/editdistance
26 | 
27 | [3] The dateutil module provides powerful extensions to the standard datetime
28 | module, available in Python:
29 | https://github.com/dateutil/dateutil
30 | 
31 | """
32 | 
33 | 
34 | import glob
35 | import cPickle as pickle
36 | import os
37 | import re
38 | import socket
39 | import time
40 | import matplotlib.pyplot as plt
41 | import numpy as np
42 | import editdistance
43 | from dateutil.parser import parse as timeparser
44 | 
45 | 
46 | def is_timestamp(string):
47 |     """
48 |     Check whether the input string is in a time format like: 'Feb 11 05:22:51 '.
49 | 
50 |     Arguments:
51 |         string: {string}, input string for time-stamp check.
52 |     """
53 |     try:
54 |         time.strptime(string, '%b %d %H:%M:%S ')
55 |         return True
56 |     except ValueError:
57 |         return False
58 | 
59 | 
60 | def is_time(string):
61 |     """
62 |     Check whether this is a time string.
63 |     It supports most of the common time formats, more than is_timestamp() does.
64 | 
65 |     Arguments:
66 |         string: {string}, input string for time check.
67 |     """
68 |     try:
69 |         timeparser(string)
70 |         return True
71 |     except ValueError:
72 |         return False
73 | 
74 | 
75 | def is_ipv4(address):
76 |     """
77 |     Check whether this is a valid IPv4 address.
78 | 
79 |     Arguments:
80 |         address: {string}, input string for IPv4 check.
81 |     """
82 |     try:
83 |         socket.inet_pton(socket.AF_INET, address)
84 |     except AttributeError:  # no inet_pton here, sorry
85 |         try:
86 |             socket.inet_aton(address)
87 |         except socket.error:
88 |             return False
89 |         return address.count('.') == 3
90 |     except socket.error:  # not a valid address
91 |         return False
92 | 
93 |     return True
94 | 
95 | 
96 | def is_ipv6(address):
97 |     """
98 |     Check whether this is a valid IPv6 address.
99 | 
100 |     Arguments:
101 |         address: {string}, input string for IPv6 check.
102 |     """
103 |     try:
104 |         socket.inet_pton(socket.AF_INET6, address)
105 |     except socket.error:  # not a valid address
106 |         return False
107 |     return True
108 | 
109 | 
110 | def contain_hex(string):
111 |     """
112 |     Check whether the string contains hex values (0x...).
113 | 
114 |     Arguments:
115 |         string: {string}, input string for hex value check.
116 |     """
117 |     hex_pattern = r'0x[\da-fA-F]+'
118 | 
119 |     return re.search(hex_pattern, string) is not None
120 | 
121 | def is_ip_address(address):
122 |     """
123 |     Check whether this is a valid IP address (IPv4 or IPv6).
124 | 
125 |     Arguments:
126 |         address: {string}, input string for IP address (IPv4/IPv6) check.
127 |     """
128 |     return is_ipv4(address) or is_ipv6(address)
129 | 
130 | 
131 | def is_pci_address(address):
132 |     """
133 |     Check whether this is a PCI address, like 0000:00:00.0 .
134 | 
135 |     Arguments:
136 |         address: {string}, input string for PCI address check.
137 |     """
138 |     pci_addr_pattern = r'(0000):([\da-fA-F]{2}):([\da-fA-F]{2}).(\d)'
139 | 
140 |     return re.search(pci_addr_pattern, address) is not None
141 | 
142 | 
143 | def is_number(number):
144 |     """
145 |     Check whether this is a number (int, long, float, hex).
146 | 
147 |     Arguments:
148 |         number: {string}, input string for number check.
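
    Added illustrative examples (not from the original docstring):
        is_number('42')    -> True   (int)
        is_number('3.14')  -> True   (float)
        is_number('0x1A')  -> True   (parsed as hex)
        is_number('eth0')  -> False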
149 |     """
150 |     try:
151 |         float(number)  # for int, long, float
152 |     except ValueError:
153 |         try:
154 |             int(number, 16)  # for possible hex
155 |         except ValueError:
156 |             return False
157 | 
158 |     return True
159 | 
160 | 
161 | def to_wildcard(tokens):
162 |     """
163 |     Replace number tokens, hex (0x...) tokens, IP addresses and
164 |     PCI addresses by the wildcard symbol * .
165 | 
166 |     Arguments:
167 |         tokens: {list}, a list of tokens.
168 |     """
169 |     for i in range(0, len(tokens)):
170 |         token = tokens[i]
171 |         if (is_number(token) or contain_hex(token) or
172 |                 is_ip_address(token) or is_pci_address(token)):
173 |             tokens[i] = '*'
174 |         else:
175 |             # convert all digits in the token string into '0'
176 |             tokens[i] = ''.join(('0' if char.isdigit() else char
177 |                                  for char in token))
178 |             # tokens[i] = token
179 | 
180 |     return tokens
181 | 
182 | 
183 | def compare_two_tokens(token1, token2):
184 |     """
185 |     Compare two string tokens:
186 |     if either of them is *, or they are equal, return True;
187 |     else, return False.
188 | 
189 |     Arguments:
190 |         token1, token2: {string}, two tokens to be compared.
191 |     """
192 |     return token1 == '*' or token2 == '*' or token1 == token2
193 | 
194 | 
195 | def check_directory(path):
196 |     """
197 |     Check whether the path/directory exists. If not, create a new one.
198 | 
199 |     Arguments:
200 |         path: {string}, the given path/directory.
201 |     """
202 |     if not os.path.exists(path):
203 |         print "Directory '%s' does not exist. Creating it... " %path
204 |         os.makedirs(path)
205 | 
206 | 
207 | class LogTemplateExtractor(object):
208 |     """
209 |     A log template extractor.
210 | 
211 |     Attributes:
212 |         logfile_path: {string}, the path of log files to be analyzed.
213 |         template_file: {string}, the output file for storing log templates.
214 |         cluster_file: {string}, the output file for storing clustered logs.
215 |         seqfile_path: {string}, the output path for storing log sequences.
216 |         search_dict_file: {string}, the output file for storing the search
217 |             dictionary.
218 |         delimiter_kept: {regex}, delimiters for dividing a log into tokens.
219 |         cmd_regex: {regex}, regular expression for extracting the command token
220 |             from a log message.
221 |         distance_threshold: {float}, two logs with an edit distance less than
222 |             this distance_threshold are considered to be similar. Default: 0.1.
223 |         ignored_chars: {integer}, how many chars are ignored from the beginning
224 |             of a log (because of the time-stamp, server-name, etc.). Default: 21.
225 |         template_dict: {dictionary}, storing all the clustered log templates
226 |             and their IDs.
227 |         search_dict: {dictionary}, storing template IDs for matching new logs.
228 |     """
229 |     def __init__(self, logfile_path):
230 |         """
231 |         Inits LogTemplateExtractor class.
232 | 
233 |         Arguments:
234 |             logfile_path: {string}, the path of log files to be analyzed.
235 |         """
236 |         self.logfile_path = logfile_path
237 |         self.template_file = "./template"
238 |         self.cluster_file = "./clusters"
239 |         self.search_dict_file = "./search_dict"
240 |         self.seqfile_path = "./sequences/"
241 | 
242 |         # regex of delimiters for tokenization
243 |         self.delimiter_kept = r'([\*\s,:()\[\]=|/\\{}\'\"<>\.\_\-])'
244 | 
245 |         # the command token could contain English letters, '-', '_', ' ' and '.'
246 |         # example: rsyslogd, CMW, ERIC-RDA-Merged-Campaign,
247 |         # mmas_syslog_control_setup.sh, JavaOaM install_imm_model.sh, etc.
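        # Added illustration (assumed sample message, not from the source logs):
        # for a line such as "rsyslogd[1482]: imklog started", cmd_regex group(1)
        # captures the command token "rsyslogd", which later forms part of the
        # (command, token-count) partition key.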
248 |         self.cmd_regex = r'([\w\-\_\./ \*]+)([\[:])(.*)'
249 | 
250 |         self.distance_threshold = 0.1
251 |         self.ignored_chars = 21
252 | 
253 |         self.template_dict = {}
254 |         self.search_dict = {}
255 | 
256 |     def set_logfile_path(self, logfile_path):
257 |         """
258 |         Set the source log file (name/path) which is going to be analyzed.
259 | 
260 |         Arguments:
261 |             logfile_path: {string}, the path of log files to be analyzed.
262 |         """
263 |         self.logfile_path = logfile_path
264 | 
265 |     def set_seqfile_path(self, seqfile_path):
266 |         """
267 |         Set the sequence file (name/path) which stores the final sequence of the logs.
268 | 
269 |         Arguments:
270 |             seqfile_path: {string}, the path of log files to be matched for
271 |                 generating sequences.
272 |         """
273 |         self.seqfile_path = seqfile_path
274 | 
275 |     def set_search_dict_file(self, search_dict_file):
276 |         """
277 |         Set the search_dict file (name/path) which stores the search dictionary
278 |         for new input log files.
279 | 
280 |         Arguments:
281 |             search_dict_file: {string}, the name/path of the search dictionary file.
282 |         """
283 |         self.search_dict_file = search_dict_file
284 | 
285 |     def set_template_file(self, template_file):
286 |         """
287 |         Set the template file (name/path) which stores template IDs and
288 |         their representations.
289 | 
290 |         Arguments:
291 |             template_file: {string}, the name/path of the template/IDs file.
292 |         """
293 |         self.template_file = template_file
294 | 
295 |     def set_cluster_file(self, cluster_file):
296 |         """
297 |         Set the log cluster file (name/path) which stores each template ID and
298 |         the logs contained in each of the clusters.
299 | 
300 |         Arguments:
301 |             cluster_file: {string}, the name/path of the clustered logs file.
302 |         """
303 |         self.cluster_file = cluster_file
304 | 
305 |     def set_delimiter(self, delimiter_kept):
306 |         """
307 |         Set the delimiters (in regex) for dividing one log into tokens.
308 | 
309 |         Arguments:
310 |             delimiter_kept: {regex}, delimiters for dividing a log into tokens.
311 |         """
312 |         self.delimiter_kept = delimiter_kept
313 | 
314 |     def set_distance_threshold(self, distance_threshold):
315 |         """
316 |         Set the distance threshold (0 ~ 1) used for creating new clusters.
317 |         The smaller the threshold is, the more similar two logs have to be
318 |         in order to be clustered together.
319 | 
320 |         Arguments:
321 |             distance_threshold: {float}, distance_threshold to be set.
322 |         """
323 |         self.distance_threshold = distance_threshold
324 | 
325 |     def set_ignored_chars(self, ignored_chars):
326 |         """
327 |         Set the number of ignored chars at the beginning of each log.
328 | 
329 |         Arguments:
330 |             ignored_chars: {integer}, number of ignored chars at the beginning.
331 |         """
332 |         self.ignored_chars = ignored_chars
333 | 
334 |     def levenshtein_numpy(self, source, target):
335 |         """
336 |         Dynamic programming algorithm, with the added optimization that only
337 |         the last two rows of the dynamic programming matrix are needed for
338 |         the computation. Vectorized version using NumPy.
339 | 
340 |         Arguments:
341 |             source, target: {list}, two lists of tokens to be compared.
342 |         """
343 |         if len(source) < len(target):
344 |             return self.levenshtein_numpy(target, source)
345 | 
346 |         # So now we have len(source) >= len(target).
347 |         if len(target) == 0:
348 |             return len(source)
349 | 
350 |         # We call tuple() to force strings to be used as sequences
351 |         # ('c', 'a', 't', 's') - numpy uses them as values by default.
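        # Added note (illustration only): tuple() keeps each token intact, e.g.
        # np.array(tuple(['Accepted', 'publickey'])) gives a 2-element array of
        # token strings, so "target != item" below compares whole tokens
        # element-wise.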
352 |         source = np.array(tuple(source))
353 |         target = np.array(tuple(target))
354 | 
355 |         # We use a dynamic programming algorithm, but with the
356 |         # added optimization that we only need the last two rows
357 |         # of the matrix.
358 |         previous_row = np.arange(target.size + 1)  # pylint: disable=E1101
359 |         for item in source:
360 |             # Insertion (target grows longer than source):
361 |             current_row = previous_row + 1
362 | 
363 |             # Substitution or matching:
364 |             # Target and source items are aligned, and either
365 |             # are different (cost of 1), or are the same (cost of 0).
366 |             current_row[1:] = np.minimum(current_row[1:],
367 |                                          np.add(previous_row[:-1],
368 |                                                 target != item))
369 | 
370 |             # Deletion (target grows shorter than source):
371 |             current_row[1:] = np.minimum(current_row[1:],
372 |                                          current_row[0:-1] + 1)
373 | 
374 |             previous_row = current_row
375 | 
376 |         return float(previous_row[-1]) / float(max(len(source), len(target)))
377 | 
378 |     def tokenize(self, line):
379 |         """
380 |         Tokenize the line.
381 | 
382 |         Arguments:
383 |             line: {string}, one input log message.
384 |         """
385 |         return [t for t in re.split(self.delimiter_kept, line) if t != '']
386 | 
387 |     def min_distance(self, added_line, one_cluster_dict):
388 |         """
389 |         Calculate the minimal distance between the log and all the sub-clusters
390 |         from the previous pre-partitioned cluster.
391 | 
392 |         Arguments:
393 |             added_line: {list}, a list of tokens.
394 |             one_cluster_dict: {dictionary}, a dictionary for some clusters.
395 |         Return the minimal distance and its index (key for cluster).
396 |         """
397 |         # dictionary of the distance between this log and
398 |         # each of its compared clusters
399 |         distance = {}
400 | 
401 |         len_line = len(added_line)
402 | 
403 |         for i in one_cluster_dict:
404 |             cluster = one_cluster_dict[i]
405 | 
406 |             # the first log of this cluster represents this cluster
407 |             cluster_line = cluster[0]
408 |             len_cluster = len(cluster_line)
409 | 
410 |             # if the length difference is already beyond the distance threshold,
411 |             # there is no need to calculate the edit distance, and
412 |             # the distance ratio is set to 1
413 |             if (float(abs(len_cluster - len_line)) / min(len_line, len_cluster) <
414 |                     self.distance_threshold):
415 |                 dis_ratio = (
416 |                     float(editdistance.eval(cluster_line, added_line)) /
417 |                     float(min(len(added_line), len(cluster_line))))
418 |             else:
419 |                 dis_ratio = float(1)
420 | 
421 |             distance[i] = dis_ratio
422 | 
423 |         # find the minimal distance and its key value
424 |         mini = min(distance.iteritems(), key=lambda x: x[1])
425 | 
426 |         return mini[1], mini[0]
427 | 
428 |     def add_log(self, added_line, command_cluster):
429 |         """
430 |         Add this log into a partition, or create a new partition.
431 | 
432 |         Arguments:
433 |             added_line: {string}, one log line to be added.
434 |             command_cluster: {dictionary}, a dictionary for a certain cluster of
435 |                 logs with the same command.
436 |         The command_cluster dictionary is updated in place.
437 |         """
438 |         # pattern for extracting the command
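        # Added example (assumed log line, for illustration): from
        # "sshd[2541]: Accepted publickey for root", the command "sshd" and the
        # token count of the tokenized message form the partition key.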
439 |         cmd_pattern = re.compile(self.cmd_regex)
440 | 
441 |         # print added_line
442 |         # extract command
443 |         command = re.match(cmd_pattern,
444 |                            added_line[self.ignored_chars:]).group(1)
445 |         # tokenize the log message
446 |         line_tokens = self.tokenize(added_line[self.ignored_chars:])
447 |         # convert numbers, hex values, IP addresses and PCI addresses to *
448 |         line_tokens = to_wildcard(line_tokens)
449 | 
450 |         # get the length of this token list
451 |         length = len(line_tokens)
452 | 
453 |         # if the cluster (command, length) already exists,
454 |         # append the current log to it;
455 |         # if not, create a new cluster whose key is (command, length) and
56 |         # whose initial value is [current log]
457 |         command_cluster.setdefault(
458 |             (command, length), []).append(line_tokens)
459 | 
460 |     def partition_by_command(self):
461 |         """
462 |         First partition the original logs based on their command type and the
463 |         length of each log because:
464 |         1. It dramatically reduces the computational time, much of which is
465 |            spent on the Levenshtein distance.
466 |         2. Naturally, we should cluster logs starting with different command
467 |            names into different clusters.
468 |         3. The logs within one cluster sharing the same length will make the next
469 |            template extraction step easier.
470 |         """
471 |         # dictionary of the partitions divided based on
472 |         # the tuple of command type and log length
473 |         command_cluster = {}
474 | 
475 |         # keep track of the number of each log
476 |         # current_num = 0
477 | 
478 |         # log files
479 |         logfiles = glob.glob(self.logfile_path)
480 | 
481 |         print " |-Number of log files to be analyzed: %d" %len(logfiles)
482 | 
483 |         for logfile in logfiles:
484 |             print " " + logfile
485 |             with open(logfile) as in_file:
486 |                 # read the first line
487 |                 added_line = in_file.readline()
488 |                 # current_num = current_num + 1
489 | 
490 |                 # the real first line is the first log that starts with a time-stamp
491 |                 while not is_timestamp(added_line[:16]):
492 |                     added_line = in_file.readline()
493 |                     # current_num = current_num + 1
494 | 
495 |                 # read the following lines
496 |                 for line in in_file:
497 |                     # current_num = current_num + 1
498 | 
499 |                     # if the current line does not start with a time-stamp, it is
500 |                     # appended to the previous nearest log that does start
501 |                     # with a time-stamp
502 |                     if not is_time(line[:16]):
503 |                         added_line = added_line.rstrip() + ' | ' + line
504 |                         continue
505 |                     else:
506 |                         self.add_log(added_line, command_cluster)
507 |                         # update added_line
508 |                         added_line = line
509 | 
510 |                 # Take the last line into account
511 |                 self.add_log(added_line, command_cluster)
512 | 
513 |         return command_cluster
514 | 
515 |     def log_clustering(self, print_clusters=False):
516 |         """
517 |         Similarity checks and clustering after partitioning based on command.
518 |         Cluster IDs start from 1, all integers.
519 | 
520 |         Arguments:
521 |             print_clusters: {bool}, whether to write the clusters into a file.
522 |         """
523 |         print " |-Clustering logs..."
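        # Added note (illustration): with the default distance_threshold of 0.1,
        # two logs from the same (command, length) partition join one cluster only
        # when their token-level edit distance is below 10% of the shorter log's
        # token count (see min_distance() above).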
524 | 
525 |         # clusters based on command and log length
526 |         command_cluster = self.partition_by_command()
527 | 
528 |         # dictionary of the log clusters
529 |         cluster_dict = {}
530 |         # keep track of the log cluster number
531 |         cluster_num = 1
532 | 
533 |         for i in command_cluster:
534 |             one_cluster_dict = {}
535 |             for line in command_cluster[i]:
536 |                 if not one_cluster_dict:
537 |                     one_cluster_dict[cluster_num] = [line]
538 |                     cluster_num += 1
539 |                 else:
540 |                     # get the minimal distance ratio and its index key
541 |                     min_dis, min_key = self.min_distance(line, one_cluster_dict)
542 | 
543 |                     # if the minimal distance ratio is less than the threshold,
544 |                     # add this log into the cluster according to the index key;
545 |                     # otherwise, create a new cluster
546 |                     if min_dis < self.distance_threshold:
547 |                         one_cluster_dict[min_key].append(line)
548 |                     else:
549 |                         one_cluster_dict[cluster_num] = [line]
550 |                         cluster_num += 1
551 | 
552 |             # put all new clusters into the dictionary
553 |             cluster_dict.update(one_cluster_dict)
554 | 
555 |         # print the clusters
556 |         if print_clusters:
557 |             print " |-Write the clusters into %s ..." %self.cluster_file
558 |             with open(self.cluster_file, 'w') as cluster_file:
559 |                 for i in cluster_dict:
560 |                     cluster_file.write(str(i) + '\n')
561 |                     for item in cluster_dict[i]:
562 |                         cluster_file.write(''.join(item).rstrip() + '\n')
563 |             print " |-Write the clusters into %s.pkl ..." %self.cluster_file
564 |             with open(self.cluster_file + '.pkl', 'w') as cluster_pkl_file:
565 |                 pickle.dump(cluster_dict, cluster_pkl_file)
566 | 
567 |         print " |-Number of clusters generated: %d" %len(cluster_dict)
568 | 
569 |         return cluster_dict
570 | 
571 |     def log_template(self, cluster):  # pylint: disable=R0201
572 |         """
573 |         Collect the unique tokens at each position of a log within a cluster.
574 |         Replace the positions with more than one unique token by the wildcard *.
575 |         Generate the template representation for this cluster.
576 | 
577 |         Arguments:
578 |             cluster: {list}, a cluster of similar logs.
579 |         """
580 |         # the first log represents this cluster
581 |         line_tokens = cluster[0]
582 |         # get the length
583 |         length = len(line_tokens)
584 | 
585 |         # a list of dictionaries representing, at each token position,
586 |         # how many different tokens there are and what they are
587 |         token_collection = []
588 |         for line in cluster:
589 |             for i in range(0, length):
590 |                 token = line[i]
591 |                 if len(token_collection) > i:
592 |                     token_collection[i].setdefault(token)
593 |                 else:
594 |                     token_collection.append({token: None})
595 | 
596 |         # for positions sharing more than one unique token,
597 |         # regard them as variables and convert them into *
598 |         for i in range(0, length):
599 |             if len(token_collection[i]) != 1:
600 |                 line_tokens[i] = '*'
601 | 
602 |         return ''.join(line_tokens).rstrip() + '\n'
603 | 
604 |     def discover_template(self, print_clusters=False, print_templates=False):
605 |         """
606 |         Abstract the template representation from each of the clusters.
607 | 
608 |         Arguments:
609 |             print_clusters: {bool}, whether to write the clusters into a file.
610 |             print_templates: {bool}, whether to write the templates into a file.
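
        An extracted template is the representative log of a cluster with the
        variable positions replaced by '*', e.g. (a made-up example, added for
        illustration): "sshd[*]: Accepted publickey for * from * port * ssh2".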
611 |         """
612 |         if os.path.isfile(self.cluster_file + '.pkl'):
613 |             print "%s.pkl exists, loading it...\n" %self.cluster_file
614 |             with open(self.cluster_file + '.pkl') as cluster_pkl_file:
615 |                 cluster_dict = pickle.load(cluster_pkl_file)
616 |         else:
617 |             # get the log cluster dictionary
618 |             print "%s.pkl does not exist, generating it...\n" %self.cluster_file
619 |             cluster_dict = self.log_clustering(print_clusters=print_clusters)
620 | 
621 |         print "\n |-Extracting templates..."
622 | 
623 |         # put each of the template representations into the template_dict
624 |         for i in cluster_dict:
625 |             self.template_dict.setdefault(i, self.log_template(cluster_dict[i]))
626 | 
627 |         # print the template representations
628 |         if print_templates:
629 |             print " |-Write the templates into %s ..." %self.template_file
630 |             with open(self.template_file, 'w') as template_file:
631 |                 for i in self.template_dict:
632 |                     template_file.write(str(i) + '\n')
633 |                     for item in self.template_dict[i]:
634 |                         template_file.write(item)
635 |             print " |-Write the templates into %s.pkl ..." %self.template_file
636 |             with open(self.template_file + '.pkl', 'w') as template_pkl_file:
637 |                 pickle.dump(self.template_dict, template_pkl_file)
638 | 
639 |         print " |-Number of templates extracted: %d" %len(self.template_dict)
640 | 
641 |     def generate_search_dict(self, print_search_dict=False,
642 |                              print_clusters=False, print_templates=False):
643 |         """
644 |         Generate the hash table for matching new logs and assigning their IDs.
645 | 
646 |         Arguments:
647 |             print_search_dict: {bool}, whether to write the search dictionaries
648 |                 into a file.
649 |             print_clusters: {bool}, whether to write the clusters into a file.
650 |             print_templates: {bool}, whether to write the templates into a file.
651 |         """
652 | 
653 |         # Generate the template dictionary if it is empty.
654 |         if os.path.isfile(self.template_file + '.pkl'):
655 |             print "%s.pkl exists, loading it...\n" %self.template_file
656 |             with open(self.template_file + '.pkl') as template_pkl_file:
657 |                 self.template_dict = pickle.load(template_pkl_file)
658 |         else:
659 |             print "%s.pkl does not exist, generating it...\n" %self.template_file
660 |             self.discover_template(print_clusters=print_clusters,
661 |                                    print_templates=print_templates)
662 | 
663 |         print "\n |-Generating the search dictionary..."
664 | 
665 |         # regex for extracting command
666 |         cmd_pattern = re.compile(self.cmd_regex)
667 | 
668 |         # go through each of the log templates in the dictionary
669 |         # and put their IDs into the search dictionary according to
670 |         # the command and tokenized log length
671 |         for id_ in self.template_dict:
672 |             # get the template representation
673 |             template = self.template_dict[id_]
674 |             # print template
675 |             # get the command of this template
676 |             command = re.match(cmd_pattern, template).group(1)
677 | 
678 |             # get the token list of this template
679 |             template_tokens = self.tokenize(template)
680 |             # get the length of this template
681 |             length = len(template_tokens)
682 | 
683 |             self.search_dict.setdefault((command, length),
684 |                                         []).append(id_)
685 | 
686 |         # print the template search dictionary
687 |         if print_search_dict:
688 |             print (" |-Writing the search dictionary into %s ..."
689 |                    %self.search_dict_file)
690 |             with open(self.search_dict_file, 'w') as search_dict_file:
691 |                 for i in self.search_dict:
692 |                     search_dict_file.write('\n' + str(i) + '\n')
693 |                     for item in self.search_dict[i]:
694 |                         search_dict_file.write(str(item) + ' ')
695 |             print (" |-Writing the search dictionary into %s.pkl ..."
696 |                    %self.search_dict_file)
697 |             with open(self.search_dict_file + '.pkl',
698 |                       'w') as search_dict_pkl_file:
699 |                 pickle.dump(self.search_dict, search_dict_pkl_file)
700 | 
701 |         print " |-Template search dictionary generated!\n"
702 | 
703 |     def match_log(self, added_line, seq_file):
704 |         """
705 |         Match this log against the templates in search_dict.
706 | 
707 |         Arguments:
708 |             added_line: {string}, a line of log to be matched.
709 |             seq_file: {file}, output sequence file.
710 |         """
711 |         # match flag
712 |         is_matched = False
713 | 
714 |         # regex for extracting command
715 |         cmd_pattern = re.compile(self.cmd_regex)
716 |         # extract command
717 |         command = re.match(cmd_pattern,
718 |                            added_line[self.ignored_chars:]).group(1)
719 | 
720 |         # tokenize the log message
721 |         line_tokens = self.tokenize(added_line[self.ignored_chars:])
722 |         # convert numbers, hex values, IP addresses and PCI addresses to *
723 |         line_tokens = to_wildcard(line_tokens)
724 |         # get the length of this token list
725 |         length = len(line_tokens)
726 | 
727 |         # find this log in the search_dict
728 |         if (command, length) in self.search_dict:
729 |             matched_list = self.search_dict[(command, length)]
730 | 
731 |             # compare the current new log to all templates
732 |             # in the selected matched_list
733 |             for id_ in matched_list:
734 |                 # each of the tokenized templates to be compared
735 |                 to_be_compared = self.tokenize(self.template_dict[id_])
736 | 
737 |                 # put False into the compare result if two tokens at a
738 |                 # certain position do not match
739 |                 compare_result = [False for a, b in zip(to_be_compared,
740 |                                                         line_tokens)
741 |                                   if not compare_two_tokens(a, b)]
742 | 
743 |                 # if compare_result is empty, that means they are matched
744 |                 if not compare_result:
745 |                     is_matched = True
746 | 
747 |                 # if they are matched, output the template ID
748 |                 if is_matched:
749 |                     seq_file.write(str(id_) + '\n')
750 |                     # print str(current_num) + ' True'
751 |                     break
752 | 
753 |         # if there is no match, this log is a new one;
754 |         # output the template ID '0', which means 'unknown'
755 |         if not is_matched:
756 |             seq_file.write('0\n')
757 |             # print str(current_num) + ' False'
758 | 
759 | 
760 |     def generate_sequence(self, new_logfile_path, print_search_dict=False,
761 |                           print_clusters=False, print_templates=False):
762 |         """
763 |         Generate the log sequence based on previously generated templates and
764 |         new input log files.
765 |         Either: find the correct ID for each of the new logs;
766 |         Or: put the un-matched logs into cluster '0', representing 'unknown'.
767 | 
768 |         Arguments:
769 |             new_logfile_path: {string}, the path of log files to be matched for
770 |                 generating sequences.
771 |             print_search_dict: {bool}, whether to write the search dictionaries
772 |                 into a file.
773 |             print_clusters: {bool}, whether to write the clusters into a file.
774 |             print_templates: {bool}, whether to write the templates into a file.
775 |         """
776 |         # Generate the template dictionary if it is empty.
777 |         if os.path.isfile(self.template_file + '.pkl'):
778 |             print "%s.pkl exists, loading it...\n" %self.template_file
779 |             with open(self.template_file + '.pkl') as template_pkl_file:
780 |                 self.template_dict = pickle.load(template_pkl_file)
781 |         else:
782 |             print "%s.pkl does not exist, generating it...\n" %self.template_file
783 |             self.discover_template(print_clusters=print_clusters,
784 |                                    print_templates=print_templates)
785 | 
786 |         # Generate the search_dict if it is empty.
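        # Added note (illustration): search_dict maps (command, token count) to a
        # list of template IDs, e.g. a made-up entry ('sshd', 14) -> [3, 17, 42];
        # match_log() only compares a new log against templates under its own key.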
787 |         if os.path.isfile(self.search_dict_file + '.pkl'):
788 |             print "%s.pkl exists, loading it...\n" %self.search_dict_file
789 |             with open(self.search_dict_file + '.pkl') as search_dict_pkl_file:
790 |                 self.search_dict = pickle.load(search_dict_pkl_file)
791 |         else:
792 |             print "%s.pkl does not exist, generating it...\n" %self.search_dict_file
793 |             self.generate_search_dict(print_search_dict=print_search_dict,
794 |                                       print_clusters=print_clusters,
795 |                                       print_templates=print_templates)
796 | 
797 |         # current_num = 0
798 | 
799 |         print "Start to generate the sequence."
800 |         print "Writing the sequence into %s ..." %self.seqfile_path
801 |         check_directory(self.seqfile_path)
802 | 
803 |         # log files
804 |         new_logfiles = glob.glob(new_logfile_path)
805 | 
806 |         for new_logfile in new_logfiles:
807 |             # match each new log file and write its ID sequence
808 |             with open(new_logfile, 'r') as new_file:
809 |                 seqfile_path = self.seqfile_path + new_logfile.split("/")[-1]
810 |                 with open(seqfile_path, 'w') as seq_file:
811 |                     print " " + seqfile_path
812 |                     added_line = new_file.readline()
813 |                     # current_num = current_num + 1
814 | 
815 |                     # the real first line is the first log that starts with a time-stamp
816 |                     while not is_timestamp(added_line[:16]):
817 |                         added_line = new_file.readline()
818 |                         # current_num = current_num + 1
819 | 
820 |                     # read the following lines
821 |                     for line in new_file:
822 |                         # current_num = current_num + 1
823 |                         # print current_num
824 | 
825 |                         # if the current line does not start with a time-stamp,
826 |                         # it is appended to the previous nearest log that does
827 |                         # start with a time-stamp
828 |                         if not is_time(line[:16]):
829 |                             added_line = added_line.rstrip() + ' | ' + line
830 |                             continue
831 |                         else:
832 |                             # match the log with search_dict
833 |                             self.match_log(added_line, seq_file)
834 |                             # update added_line
835 |                             added_line = line
836 | 
837 |                     # Take the last line into account
838 |                     self.match_log(added_line, seq_file)
839 | 
840 |         print "Sequence generated!\n"
841 | 
842 |     def generate_histogram(self):
843 |         """
844 |         Calculate the histogram for each of the generated sequence files.
845 |         """
846 |         print "Generate histogram...\n"
847 |         # sequence files
848 |         seq_files = glob.glob(self.seqfile_path + "*")
849 | 
850 |         for seq_file in seq_files:
851 |             print " " + seq_file
852 |             with open(seq_file, 'r') as seqfile:
853 |                 sequence = [int(id_) for id_ in seqfile]
854 |                 hist, bin_edges = np.histogram(sequence,  # pylint: disable=W0612
855 |                                                bins=range(max(sequence)))
856 |                 plt.hist(hist, bins=range(max(sequence)))
857 |                 plt.xlim(0, 100)
858 |                 plt.ylim(0, 600)
859 |                 plt.savefig(seq_file.split("/")[-1])
860 |                 plt.clf()
861 |                 plt.cla()
862 | 
863 |     def plot_dots(self):
864 |         """
865 |         Plot a curve for each of the generated sequence files.
866 | """ 867 | print "Plot curve...\n" 868 | # sequence files 869 | seq_files = glob.glob(self.seqfile_path + "*") 870 | 871 | for seq_file in seq_files: 872 | print " " + seq_file 873 | with open(seq_file, 'r') as seqfile: 874 | sequence = [int(id_) for id_ in seqfile] 875 | # t = np.arange(0, len(sequence), 1) 876 | plt.plot(sequence, 'r*') 877 | plt.xlim(0, 50000) 878 | plt.ylim(0, 3500) 879 | plt.savefig("curve_" + seq_file.split("/")[-1]) 880 | plt.clf() 881 | plt.cla() 882 | 883 | def main(): 884 | """ 885 | Main function 886 | """ 887 | print "\nStart...\n" 888 | 889 | start_time = time.time() 890 | 891 | logfile_path = "./log-normal/*" 892 | new_logfile_path = "./log-big/*" 893 | extractor = LogTemplateExtractor(logfile_path) 894 | extractor.set_template_file("./template") 895 | extractor.set_cluster_file("./clusters") 896 | extractor.set_seqfile_path("./sequences-big/") 897 | extractor.set_search_dict_file("./search_dict") 898 | 899 | extractor.generate_sequence(new_logfile_path, print_search_dict=True, 900 | print_clusters=True, print_templates=True) 901 | 902 | # extractor.generate_histogram() 903 | 904 | # extractor.plot_dots() 905 | 906 | stop_time = time.time() 907 | 908 | print "Stop...\n" 909 | 910 | print "--- %s seconds ---\n" % (stop_time - start_time) 911 | 912 | 913 | # ---------------------------- For debugging ---------------------------- # 914 | 915 | 916 | # ---------------------------- For debugging ---------------------------- # 917 | 918 | 919 | 920 | if __name__ == "__main__": 921 | main() 922 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | editdistance 2 | python-dateutil 3 | numpy 4 | --------------------------------------------------------------------------------