├── test ├── train.log ├── README.md ├── test.log ├── slm_match.py └── slm_create.py ├── README.md ├── main.py ├── LICENSE ├── syslogparser.py └── spell.py /test/train.log: -------------------------------------------------------------------------------- 1 | this is a pen 2 | this is the pen 3 | this is a pen 4 | i am gun 5 | i am bebe 6 | i am gun and bebe 7 | i am a and b 8 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # pyspell 2 | 3 | python log parser using "Spell: Streaming Parsing of System Event Logs" 4 | 5 | ``` 6 | $ cat hoge.log | python main.py 7 | ``` 8 | -------------------------------------------------------------------------------- /test/README.md: -------------------------------------------------------------------------------- 1 | # pyspell 2 | 3 | sample 4 | 5 | ``` 6 | $ cat train.log | python slm_create.py 7 | $ ls -al slm.pickle 8 | $ cat test.log | python slm_match.py 9 | ``` 10 | -------------------------------------------------------------------------------- /test/test.log: -------------------------------------------------------------------------------- 1 | this is test_a pen 2 | this is test_the pen 3 | this is test_a pen 4 | i am test_gun 5 | i am test_bebe 6 | i am test_gun and test_bebe 7 | i am test_a and test_b 8 | -------------------------------------------------------------------------------- /test/slm_match.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import sys 4 | sys.path.append("../") 5 | 6 | import spell as s 7 | 8 | if __name__ == '__main__': 9 | slm = s.load('slm.pickle') 10 | #slm.dump() 11 | for i in sys.stdin.readlines(): 12 | sub = i.strip('\n') 13 | obj = slm.match(sub) 14 | print(obj.get_id(), obj.param(sub)) 15 | 16 | -------------------------------------------------------------------------------- /main.py: 
-------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import sys 4 | import spell as s 5 | 6 | if __name__ == '__main__': 7 | slm = s.lcsmap('[\\s]+') 8 | #s.save('test.pickle', slm) 9 | #slm = s.load('test.pickle') 10 | for i in sys.stdin.readlines(): 11 | sub = i.strip('\n') 12 | obj = slm.insert(sub) 13 | print(obj.get_id(), obj.param(sub)) 14 | 15 | #print(slm.dump()) 16 | -------------------------------------------------------------------------------- /test/slm_create.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import sys 4 | sys.path.append("../") 5 | 6 | import spell as s 7 | 8 | 9 | if __name__ == '__main__': 10 | slm = s.lcsmap('[\\s]+') 11 | for i in sys.stdin.readlines(): 12 | sub = i.strip('\n') 13 | obj = slm.insert(sub) 14 | #print(obj.get_id(), obj.param(sub)) 15 | s.save('slm.pickle', slm) 16 | 17 | slm.dump() 18 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | BSD 3-Clause License 2 | 3 | Copyright (c) 2018, inoue.tomoya 4 | All rights reserved. 5 | 6 | Redistribution and use in source and binary forms, with or without 7 | modification, are permitted provided that the following conditions are met: 8 | 9 | * Redistributions of source code must retain the above copyright notice, this 10 | list of conditions and the following disclaimer. 11 | 12 | * Redistributions in binary form must reproduce the above copyright notice, 13 | this list of conditions and the following disclaimer in the documentation 14 | and/or other materials provided with the distribution. 15 | 16 | * Neither the name of the copyright holder nor the names of its 17 | contributors may be used to endorse or promote products derived from 18 | this software without specific prior written permission. 
19 | 20 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 21 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 22 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 23 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE 24 | FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 25 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR 26 | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 27 | CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 28 | OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 29 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 30 | -------------------------------------------------------------------------------- /syslogparser.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import sys 4 | import pyparsing 5 | 6 | from pyparsing import Word 7 | from pyparsing import alphas 8 | from pyparsing import Suppress 9 | from pyparsing import Combine 10 | from pyparsing import nums 11 | from pyparsing import string 12 | from pyparsing import Optional 13 | from pyparsing import Regex 14 | #from pyparsing import Literal 15 | #from pyparsing import delimitedList 16 | 17 | from time import strftime 18 | 19 | 20 | class syslogparser(object): 21 | def __init__(self): 22 | # timestamp 23 | month = Word(string.ascii_uppercase, string.ascii_lowercase, exact=3) 24 | day = Word(nums) 25 | hour = Combine(Word(nums) + ":" + Word(nums) + ":" + Word(nums)) 26 | timestamp = Combine(month + " " + day + " " + hour) 27 | 28 | # hostname 29 | hostname = Word(alphas + nums + "_" + "-" + ".") 30 | 31 | # appname 32 | appword = Word(alphas + nums + "/" + "-" + "_" + "." 
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""Streaming log-template extraction based on a greedy LCS merge.

``lcsmap`` keeps a list of ``lcsobj`` templates.  Every inserted log line is
either merged into the best-matching template (tokens that differ become the
placeholder ``*``) or starts a new template.
"""

import re
import json
import pickle

# Token that stands for "variable part" inside a template.
_PLACEHOLDER = '*'


def _tokenize(refmt, seq):
    """Split *seq* on the regex *refmt* if it is a string.

    Surrounding whitespace is stripped before splitting.  Anything that is
    not a ``str`` is assumed to be tokenised already and is returned as is.
    """
    if isinstance(seq, str):
        return re.split(refmt, seq.strip())
    return seq


class lcsobj(object):
    """One log template: a token list in which ``*`` marks variable parts."""

    def __init__(self, objid, seq, lineid, refmt):
        """Create a template from its first line.

        objid  -- identifier returned by ``get_id()``
        seq    -- log line (``str``) or list of tokens
        lineid -- id of the line that created this template
        refmt  -- regex used to split lines into tokens
        """
        self._refmt = refmt
        self._lcsseq = _tokenize(refmt, seq)
        self._lineids = [lineid]
        # Placeholder indexes and separator regex are only (re)computed by
        # insert(); a fresh template has no placeholders yet.
        self._pos = []
        self._sep = " "
        self._id = objid

    def getlcs(self, seq):
        """Return how many constant template tokens occur, in order, in *seq*.

        The scan is greedy: each template token is matched against the first
        equal token located after the previous match.  Placeholder positions
        are skipped.
        """
        seq = _tokenize(self._refmt, seq)
        count = 0
        lastmatch = -1
        for i, token in enumerate(self._lcsseq):
            if self._ispos(i):
                continue
            for j in range(lastmatch + 1, len(seq)):
                if token == seq[j]:
                    lastmatch = j
                    count += 1
                    break
        return count

    def insert(self, seq, lineid):
        """Merge the line *seq* into this template and record *lineid*.

        Template tokens found (in order) in *seq* are kept; everything else
        collapses into a single ``*`` per gap.  A gap is also recorded when
        the line runs out before the template does, or when the line has
        extra tokens after the last matched one, so that the merged template
        still covers both the old template and the new line.
        """
        seq = _tokenize(self._refmt, seq)
        self._lineids.append(lineid)

        merged = []
        lastmatch = -1

        def mark_gap():
            # Never emit two placeholders in a row.
            if not merged or merged[-1] != _PLACEHOLDER:
                merged.append(_PLACEHOLDER)

        for i, token in enumerate(self._lcsseq):
            if self._ispos(i):
                mark_gap()
                continue
            try:
                hit = seq.index(token, lastmatch + 1)
            except ValueError:
                # Token does not occur in the rest of the line (this includes
                # the case where the line is already exhausted).
                mark_gap()
                continue
            if hit > lastmatch + 1:
                # The line has extra tokens in front of the match.
                mark_gap()
            merged.append(token)
            lastmatch = hit

        if lastmatch < len(seq) - 1:
            # The line has extra tokens after the last match.
            mark_gap()

        self._lcsseq = merged
        self._pos = self._get_pos()
        self._sep = self._get_sep()

    def tojson(self):
        """Return the template, its line ids and placeholder indexes as JSON.

        The template string keeps a trailing blank and the key is spelled
        ``"postion"``; both are kept as they are so existing consumers of the
        output keep working.
        """
        ret = {}
        ret["lcsseq"] = "".join(token + " " for token in self._lcsseq)
        ret["lineids"] = self._lineids
        ret["postion"] = self._pos
        return json.dumps(ret)

    def length(self):
        """Return the number of tokens (placeholders included)."""
        return len(self._lcsseq)

    def param(self, seq):
        """Extract the variable parts of *seq* according to this template.

        Returns one list of tokens per placeholder (possibly empty), or
        ``None`` when the line does not fit the template.
        """
        seq = _tokenize(self._refmt, seq)
        last = len(self._lcsseq) - 1
        j = 0
        ret = []
        for i, token in enumerate(self._lcsseq):
            if self._ispos(i):
                slot = []
                # Consume tokens until the next constant token shows up.
                while j < len(seq):
                    if i != last and self._lcsseq[i + 1] == seq[j]:
                        break
                    slot.append(seq[j])
                    j += 1
                ret.append(slot)
            elif j >= len(seq) or token != seq[j]:
                # Line too short, or constant token differs.
                return None
            else:
                j += 1

        if j != len(seq):
            return None
        return ret

    def re_param(self, seq):
        """Regex-based variant of ``param``.

        The line is split on the constant runs of the template (``_sep``);
        every non-blank piece is tokenised and returned.  Returns ``None``
        when the number of pieces differs from the number of placeholders.
        """
        if isinstance(seq, list):
            seq = ' '.join(seq)
        seq = seq.strip()

        if self._sep:
            # The look-arounds make separators match whole tokens only.
            pattern = r'(?<!\S)(?:' + self._sep + r')(?!\S)'
            chunks = re.split(pattern, seq)
        else:
            # No constant token at all: the whole line is one variable part.
            chunks = [seq]

        ret = []
        for chunk in chunks:
            chunk = chunk.strip()
            if chunk:
                ret.append(re.split(self._refmt, chunk))

        if len(ret) == len(self._pos):
            return ret
        return None

    def _ispos(self, idx):
        """Return True if template index *idx* is a placeholder."""
        return idx in self._pos

    def _tcat(self, seq, s, e):
        """Join ``seq[s]`` .. ``seq[e]`` (inclusive) with single blanks."""
        return " ".join(seq[s:e + 1])

    def _get_sep(self):
        """Build the regex alternation of all constant runs of the template.

        Each maximal run of non-placeholder tokens becomes one alternative;
        tokens are regex-escaped so they are matched literally.
        """
        runs = []
        current = []
        for i, token in enumerate(self._lcsseq):
            if self._ispos(i):
                if current:
                    runs.append(current)
                    current = []
            else:
                current.append(re.escape(token))
        if current:
            runs.append(current)
        return "|".join(" ".join(run) for run in runs)

    def _get_pos(self):
        """Return the indexes of all ``*`` tokens in the template."""
        return [i for i, token in enumerate(self._lcsseq)
                if token == _PLACEHOLDER]

    def get_id(self):
        """Return the identifier given at construction time."""
        return self._id


class lcsmap(object):
    """Collection of templates fed line by line."""

    def __init__(self, refmt):
        """*refmt* is the regex used to split log lines into tokens."""
        self._refmt = refmt
        self._lcsobjs = []
        self._lineid = 0
        self._id = 0

    def insert(self, entry):
        """Add the log line *entry* and return the template it belongs to.

        The line is merged into the best-matching template, or a new template
        is created when ``match`` finds none.
        """
        seq = _tokenize(self._refmt, entry)
        self._lineid += 1
        obj = self.match(seq)
        if obj is None:
            obj = lcsobj(self._id, seq, self._lineid, self._refmt)
            self._lcsobjs.append(obj)
            self._id += 1
        else:
            obj.insert(seq, self._lineid)
        return obj

    def match(self, seq):
        """Return the template sharing the most tokens with *seq*, or None.

        Templates whose length is less than half or more than twice the line
        length are skipped, and a template only qualifies when at least half
        of the line's tokens are matched.
        """
        seq = _tokenize(self._refmt, seq)
        bestmatch = None
        bestmatch_len = 0
        seqlen = len(seq)
        for obj in self._lcsobjs:
            objlen = obj.length()
            if objlen < seqlen / 2 or objlen > seqlen * 2:
                continue
            common = obj.getlcs(seq)
            if common >= seqlen / 2 and common > bestmatch_len:
                bestmatch = obj
                bestmatch_len = common
        return bestmatch

    def objat(self, idx):
        """Return the template stored at list index *idx*."""
        return self._lcsobjs[idx]

    def size(self):
        """Return the number of templates."""
        return len(self._lcsobjs)

    def dump(self):
        """Print every template as ``<index> <json>``."""
        for count, obj in enumerate(self._lcsobjs):
            print(count, obj.tojson())


def save(filename, spell_lcsmap):
    """Pickle *spell_lcsmap* to *filename*.

    Objects that are not an ``lcsmap`` are not written; a diagnostic is
    printed instead (unless Python runs with ``-O``).
    """
    if isinstance(spell_lcsmap, lcsmap):
        with open(filename, 'wb') as f:
            pickle.dump(spell_lcsmap, f)
    elif __debug__:
        print("%s isnt slm object" % filename)


def load(filename):
    """Load an ``lcsmap`` pickled by ``save``; return None for other objects.

    NOTE(security): unpickling can execute arbitrary code.  Only load files
    that come from a trusted source.
    """
    with open(filename, 'rb') as f:
        slm = pickle.load(f)
    if isinstance(slm, lcsmap):
        return slm
    if __debug__:
        print("%s isnt slm object" % filename)
    return None