├── test
    ├── train.log
    ├── README.md
    ├── test.log
    ├── slm_match.py
    └── slm_create.py
├── README.md
├── main.py
├── LICENSE
├── syslogparser.py
└── spell.py


/test/train.log:
--------------------------------------------------------------------------------
1 | this is a pen
2 | this is the pen
3 | this is a pen
4 | i am gun
5 | i am bebe
6 | i am gun and bebe
7 | i am a and b
8 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # pyspell
2 | 
3 | A Python log parser based on the paper "Spell: Streaming Parsing of System Event Logs".
4 | 
5 | ```
6 | $ cat hoge.log | python main.py
7 | ```
8 | 


--------------------------------------------------------------------------------
/test/README.md:
--------------------------------------------------------------------------------
 1 | # pyspell
 2 | 
 3 | Sample usage: build a template map from train.log, then match test.log against it.
 4 | 
 5 | ```
 6 | $ cat train.log | python slm_create.py
 7 | $ ls -al slm.pickle
 8 | $ cat test.log | python slm_match.py
 9 | ```
10 | 


--------------------------------------------------------------------------------
/test/test.log:
--------------------------------------------------------------------------------
1 | this is test_a pen
2 | this is test_the pen
3 | this is test_a pen
4 | i am test_gun
5 | i am test_bebe
6 | i am test_gun and test_bebe
7 | i am test_a and test_b
8 | 


--------------------------------------------------------------------------------
/test/slm_match.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | 
 3 | import sys
 4 | sys.path.append("../")
 5 | 
 6 | import spell as s
 7 | 
 8 | if __name__ == '__main__':
 9 |     slm = s.load('slm.pickle')
10 |     #slm.dump()
11 |     for i in sys.stdin.readlines():
12 |         sub = i.strip('\n')
13 |         obj = slm.match(sub)
14 |         print(obj.get_id(), obj.param(sub))
15 | 
16 | 


--------------------------------------------------------------------------------
/main.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | 
 3 | import sys
 4 | import spell as s
 5 | 
 6 | if __name__ == '__main__':
 7 |     slm = s.lcsmap('[\\s]+')
 8 |     #s.save('test.pickle', slm)
 9 |     #slm = s.load('test.pickle')
10 |     for i in sys.stdin.readlines():
11 |         sub = i.strip('\n')
12 |         obj = slm.insert(sub)
13 |         print(obj.get_id(), obj.param(sub))
14 | 
15 | #print(slm.dump())
16 | 


--------------------------------------------------------------------------------
/test/slm_create.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | 
 3 | import sys
 4 | sys.path.append("../")
 5 | 
 6 | import spell as s
 7 | 
 8 | 
 9 | if __name__ == '__main__':
10 |     slm = s.lcsmap('[\\s]+')
11 |     for i in sys.stdin.readlines():
12 |         sub = i.strip('\n')
13 |         obj = slm.insert(sub)
14 |         #print(obj.get_id(), obj.param(sub))
15 |     s.save('slm.pickle', slm)
16 | 
17 | slm.dump()
18 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
 1 | BSD 3-Clause License
 2 | 
 3 | Copyright (c) 2018, inoue.tomoya
 4 | All rights reserved.
 5 | 
 6 | Redistribution and use in source and binary forms, with or without
 7 | modification, are permitted provided that the following conditions are met:
 8 | 
 9 | * Redistributions of source code must retain the above copyright notice, this
10 |   list of conditions and the following disclaimer.
11 | 
12 | * Redistributions in binary form must reproduce the above copyright notice,
13 |   this list of conditions and the following disclaimer in the documentation
14 |   and/or other materials provided with the distribution.
15 | 
16 | * Neither the name of the copyright holder nor the names of its
17 |   contributors may be used to endorse or promote products derived from
18 |   this software without specific prior written permission.
19 | 
20 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
21 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
22 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
23 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
24 | FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
25 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
26 | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
27 | CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
28 | OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
29 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
30 | 


--------------------------------------------------------------------------------
/syslogparser.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | 
 3 | import sys
 4 | import pyparsing
 5 | 
 6 | from pyparsing import Word
 7 | from pyparsing import alphas
 8 | from pyparsing import Suppress
 9 | from pyparsing import Combine
10 | from pyparsing import nums
11 | from pyparsing import string
12 | from pyparsing import Optional
13 | from pyparsing import Regex
14 | #from pyparsing import Literal
15 | #from pyparsing import delimitedList
16 | 
17 | from time import strftime
18 | 
19 | 
class syslogparser(object):
    """Split a syslog-style line into timestamp, hostname, appname and message.

    The grammar is built once in the constructor from pyparsing elements and
    reused for every call to ``parse``.
    """

    def __init__(self):
        # Timestamp: three-letter month (capital + lowercase), day, HH:MM:SS,
        # combined into a single token.
        month_name = Word(string.ascii_uppercase, string.ascii_lowercase, exact=3)
        day_of_month = Word(nums)
        clock = Combine(Word(nums) + ":" + Word(nums) + ":" + Word(nums))
        timestamp = Combine(month_name + " " + day_of_month + " " + clock)

        # Hostname: letters, digits, underscore, dash and dot.
        hostname = Word(alphas + nums + "_-.")

        # Application name. Brackets are part of the allowed characters, so a
        # "[pid]" suffix stays inside the appname token rather than being
        # split out. An optional " (word" continuation is also accepted.
        appword = Word(alphas + nums + "/-_.()[]")
        appname = Combine(appword + Optional(" (" + appword))

        # Message: everything after the colon; the colon itself is suppressed.
        message = Combine(Suppress(":") + Regex(".*"))

        self._pattern = timestamp + hostname + appname + message

    def parse(self, line):
        """Parse one line and return its four fields as a dict.

        The keys are ``timestamp``, ``hostname``, ``appname`` and ``message``,
        taken from the first four parse results in that order. Whatever
        ``parseString`` raises for a non-matching line propagates to the
        caller.
        """
        parsed = self._pattern.parseString(line)
        return {
            "timestamp": parsed[0],
            "hostname": parsed[1],
            "appname": parsed[2],
            "message": parsed[3],
        }
56 | 
57 | 
def main():
    """Parse every line from stdin and print the resulting field dict."""
    parser = syslogparser()

    # Iterate the stream directly so each line is parsed and printed as it
    # arrives instead of buffering all of stdin first.
    for line in sys.stdin:
        print(parser.parse(line.strip('\n')))


if __name__ == "__main__":
    main()
68 | 
69 | 
70 | 


--------------------------------------------------------------------------------
/spell.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python
  2 | # -*- coding: utf-8 -*-
  3 | 
  4 | import re
  5 | import json
  6 | import pickle
  7 | 
  8 | class lcsobj():
  9 | 
 10 |     def __init__(self, objid, seq, lineid, refmt):
 11 |         self._refmt = refmt
 12 |         if isinstance(seq, str) == True:
 13 |             self._lcsseq = re.split(self._refmt, seq.lstrip().rstrip())
 14 |         else:
 15 |             self._lcsseq = seq
 16 |         self._lineids = [lineid]
 17 |         self._pos = []
 18 |         self._sep = "	"
 19 |         self._id = objid
 20 |         return
 21 | 
 22 |     def getlcs(self, seq):
 23 |         if isinstance(seq, str) == True:
 24 |             seq = re.split(self._refmt, seq.lstrip().rstrip())
 25 |         count = 0
 26 |         lastmatch = -1
 27 |         for i in range(len(self._lcsseq)):
 28 |             #if self._lcsseq[i] == '*':
 29 |             if self._ispos(i) == True:
 30 |                 continue
 31 |             for j in range(lastmatch+1, len(seq)):
 32 |                 if self._lcsseq[i] == seq[j]:
 33 |                     lastmatch = j
 34 |                     count += 1
 35 |                     break
 36 |         return count
 37 | 
 38 |     def insert(self, seq, lineid):
 39 |         if isinstance(seq, str) == True:
 40 |             seq = re.split(self._refmt, seq.lstrip().rstrip())
 41 |         self._lineids.append(lineid)
 42 |         temp = ""
 43 |         lastmatch = -1
 44 |         placeholder = False
 45 | 
 46 |         for i in range(len(self._lcsseq)):
 47 |             #if self._lcsseq[i] == '*':
 48 |             if self._ispos(i) == True:
 49 |                 if not placeholder:
 50 |                     temp = temp + "* "
 51 |                 placeholder = True
 52 |                 continue
 53 |             for j in range(lastmatch+1, len(seq)):
 54 |                 if self._lcsseq[i] == seq[j]:
 55 |                     placeholder = False
 56 |                     temp = temp + self._lcsseq[i] + " "
 57 |                     lastmatch = j
 58 |                     break
 59 |                 elif not placeholder:
 60 |                     temp = temp + "* "
 61 |                     placeholder = True
 62 |         temp = temp.lstrip().rstrip()
 63 |         self._lcsseq = re.split(" ", temp)
 64 | 
 65 |         self._pos = self._get_pos()
 66 |         self._sep = self._get_sep()
 67 | 
 68 |     def tojson(self):
 69 |         temp = ""
 70 |         for i in self._lcsseq:
 71 |             temp = temp + i + " "
 72 |         ret = {}
 73 |         ret["lcsseq"] = temp
 74 |         ret["lineids"] = self._lineids
 75 |         ret["postion"] = self._pos
 76 |         return json.dumps(ret)
 77 | 
 78 |     def length(self):
 79 |         return len(self._lcsseq)
 80 | 
 81 |     def param(self, seq):
 82 |         if isinstance(seq, str) == True:
 83 |             seq = re.split(self._refmt, seq.lstrip().rstrip())
 84 | 
 85 |         j = 0
 86 |         ret = []
 87 |         for i in range(len(self._lcsseq)):
 88 |             slot = []
 89 |             if self._ispos(i) == True:
 90 |                 while j < len(seq):
 91 |                     if i != len(self._lcsseq)-1 and self._lcsseq[i+1] == seq[j]:
 92 |                         break
 93 |                     else:
 94 |                         slot.append(seq[j])
 95 |                     j+=1
 96 |                 ret.append(slot)
 97 |             elif self._lcsseq[i] != seq[j]:
 98 |                 return None
 99 |             else:
100 |                 j += 1
101 | 
102 |         if j != len(seq):
103 |             return None
104 |         else:
105 |             return ret
106 | 
107 |     def re_param(self, seq):
108 |         if isinstance(seq, list) == True:
109 |             seq = ' '.join(seq)
110 |         seq = seq.lstrip().rstrip()
111 | 
112 |         ret = []
113 |         print(self._sep)
114 |         print(seq)
115 |         p = re.split(self._sep, seq)
116 |         for i in p:
117 |             if len(i) != 0:
118 |                 ret.append(re.split(self._refmt, i.lstrip().rstrip()))
119 |         if len(ret) == len(self._pos):
120 |             return ret
121 |         else:
122 |             return None
123 | 
124 | 
125 | 
126 |     def _ispos(self, idx):
127 |         for i in self._pos:
128 |             if i == idx:
129 |                 return True
130 |         return False
131 | 
132 |     def _tcat(self, seq, s, e):
133 |         sub = ''
134 |         for i in range(s, e + 1):
135 |             sub += seq[i] + " "
136 |         return sub.rstrip()
137 | 
138 |     def _get_sep(self):
139 |         sep_token = []
140 |         s = 0
141 |         e = 0
142 |         for i in range(len(self._lcsseq)):
143 |             if self._ispos(i) == True:
144 |                 if s != e:
145 |                     sep_token.append(self._tcat(self._lcsseq, s, e))
146 |                 s = i + 1
147 |                 e = i + 1
148 |             else:
149 |                 e = i
150 |             if e == len(self._lcsseq) - 1:
151 |                 sep_token.append(self._tcat(self._lcsseq, s, e))
152 |                 break
153 | 
154 |         ret = ""
155 |         for i in range(len(sep_token)):
156 |             if i == len(sep_token)-1:
157 |                 ret += sep_token[i]
158 |             else:
159 |                 ret += sep_token[i] + '|'
160 |         return ret
161 | 
162 |     def _get_pos(self):
163 |         pos = []
164 |         for i in range(len(self._lcsseq)):
165 |             if self._lcsseq[i] == '*':
166 |                 pos.append(i)
167 |         return pos
168 | 
169 |     def get_id(self):
170 |         return self._id
171 | 
class lcsmap():
    """Collection of log templates (``lcsobj``) learned from inserted lines.

    Lines are split into tokens with the ``refmt`` regular expression. Each
    inserted line is either merged into the best matching template or
    becomes a new template of its own.
    """

    def __init__(self, refmt):
        """Create an empty map that tokenises lines with ``refmt``."""
        self._refmt = refmt
        self._lcsobjs = []
        self._lineid = 0  # incremented once per inserted line
        self._id = 0      # id handed to the next new template

    def insert(self, entry):
        """Add one raw line and return the template it now belongs to.

        The line is merged into the template chosen by ``match``; when there
        is none, a new template is created from it.
        """
        seq = re.split(self._refmt, entry.strip())
        obj = self.match(seq)
        self._lineid += 1
        if obj is None:
            obj = lcsobj(self._id, seq, self._lineid, self._refmt)
            self._lcsobjs.append(obj)
            self._id += 1
        else:
            obj.insert(seq, self._lineid)
        return obj

    def match(self, seq):
        """Return the template sharing the most tokens with ``seq``, or None.

        ``seq`` may be a string or a token list. Templates shorter than half
        or longer than twice the line length are skipped. A template
        qualifies when its ``getlcs`` score is at least half the line length;
        among those the highest score wins, the earliest template on ties.
        """
        if isinstance(seq, str):
            seq = re.split(self._refmt, seq.strip())
        bestmatch = None
        bestmatch_len = 0
        seqlen = len(seq)
        for obj in self._lcsobjs:
            objlen = obj.length()
            if objlen < seqlen / 2 or objlen > seqlen * 2:
                continue

            score = obj.getlcs(seq)
            if score >= seqlen / 2 and score > bestmatch_len:
                bestmatch = obj
                bestmatch_len = score
        return bestmatch

    def objat(self, idx):
        """Return the template stored at list index ``idx``."""
        return self._lcsobjs[idx]

    def size(self):
        """Return the number of templates."""
        return len(self._lcsobjs)

    def dump(self):
        """Print every template as ``<index> <json>``, one per line."""
        for count, obj in enumerate(self._lcsobjs):
            print(count, obj.tojson())
222 | 
223 | 
def save(filename, spell_lcsmap):
    """Pickle ``spell_lcsmap`` to ``filename``.

    Nothing is written unless the object is an ``lcsmap`` (subclasses are
    accepted); in that case a notice is printed when ``__debug__`` is set.
    """
    if isinstance(spell_lcsmap, lcsmap):
        with open(filename, 'wb') as f:
            pickle.dump(spell_lcsmap, f)
    elif __debug__:
        print("%s isnt slm object"%filename)
231 | 
def load(filename):
    """Unpickle an ``lcsmap`` from ``filename``.

    Returns the loaded object, or None (with a notice when ``__debug__`` is
    set) if the file holds something other than an ``lcsmap``. Errors from
    opening or unpickling the file propagate to the caller.
    """
    with open(filename, 'rb') as f:
        # SECURITY: unpickling can execute arbitrary code, so only load files
        # that come from a trusted source (e.g. ones written by save()).
        slm = pickle.load(f)
    if isinstance(slm, lcsmap):
        return slm
    if __debug__:
        print("%s isnt slm object"%filename)
    return None
241 | 


--------------------------------------------------------------------------------