├── GeoIPData
│   ├── GeoIPASNum.dat
│   ├── GeoLite2-City.mmdb
│   └── GeoLiteCity.dat
├── README.MD
├── common
│   ├── GeoIPUtils.py
│   ├── __init__.py
│   ├── convert.py
│   ├── logger.py
│   ├── rule.json
│   ├── threadpool.py
│   └── units.py
├── main.py
└── module
    ├── __init__.py
    ├── apache_parser.py
    ├── iis_parser.py
    ├── nginx_parser.py
    └── tomcat_parser.py

/GeoIPData/GeoIPASNum.dat:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/0xa-saline/Logpara/b0be8febb7a9287157a119d5eb7b1dc8bc988a29/GeoIPData/GeoIPASNum.dat
--------------------------------------------------------------------------------
/GeoIPData/GeoLite2-City.mmdb:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/0xa-saline/Logpara/b0be8febb7a9287157a119d5eb7b1dc8bc988a29/GeoIPData/GeoLite2-City.mmdb
--------------------------------------------------------------------------------
/GeoIPData/GeoLiteCity.dat:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/0xa-saline/Logpara/b0be8febb7a9287157a119d5eb7b1dc8bc988a29/GeoIPData/GeoLiteCity.dat
--------------------------------------------------------------------------------
/README.MD:
--------------------------------------------------------------------------------
1 | ## Logpara
2 | > A rough demo that parses and processes common web server logs.
3 | 
4 | [![Python 2.7](https://img.shields.io/badge/python-2.7-yellow.svg)](https://www.python.org/) [![License](https://img.shields.io/badge/license-GPLv2-red.svg)](https://github.com/0xa-saline/Logpara/blob/master/GPL-2.0)
5 | 
6 | #### Goals
7 | - Parse each requested URL and flag whether it matches common attack patterns
8 | - Enrich each visiting IP with longitude/latitude and its physical location
9 | - Parse each visiting User-Agent to identify the device, the browser family, and whether it is a crawler
10 | - Parse every log entry into storage for RELK processing
11 | 
12 | 
13 | ### TO DO
14 | - Process and visualize the logs stored in elasticsearch
15 | 
16 | 
17 | ### Usage
18 | - Before use, edit common/units.py
19 | ```
20 | redis_host = '192.168.87.222'
21 | redis_port = 6379
22 | redis_pass = 'cft67ygv'
23 | redis_db = 0
24 | redis_key = 'logstash:redis'
25 | ```
26 | 
27 | - Run
28 | 
29 | ```
30 | Usage: main.py --type IIS|Apache|Tomcat|Nginx --file file|directory
31 | 
32 | log parser
33 | 
34 | Options:
35 |   -h, --help   show this help message and exit
36 |   --type=TYPE  chose which log type
37 |   --file=FILE  chose file or directory
38 | ```
39 | 
40 | What is RELK
41 | ----
42 | * elasticsearch
43 | * logstash
44 | * kibana
45 | * redis
46 | 
47 | 
48 | RELK is simply the initials of the programs above; spelling them all out every time gets tiresome, so this shorthand is used instead.
49 | 
50 | Server requirements
51 | ----
52 | ```
53 | one machine running elk+redis, referred to below as the ELK machine
54 | ```
55 | 
56 | RELK machine setup
57 | ----
58 | 
59 | ##### Set a password for redis
60 | 
61 | ##### Edit the ElasticSearch config `sudo vim /etc/elasticsearch/elasticsearch.yml`, search for `network.host`, and change it as follows
62 | ```
63 | network.host: localhost
64 | ```
65 | 
66 | ##### Add a Logstash config `sudo vim /etc/logstash/conf.d/config.conf`
67 | ```
68 | input {
69 |     redis {
70 |         host => '127.0.0.1'
71 |         password => 'password'
72 |         data_type => 'list'
73 |         key => 'logstash:redis'
74 |     }
75 | }
76 | output {
77 |     elasticsearch { hosts => localhost }
78 |     stdout { codec => rubydebug }
79 | }
80 | ```
81 | 
82 | ##### Run Logstash
83 | ```
84 | sudo nohup /opt/logstash/bin/logstash -f /etc/logstash/conf.d/ &
85 | ```
86 | 
87 | ##### Edit the Kibana config `sudo vim /opt/kibana/config/kibana.yml`, search for `server.host`, and change it as follows:
88 | ```
89 | server.host: "0.0.0.0"
90 | ```
91 | 
92 | ##### Restart each of the services above after changing its configuration.
93 | 
94 | 
95 | Workflow
96 | 
97 | ## FAQ
98 | - Logs in non-standard formats are not recognized?
99 | That is inconvenient, but you will have to adjust the regular expressions yourself to match them.
100 | - No weblogic/oracle log module?
101 | There is no test environment for those at the moment.
102 | 
103 | 
104 | ## Disclaimer
105 | This log-analysis project is intended for learning and for auditing the security of your own systems only; any other use is prohibited.
106 | 
--------------------------------------------------------------------------------
/common/GeoIPUtils.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | #-*- coding:utf-8 -*-
3 | import pygeoip, geoip2.database, os, re
4 | 
5 | class GeoIPUtil():
6 |     def __init__(self):
7 |         self.geocity = pygeoip.GeoIP(os.path.join(os.path.split(os.path.realpath(__file__))[0], '../GeoIPData', 'GeoLiteCity.dat'))
8 |         self.geoasn = pygeoip.GeoIP(os.path.join(os.path.split(os.path.realpath(__file__))[0], '../GeoIPData', 'GeoIPASNum.dat'))
9 |         self.as_re = re.compile('AS(?P<num>\d+)(?: (?P<name>.+))?')
10 | 
11 |     def get_AS_info_by_ip(self, ip):
12 |         asn = self.geoasn.asn_by_addr(ip)
13 |         if asn != None:
14 |             (asnum, asname) = self.as_re.match(asn).groups()
15 |         else:
16 |             (asnum, asname) = (0, 'None')
17 |         return (asnum, asname)
18 | 
19 |     def get_lat_alt(self, ip):
20 |         loc = self.geocity.record_by_name(ip)
21 |         if loc is None:
22 |             return None
23 |         return [loc['longitude'], loc['latitude']]
24 | 
25 | class GeoIP2Util():
26 |     def __init__(self):
27 |         self.reader = geoip2.database.Reader(os.path.join(os.path.split(os.path.realpath(__file__))[0], '../GeoIPData', 'GeoLite2-City.mmdb'))
28 | 
29 |     def get_lat_alt(self, ip):
30 |         response = self.reader.city(ip)
31 |         if response is None:
32 |             return None
33 |         return [response.location.longitude, response.location.latitude]
34 | 
35 |     def get_ip_location(self, ip):
36 |         response = self.reader.city(ip)
37 | 
38 |         if response.country.names.has_key('zh-CN'):
39 |             country = response.country.names['zh-CN']
40 |         else:
41 |             country = response.country.name
42 |         if response.subdivisions.most_specific.names.has_key('zh-CN'):
43 |             subdivision = response.subdivisions.most_specific.names['zh-CN']
44 |         else:
45 |             subdivision = response.subdivisions.most_specific.name
46 |         if response.city.names.has_key('zh-CN'):
47 |             city = response.city.names['zh-CN']
48 |         else:
49 |             city = response.city.name
50 |         return (country, subdivision, city)
51 | 
52 | if __name__ == '__main__':
53 |     gi = GeoIPUtil()
54 |     longitude, latitude = gi.get_lat_alt('220.181.171.119')
55 |     print latitude, ",", longitude
56 |     gi = GeoIP2Util()
57 |     #longitude1,latitude1 = gi.get_lat_alt('220.181.171.119')
58 |     #print longitude1,latitude1
59 |     country, subdivision, city = gi.get_ip_location('220.181.171.119')
60 |     print country, subdivision, city
61 | 
--------------------------------------------------------------------------------
/common/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/0xa-saline/Logpara/b0be8febb7a9287157a119d5eb7b1dc8bc988a29/common/__init__.py
--------------------------------------------------------------------------------
/common/convert.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # from sqlmap
3 | try:
4 |     import cPickle as pickle
5 | except:
6 |     import pickle
7 | finally:
8 |     import pickle as picklePy
9 | import urllib
10 | import base64
11 | import json
12 | import re
13 | import StringIO
14 | import sys
15 | import subprocess
16 | 
17 | # System variables
18 | IS_WIN = subprocess.mswindows
19 | # Encoding used for Unicode data
20 | UNICODE_ENCODING = "utf8"
21 | 
22 | # Chars which can be used as failsafe values in case of too long URL encoding value
23 | URLENCODE_FAILSAFE_CHARS = '()|,'
24 | 
25 | # Table used for
Base64 conversion in WordPress hash cracking routine 26 | 27 | class PLACE: 28 | GET = "GET" 29 | POST = "POST" 30 | URI = "URI" 31 | COOKIE = "Cookie" 32 | USER_AGENT = "User-Agent" 33 | REFERER = "Referer" 34 | HOST = "Host" 35 | CUSTOM_POST = "(custom) POST" 36 | CUSTOM_HEADER = "(custom) HEADER" 37 | 38 | def base64decode(value): 39 | """ 40 | Decodes string value from Base64 to plain format 41 | 42 | >>> base64decode('Zm9vYmFy') 43 | 'foobar' 44 | """ 45 | 46 | return base64.b64decode(value) 47 | 48 | def base64encode(value): 49 | """ 50 | Encodes string value from plain to Base64 format 51 | 52 | >>> base64encode('foobar') 53 | 'Zm9vYmFy' 54 | """ 55 | 56 | return base64.b64encode(value) 57 | 58 | def base64pickle(value): 59 | """ 60 | Serializes (with pickle) and encodes to Base64 format supplied (binary) value 61 | 62 | >>> base64pickle('foobar') 63 | 'gAJVBmZvb2JhcnEBLg==' 64 | """ 65 | 66 | retVal = None 67 | 68 | try: 69 | retVal = base64encode(pickle.dumps(value, pickle.HIGHEST_PROTOCOL)) 70 | except: 71 | warnMsg = "problem occurred while serializing " 72 | warnMsg += "instance of a type '%s'" % type(value) 73 | singleTimeWarnMessage(warnMsg) 74 | 75 | try: 76 | retVal = base64encode(pickle.dumps(value)) 77 | except: 78 | retVal = base64encode(pickle.dumps(str(value), pickle.HIGHEST_PROTOCOL)) 79 | 80 | return retVal 81 | 82 | def base64unpickle(value, unsafe=False): 83 | """ 84 | Decodes value from Base64 to plain format and deserializes (with pickle) its content 85 | 86 | >>> base64unpickle('gAJVBmZvb2JhcnEBLg==') 87 | 'foobar' 88 | """ 89 | 90 | retVal = None 91 | 92 | def _(self): 93 | if len(self.stack) > 1: 94 | func = self.stack[-2] 95 | if func not in PICKLE_REDUCE_WHITELIST: 96 | raise Exception, "abusing reduce() is bad, Mkay!" 
97 | self.load_reduce() 98 | 99 | def loads(str): 100 | f = StringIO.StringIO(str) 101 | if unsafe: 102 | unpickler = picklePy.Unpickler(f) 103 | unpickler.dispatch[picklePy.REDUCE] = _ 104 | else: 105 | unpickler = pickle.Unpickler(f) 106 | return unpickler.load() 107 | 108 | try: 109 | retVal = loads(base64decode(value)) 110 | except TypeError: 111 | retVal = loads(base64decode(bytes(value))) 112 | 113 | return retVal 114 | 115 | def hexdecode(value): 116 | """ 117 | Decodes string value from hex to plain format 118 | 119 | >>> hexdecode('666f6f626172') 120 | 'foobar' 121 | """ 122 | 123 | value = value.lower() 124 | return (value[2:] if value.startswith("0x") else value).decode("hex") 125 | 126 | def hexencode(value): 127 | """ 128 | Encodes string value from plain to hex format 129 | 130 | >>> hexencode('foobar') 131 | '666f6f626172' 132 | """ 133 | 134 | return utf8encode(value).encode("hex") 135 | 136 | def unicodeencode(value, encoding=None): 137 | """ 138 | Returns 8-bit string representation of the supplied unicode value 139 | 140 | >>> unicodeencode(u'foobar') 141 | 'foobar' 142 | """ 143 | 144 | retVal = value 145 | if isinstance(value, unicode): 146 | try: 147 | retVal = value.encode(encoding or UNICODE_ENCODING) 148 | except UnicodeEncodeError: 149 | retVal = value.encode(UNICODE_ENCODING, "replace") 150 | return retVal 151 | 152 | 153 | def unicode_encode(value, encoding=None): 154 | """ 155 | Return 8-bit string representation of the supplied unicode value: 156 | 157 | >>> unicode_encode(u'test') 158 | 'test' 159 | """ 160 | 161 | ret_val = value 162 | if isinstance(value, unicode): 163 | try: 164 | ret_val = value.encode(encoding or UNICODE_ENCODING) 165 | except UnicodeEncodeError: 166 | ret_val = value.encode(UNICODE_ENCODING, "replace") 167 | return ret_val 168 | 169 | 170 | def utf8encode(value): 171 | return unicode_encode(value, "utf-8") 172 | 173 | 174 | def utf8decode(value): 175 | """ 176 | Returns UTF-8 representation of the supplied 8-bit string representation 177 | 178 | >>> utf8decode('foobar') 179 | u'foobar' 180 | """ 181 | 182 | return value.decode("utf-8") 183 | 184 | 185 | def urldecode(value, encoding=None): 186 | """ 187 | URL decodes given value 188 | >>> urldecode('AND%201%3E%282%2B3%29%23', convall=True) 189 | u'AND 1>(2+3)#' 190 | """ 191 | result = None 192 | 193 | if value: 194 | try: 195 | # for cases like T%C3%BCrk%C3%A7e 196 | value = str(value) 197 | except ValueError: 198 | pass 199 | finally: 200 | result = urllib.unquote_plus(value) 201 | 202 | if isinstance(result, str): 203 | result = unicode(result, encoding or UNICODE_ENCODING, errors="replace") 204 | 205 | return result 206 | 207 | 208 | def urlencode(value, safe="%&=", convall=False, limit=False): 209 | """ 210 | URL encodes given value 211 | >>> urlencode('AND 1>(2+3)#') 212 | 'AND%201%3E%282%2B3%29%23' 213 | """ 214 | count = 0 215 | result = None 216 | 217 | if value is None: 218 | return result 219 | 220 | if convall or safe is None: 221 | safe = "" 222 | 223 | # corner case when character % really needs to be 224 | # encoded (when not representing url encoded char) 225 | if all(map(lambda x: '%' in x, [safe, value])): 226 | value = re.sub("%(?![0-9a-fA-F]{2})", "%25", value, re.DOTALL | re.IGNORECASE) 227 | 228 | while True: 229 | result = urllib.quote(utf8_encode(value), safe) 230 | 231 | if limit and len(result) > URLENCODE_CHAR_LIMIT: 232 | if count >= len(URLENCODE_FAILSAFE_CHARS): 233 | break 234 | 235 | while count < len(URLENCODE_FAILSAFE_CHARS): 236 | safe += 
URLENCODE_FAILSAFE_CHARS[count] 237 | count += 1 238 | if safe[-1] in value: 239 | break 240 | else: 241 | break 242 | 243 | return result 244 | 245 | def htmlunescape(value): 246 | """ 247 | Returns (basic conversion) HTML unescaped value 248 | 249 | >>> htmlunescape('a<b') 250 | 'a'), ('"', '"'), (' ', ' '), ('&', '&')) 256 | retVal = reduce(lambda x, y: x.replace(y[0], y[1]), codes, retVal) 257 | try: 258 | retVal = re.sub(r"&#x([^ ;]+);", lambda match: unichr(int(match.group(1), 16)), retVal) 259 | except ValueError: 260 | pass 261 | return retVal 262 | 263 | def singleTimeWarnMessage(message): # Cross-linked function 264 | sys.stdout.write(message) 265 | sys.stdout.write("\n") 266 | sys.stdout.flush() 267 | 268 | def stdoutencode(data): 269 | retVal = None 270 | 271 | try: 272 | data = data or "" 273 | 274 | # Reference: http://bugs.python.org/issue1602 275 | if IS_WIN: 276 | output = data.encode(sys.stdout.encoding, "replace") 277 | 278 | if '?' in output and '?' not in data: 279 | warnMsg = "cannot properly display Unicode characters " 280 | warnMsg += "inside Windows OS command prompt " 281 | warnMsg += "(http://bugs.python.org/issue1602). All " 282 | warnMsg += "unhandled occurances will result in " 283 | warnMsg += "replacement with '?' character. Please, find " 284 | warnMsg += "proper character representation inside " 285 | warnMsg += "corresponding output files. " 286 | singleTimeWarnMessage(warnMsg) 287 | 288 | retVal = output 289 | else: 290 | retVal = data.encode(sys.stdout.encoding) 291 | except: 292 | retVal = data.encode(UNICODE_ENCODING) if isinstance(data, unicode) else data 293 | 294 | return retVal 295 | 296 | def jsonize(data): 297 | """ 298 | Returns JSON serialized data 299 | 300 | >>> jsonize({'foo':'bar'}) 301 | '{\\n "foo": "bar"\\n}' 302 | """ 303 | 304 | return json.dumps(data, sort_keys=False, indent=4) 305 | 306 | def dejsonize(data): 307 | """ 308 | Returns JSON deserialized data 309 | 310 | >>> dejsonize('{\\n "foo": "bar"\\n}') 311 | {u'foo': u'bar'} 312 | """ 313 | 314 | return json.loads(data) 315 | 316 | 317 | def to_param_dict(params): 318 | """a=1&b=2 to {'a':1,'b':2}""" 319 | param_dict = {} 320 | if not params: 321 | return param_dict 322 | try: 323 | split_params = params.split('&') 324 | for element in split_params: 325 | elem = element.split("=") 326 | if len(elem) >= 2: 327 | parameter = elem[0].replace(" ", "") 328 | value = "=".join(elem[1:]) 329 | param_dict[parameter] = value 330 | except: 331 | pass 332 | 333 | return param_dict 334 | 335 | def to_param_str(param_dict): 336 | """{'a':1,'b':2} to a=1&b=2""" 337 | params = '&'.join([k + '=' + v for k, v in param_dict.items()]) 338 | return params 339 | 340 | if __name__ == '__main__': 341 | url = '<?xml version="1.0" encoding="UTF-8"?><collection><element ID="UserName" Type="String"><![CDATA[test]]></element><element ID="Password" Type="String"><![CDATA[1234]]></element><element ID="VerifyCode" Type="String"><![CDATA[3kv3]]></element><element ID="LoginImg" Type="String"><![CDATA[]]></element><element ID="Cancel" Type="String"><![CDATA[]]></element></collection>' 342 | print htmlunescape(url) -------------------------------------------------------------------------------- /common/logger.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | # 4 | # Author: fuck@0day5.com 5 | # 6 | import time 7 | import ctypes,sys 8 | import platform 9 | 10 | def get_current_isostr(): 11 | iso = 
time.strftime("%Y-%m-%dT%H:%M:%S", time.localtime()) 12 | return iso 13 | 14 | if platform.system()=='Linux' or platform.system()=='Darwin': 15 | class colors: 16 | BLACK = '\033[0;30m' 17 | DARK_GRAY = '\033[1;30m' 18 | LIGHT_GRAY = '\033[0;37m' 19 | BLUE = '\033[0;34m' 20 | LIGHT_BLUE = '\033[1;34m' 21 | GREEN = '\033[0;32m' 22 | LIGHT_GREEN = '\033[1;32m' 23 | CYAN = '\033[0;36m' 24 | LIGHT_CYAN = '\033[1;36m' 25 | RED = '\033[0;31m' 26 | LIGHT_RED = '\033[1;31m' 27 | PURPLE = '\033[0;35m' 28 | LIGHT_PURPLE = '\033[1;35m' 29 | BROWN = '\033[0;33m' 30 | YELLOW = '\033[1;33m' 31 | WHITE = '\033[1;37m' 32 | DEFAULT_COLOR = '\033[00m' 33 | RED_BOLD = '\033[01;31m' 34 | ENDC = '\033[0m' 35 | 36 | def print_error(mess): 37 | mess=mess.strip('\r\n') 38 | print colors.RED+get_current_isostr() + mess + colors.ENDC 39 | 40 | def print_warm(mess): 41 | mess=mess.strip('\r\n') 42 | print colors.LIGHT_PURPLE + get_current_isostr()+ mess+ colors.ENDC 43 | 44 | def print_debug(mess): 45 | mess=mess.strip('\r\n') 46 | print colors.GREEN + get_current_isostr()+ mess + colors.ENDC 47 | 48 | 49 | if platform.system()=='Windows': 50 | STD_INPUT_HANDLE = -10 51 | STD_OUTPUT_HANDLE = -11 52 | STD_ERROR_HANDLE = -12 53 | 54 | FOREGROUND_BLACK = 0x0 55 | FOREGROUND_BLUE = 0x01 # text color contains blue. 56 | FOREGROUND_GREEN = 0x02 # text color contains green. 57 | FOREGROUND_RED = 0x04 # text color contains red. 58 | 59 | FOREGROUND_INTENSITY = 0x08 # text color is intensified. 60 | BACKGROUND_BLUE = 0x10 # background color contains blue. 61 | BACKGROUND_GREEN = 0x20 # background color contains green. 62 | BACKGROUND_RED = 0x40 # background color contains red. 63 | BACKGROUND_INTENSITY = 0x80 # background color is intensified. 64 | 65 | 66 | std_out_handle = ctypes.windll.kernel32.GetStdHandle(STD_OUTPUT_HANDLE) 67 | 68 | def set_cmd_text_color(color, handle=std_out_handle): 69 | Bool = ctypes.windll.kernel32.SetConsoleTextAttribute(handle, color) 70 | return Bool 71 | 72 | def resetColor(): 73 | set_cmd_text_color(FOREGROUND_RED | FOREGROUND_GREEN | FOREGROUND_BLUE) 74 | 75 | def print_error(mess): 76 | set_cmd_text_color(FOREGROUND_RED | FOREGROUND_INTENSITY) 77 | sys.stdout.write("%s %s\n" % (get_current_isostr(), msg)) 78 | resetColor() 79 | 80 | def print_warm(mess): 81 | set_cmd_text_color(FOREGROUND_YELLOW | FOREGROUND_BLUE| FOREGROUND_INTENSITY) 82 | sys.stdout.write("%s %s\n" % (get_current_isostr(), msg)) 83 | resetColor() 84 | 85 | def print_debug(mess): 86 | set_cmd_text_color(FOREGROUND_GREEN | FOREGROUND_INTENSITY) 87 | sys.stdout.write("%s %s\n" % (get_current_isostr(), msg)) 88 | resetColor() 89 | 90 | 91 | if __name__ == '__main__': 92 | mess = "hello,world" 93 | print_error(mess) 94 | print_debug(mess) 95 | print_warm(mess) -------------------------------------------------------------------------------- /common/rule.json: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "patterns": [ 4 | { 5 | "part": "etc.*?issue|etc.*?hosts|etc.*?passwd|windows/win.ini|proc\\/version|WEB-INF\\/web.xml", 6 | "type": "regex" 7 | }, 8 | { 9 | "part":"\\/hosts|\\.bash_history|bashrc|config\\[root_dir]\\=|appserv_root=|path\\[docroot]\\=|GALLERY_BASEDIR\\=|_SERVER\\[DOCUMENT_ROOT]|_CONF\\[path]|mosConfig_absolute_path\\=", 10 | "type":"match" 11 | } 12 | ], 13 | "tag": "file_include", 14 | "level": "high" 15 | }, 16 | { 17 | "patterns": [ 18 | { 19 | "part": 
"\\<.+javascript:window\\[.{1}\\\\x|<.*=(&#\\d+?;?)+?>|\\<.*(data|src)=data:text\\/html.*>|\\b(alert\\(|confirm\\(|expression\\(|s*script\\(|prompt\\(|benchmark\\s*?\\(.*\\))", 20 | "type": "regex" 21 | } 22 | ], 23 | "tag": "XSS", 24 | "level": "medium" 25 | }, 26 | { 27 | "patterns": [ 28 | { 29 | "part": "nslookup|whoami|set|dir|net", 30 | "type": "match" 31 | } 32 | ], 33 | "tag": "command", 34 | "level": "high" 35 | }, 36 | { 37 | "patterns": [ 38 | { 39 | "part": "(sleep\\s*?\\(.*\\)|\\b(group_)?concat[\\s\\/\\*]*?\\([^\\)]+?\\)|\bcase[\\s\/\\*]*?when[\\s\/\\*]*?\\([^\\)]+?\\)|load_file\\s*?\\()|<[a-z]+?\\b[^>]*?\\bon([a-z]{4,})\\s*?=|^\\+\\/v(8|9)|\\b(and|or|OR)\\b\\s*?([\\(\\)'\"\\d]+?=[\\(\\)'\"\\d]+?|[\\(\\)'\"a-zA-Z]+?=[\\(\\)'\"a-zA-Z]+?|>|<|\\s+?[\\w]+?\\s+?\\bin\\b\\s*?\\(|\\blike\\b\\s+?[\"'])|\\/\\*.*\\*\\/|\\bEXEC\\b|UNION.+?SELECT\\s*(\\(.+\\)\\s*|@{1,2}.+?\\s*|\\s+?.+?|(`|'|\").*?(`|'|\")\\s*)|UPDATE\\s*(\\(.+\\)\\s*|@{1,2}.+?\\s*|\\s+?.+?|(`|'|\").*?(`|'|\")\\s*)SET|INSERT\\s+INTO.+?VALUES|(SELECT|DELETE)@{0,2}(\\(.+\\)|\\s+?.+?\\s+?|(`|'|\").*?(`|'|\"))FROM(\\(.+\\)|\\s+?.+?|(`|'|\").*?(`|'|\"))|(CREATE|ALTER|DROP|TRUNCATE)\\s+(TABLE|DATABASE)", 40 | "type": "regex" 41 | }, 42 | { 43 | "part": "(convert\\s*?\\(int,CHAR\\(|\\(CONVERT\\s*?\\(INT|\\b(and|or|OR|xor|XOR|AND|/**/)\\b", 44 | "type": "regex" 45 | }, 46 | { 47 | "part": "\\(SELECT.*FROM\\)|\\(union.*SELECT\\)|\\(select.*load_file\\)", 48 | "type": "regex" 49 | }, 50 | { 51 | "part": "SELECT.*concat\\(", 52 | "type": "regex" 53 | }, 54 | { 55 | "part": "pg_sleep|benchmark\\(|if\\(|shutdown", 56 | "type": "match" 57 | } 58 | ], 59 | "tag": "sqlinject", 60 | "level": "high" 61 | }, 62 | { 63 | "patterns": [ 64 | { 65 | "part": "_memberAccess", 66 | "type": "match" 67 | }, 68 | { 69 | "part": "debug.*expression", 70 | "type": "regex" 71 | }, 72 | { 73 | "part": "_memberAccess.*java.lang.Runtime", 74 | "type": "regex" 75 | } 76 | ], 77 | "tag": "struts2", 78 | "level": "high" 79 | }, 80 | { 81 | "patterns": [ 82 | { 83 | "part": "assert\\(|eval\\(|phpinfo\\(|echo\\(|print\\(|var_dump\\(|print_r\\(|Execute\\(|Response.Write\\(|z0=GB2312|z0=UTF-8|\\${new java.lang", 84 | "type": "regex" 85 | } 86 | ], 87 | "tag": "code_excute", 88 | "level": "high" 89 | }, 90 | { 91 | "patterns": [ 92 | { 93 | "part": "shell|allow_url_include|auto_prepend_file|php://input", 94 | "type": "match" 95 | } 96 | ], 97 | "tag": "code_rce", 98 | "level": "high" 99 | }, 100 | { 101 | "patterns": [ 102 | { 103 | "part": "java.lang.String.*println", 104 | "type": "regex" 105 | }, 106 | { 107 | "part": "java.lang.ProcessBuilder", 108 | "type": "match" 109 | } 110 | ], 111 | "tag": "java expression", 112 | "level": "high" 113 | }, 114 | { 115 | "patterns": [ 116 | { 117 | "part": ".git/config", 118 | "type": "match" 119 | } 120 | ], 121 | "tag": "git", 122 | "level": "high" 123 | }, 124 | { 125 | "patterns": [ 126 | { 127 | "part": ".DS_Store", 128 | "type": "match" 129 | } 130 | ], 131 | "tag": "DS_Store", 132 | "level": "high" 133 | }, 134 | { 135 | "patterns": [ 136 | { 137 | "part": ".svn\\\/(all-wcprops|all-wcpropss|entries|trunk)", 138 | "type": "match" 139 | } 140 | ], 141 | "tag": "svn", 142 | "level": "high" 143 | }, 144 | { 145 | "patterns": [ 146 | { 147 | "part": "vul_webscan", 148 | "type": "match" 149 | } 150 | ], 151 | "tag": "360Webscan", 152 | "level": "low" 153 | }, 154 | { 155 | "patterns": [ 156 | { 157 | "part": "dbappsecurity|dbappsec|dbapp|\"%d5\\'|%21(()%26%26%21%7c*%7c*%7c|(()))", 158 | "type": "match" 159 | } 160 | ], 161 
| "tag": "anhengWebscan", 162 | "level": "low" 163 | }, 164 | { 165 | "patterns": [ 166 | { 167 | "part": "vulnweb.com|acunetix|bxss.me|injected_by_wvs|wvstest", 168 | "type": "match" 169 | } 170 | ], 171 | "tag": "AWVS", 172 | "level": "low" 173 | }, 174 | { 175 | "patterns": [ 176 | { 177 | "part": "crossdomain.xml", 178 | "type": "match" 179 | } 180 | ], 181 | "tag": "crossdomain.xml", 182 | "level": "low" 183 | }, 184 | { 185 | "patterns": [ 186 | { 187 | "part": "CVS/ROOT", 188 | "type": "match" 189 | } 190 | ], 191 | "tag": "cvs_root", 192 | "level": "low" 193 | }, 194 | { 195 | "patterns": [ 196 | { 197 | "part": "\\/axis\\/services", 198 | "type": "match" 199 | } 200 | ], 201 | "tag": "Aixs", 202 | "level": "low" 203 | }, 204 | { 205 | "patterns": [ 206 | { 207 | "part": "\\/server-status", 208 | "type": "match" 209 | } 210 | ], 211 | "tag": "Apache status", 212 | "level": "low" 213 | }, 214 | { 215 | "patterns": [ 216 | { 217 | "part": "~.aspx", 218 | "type": "match" 219 | } 220 | ], 221 | "tag": "IIS short", 222 | "level": "low" 223 | }, 224 | { 225 | "patterns": [ 226 | { 227 | "part": "services\\/listServices", 228 | "type": "match" 229 | } 230 | ], 231 | "tag": "services scan", 232 | "level": "low" 233 | } 234 | ] 235 | -------------------------------------------------------------------------------- /common/threadpool.py: -------------------------------------------------------------------------------- 1 | # coding:utf-8 2 | # 模拟一个进城池 线程池,可以向里面添加任务, 3 | 4 | import threading 5 | import time 6 | import traceback 7 | import logger 8 | import Queue 9 | import random 10 | 11 | class new_threadpool: 12 | 13 | def __init__(self,threadnum,func_scan,Isjoin = False): 14 | self.thread_count = self.thread_nums = threadnum 15 | self.scan_count_lock = threading.Lock() 16 | self.thread_count_lock = threading.Lock() 17 | self.load_lock = threading.Lock() 18 | self.scan_count = 0 19 | self.isContinue = True 20 | self.func_scan = func_scan 21 | self.queue = Queue.Queue() 22 | self.isjoin = Isjoin 23 | 24 | def push(self,payload): 25 | self.queue.put(payload) 26 | 27 | def changeScanCount(self,num): 28 | self.scan_count_lock.acquire() 29 | self.scan_count += num 30 | self.scan_count_lock.release() 31 | 32 | def changeThreadCount(self,num): 33 | self.thread_count_lock.acquire() 34 | self.thread_count += num 35 | self.thread_count_lock.release() 36 | 37 | def run(self): 38 | th = [] 39 | for i in range(self.thread_nums): 40 | t = threading.Thread(target=self.scan) 41 | t.setDaemon(True) 42 | t.start() 43 | th.append(t) 44 | 45 | # It can quit with Ctrl-C 46 | if self.isjoin: 47 | for tt in th: 48 | tt.join() 49 | else: 50 | while 1: 51 | if self.thread_count > 0 and self.isContinue: 52 | time.sleep(0.01) 53 | else: 54 | break 55 | 56 | def stop(self): 57 | self.load_lock.acquire() 58 | self.isContinue = False 59 | self.load_lock.release() 60 | 61 | def scan(self): 62 | while 1: 63 | self.load_lock.acquire() 64 | if self.queue.qsize() > 0 and self.isContinue: 65 | payload = self.queue.get() 66 | 67 | self.load_lock.release() 68 | else: 69 | self.load_lock.release() 70 | break 71 | try: 72 | # 在执行时报错如果不被处理,线程会停止并退出 73 | self.func_scan(payload) 74 | time.sleep(0.3) 75 | except KeyboardInterrupt: 76 | self.isContinue = False 77 | raise KeyboardInterrupt 78 | except Exception: 79 | errmsg = traceback.format_exc() 80 | self.isContinue = False 81 | print_error(errmsg) 82 | 83 | self.changeThreadCount(-1) 84 | 85 | 86 | if __name__ == '__main__': 87 | def calucator(args): 88 | num,numt = args 89 | print 
numt 90 | i = random.randint(1, 100) 91 | u = num 92 | a = i * u 93 | if (a % 6 == 0): 94 | for x in range(5): 95 | print "new thread",x 96 | #p.push(x) 97 | 98 | p = new_threadpool(3, calucator) 99 | for i in range(20): 100 | args=(i,i+1,) 101 | p.push(args) 102 | p.run() 103 | 104 | -------------------------------------------------------------------------------- /common/units.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | #-*- coding:utf-8 -*- 3 | import redis 4 | import os,re,json 5 | from GeoIPUtils import GeoIPUtil,GeoIP2Util 6 | from convert import urldecode,htmlunescape 7 | try: 8 | from user_agents import parse as ua_parse 9 | except: 10 | print "try to pip install pyyaml ua-parser user-agents" 11 | 12 | import sys 13 | default_encoding = 'utf-8' 14 | if sys.getdefaultencoding() != default_encoding: 15 | reload(sys) 16 | sys.setdefaultencoding(default_encoding) 17 | 18 | EXCLUDE_EXTENSIONS = ("ico","3ds", "3g2", "3gp", "7z", "DS_Store", "a", "aac", "adp", "ai", "aif", "aiff", "apk", "ar", "asf", "au", "avi", "bak", "bin", "bk", "bmp", "btif", "bz2", "cab", "caf", "cgm", "cmx", "cpio", "cr2", "dat", "deb", "djvu", "dll", "dmg", "dmp", "dng", "doc", "docx", "dot", "dotx", "dra", "dsk", "dts", "dtshd", "dvb", "dwg", "dxf", "ear", "ecelp4800", "ecelp7470", "ecelp9600", "egg", "eol", "eot", "epub", "exe", "f4v", "fbs", "fh", "fla", "flac", "fli", "flv", "fpx", "fst", "fvt", "g3", "gif", "gz", "h261", "h263", "h264", "ico", "ief", "image", "img", "ipa", "iso", "jar", "jpeg", "jpg", "jpgv", "jpm", "jxr", "ktx", "lvp", "lz", "lzma", "lzo", "m3u", "m4a", "m4v", "mar", "mdi", "mid", "mj2", "mka", "mkv", "mmr", "mng", "mov", "movie", "mp3", "mp4", "mp4a", "mpeg", "mpg", "mpga", "mxu", "nef", "npx", "o", "oga", "ogg", "ogv", "otf", "pbm", "pcx", "pdf", "pea", "pgm", "pic", "png", "pnm", "ppm", "pps", "ppt", "pptx", "ps", "psd", "pya", "pyc", "pyo", "pyv", "qt", "rar", "ras", "raw", "rgb", "rip", "rlc", "rz", "s3m", "s7z", "scm", "scpt", "sgi", "shar", "sil", "smv", "so", "sub", "swf", "tar", "tbz2", "tga", "tgz", "tif", "tiff", "tlz", "ts", "ttf", "uvh", "uvi", "uvm", "uvp", "uvs", "uvu", "viv", "vob", "war", "wav", "wax", "wbmp", "wdp", "weba", "webm", "webp", "whl", "wm", "wma", "wmv", "wmx", "woff", "woff2", "wvx", "xbm", "xif", "xls", "xlsx", "xlt", "xm", "xpi", "xpm", "xwd", "xz", "z", "zip", "zipx") 19 | 20 | def push_msg(msg): 21 | ''' 22 | redis连接池 23 | ''' 24 | redis_host = '192.168.87.222' 25 | redis_port = 6379 26 | redis_pass = 'cft67ygv' 27 | redis_db = 0 28 | redis_key = 'logstash:redis' 29 | try: 30 | #r = redis.StrictRedis(host=redis_host, port=redis_port, db=redis_db, password=redis_pass) 31 | pool = redis.ConnectionPool(host=redis_host, port=redis_port, db=redis_db, password=redis_pass) 32 | r = redis.StrictRedis(connection_pool=pool) 33 | r.rpush(redis_key, msg) 34 | except Exception as e: 35 | print str(e),msg 36 | 37 | def is_intranet(ip): 38 | """ 39 | 匹配内网ip地址 40 | """ 41 | ret = ip.split('.') 42 | if not len(ret) == 4: 43 | return True 44 | if ret[0] == '10': 45 | return True 46 | if ret[0] == '127' and ret[1] == '0': 47 | return True 48 | if ret[0] == '172' and 16 <= int(ret[1]) <= 32: 49 | return True 50 | if ret[0] == '192' and ret[1] == '168': 51 | return True 52 | return False 53 | 54 | def parser_ua(ua_string): 55 | ''' 56 | 解析user-agent 57 | ''' 58 | info = {} 59 | info['spider'] = False 60 | try: 61 | msg = ua_parse(ua_string) 62 | 63 | if msg.is_pc: 64 | info['dev'] = 'PC' 65 | elif 
msg.is_tablet: 66 | info['dev'] = 'Pad' 67 | elif msg.is_mobile: 68 | info['dev'] = 'MObile' 69 | else: 70 | info['dev'] = 'Unknow' 71 | 72 | if msg.is_bot: 73 | info['spider'] = True 74 | info["type"] = msg.os.family+' '+str(msg.os.version_string) 75 | 76 | info["ua"] = msg.browser.family+' '+str(msg.browser.version_string) 77 | return info 78 | except Exception as e: 79 | return info 80 | 81 | info = {} 82 | addr = "" 83 | g1 = GeoIPUtil() 84 | g2 = GeoIP2Util() 85 | if is_intranet(ipaddr): 86 | info["address"] = "局域网内地址" 87 | info['weidu'] = info['jingdu'] = info['country'] = info['subdivision'] =info['city'] ="" 88 | else: 89 | try: 90 | longitude,latitude = g1.get_lat_alt(ipaddr) 91 | info['weidu'] = latitude 92 | info['jingdu'] = longitude 93 | country, subdivision, city = g2.get_ip_location(ipaddr) 94 | if country == u"中国": 95 | if not subdivision and not city: 96 | addr = country 97 | elif subdivision.find(u'市')==-1 and subdivision not in [u'上海'] and city: 98 | addr = subdivision.strip(u'省')+u"省\t"+city 99 | elif subdivision and not city: 100 | addr = country +' '+ subdivision 101 | else: 102 | addr = subdivision+"\t"+city 103 | else: 104 | if subdivision in [u'台北市',u'新北市',u'基隆市',u'新竹市',u'嘉义市',u'台中市',u'台南市',u'高雄市',u'屏东市']: 105 | subdivision = "台湾省"+' '+subdivision 106 | if country in [u'香港',u'澳门'] and subdivision: 107 | subdivision = country +' '+ subdivision 108 | country = '中国' 109 | elif country in [u'香港',u'澳门'] and not subdivision: 110 | subdivision = country 111 | country = '中国' 112 | 113 | if not subdivision and not city: 114 | addr = country 115 | elif subdivision and not city: 116 | addr = country +' '+ subdivision 117 | else: 118 | addr = country.replace(u'台湾',u'中国') +"\t"+subdivision+"\t"+city 119 | if addr: 120 | info["address"] = addr 121 | 122 | info["country"] = country.replace(u'台湾',u'中国') 123 | info["subdivision"]= subdivision 124 | info["city"] = city 125 | except Exception as e: 126 | print str(e) 127 | return info 128 | 129 | def check_rule(file_data): 130 | ''' 131 | 匹配规则 132 | ''' 133 | rule = '' 134 | results = [] 135 | try: 136 | file_data = urldecode(file_data) 137 | except: 138 | try: 139 | file_data = htmlunescape(file_data) 140 | except: 141 | file_data = file_data 142 | 143 | try: 144 | default_conf_path = os.path.abspath(os.path.dirname(__file__)) + "/" 145 | flist = default_conf_path+"rule.json" 146 | patterns_list = json.load(file(flist)) 147 | for patterns in patterns_list: 148 | sensitive = True 149 | for pattern in patterns['patterns']: 150 | if pattern['type'] == 'match': 151 | re_pattern = re.compile(pattern['part'], re.IGNORECASE | re.DOTALL | re.MULTILINE) 152 | re_result = re.findall(pattern['part'], file_data) 153 | if not re_result: 154 | sensitive = False 155 | break 156 | elif pattern['type'] == 'regex': 157 | re_pattern = re.compile(str(pattern['part']), re.IGNORECASE | re.DOTALL | re.MULTILINE) 158 | if re_pattern.search(file_data) == None: 159 | sensitive = False 160 | break 161 | 162 | if sensitive: 163 | results.append({ 164 | 'tag': patterns['tag'], 165 | 'level': patterns['level'] 166 | }) 167 | except Exception as e: 168 | pass 169 | return results 170 | 171 | if __name__ == '__main__': 172 | print json.dumps(parser_ip('111.122.172.163')) 173 | print json.dumps(parser_ip('220.181.171.119')) 174 | print json.dumps(parser_ip('192.168.199.233')) 175 | 176 | url = "/drupal/?destination=node/8153%23comment-form&q=../../../../../../../../../../WEB-INF/web.xml" 177 | print check_rule(url) 178 | ua = "Mozilla/5.0 (Windows NT 6.1; WOW64) 
AppleWebKit/537.21 (KHTML, like Gecko) Chrome/41.0.2228.0 Safari/537.21" 179 | print parser_ua(ua) 180 | 181 | -------------------------------------------------------------------------------- /main.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | #-*- coding:utf-8 -*- 3 | 4 | from module.apache_parser import ApahceParser 5 | from module.iis_parser import IISLineParser 6 | from module.nginx_parser import NgLineParser 7 | from module.tomcat_parser import TomcatParser 8 | from common.threadpool import new_threadpool 9 | from optparse import OptionParser 10 | from common.units import * 11 | 12 | def check_file(path): 13 | ''' 14 | 检测文件是目录还是文件 15 | ''' 16 | import os 17 | if os.path.isdir(path): 18 | return "directory" 19 | elif os.path.isfile(path): 20 | return "file" 21 | else: 22 | return False 23 | 24 | def para(line,xtype): 25 | ''' 26 | 对日志类型进程分类处理 27 | ''' 28 | if xtype == "apache": 29 | ng_line_parser = ApahceParser() 30 | elif xtype == "iis": 31 | ng_line_parser = IISLineParser() 32 | elif xtype == "nginx": 33 | ng_line_parser = NgLineParser() 34 | elif xtype == "tomcat": 35 | ng_line_parser = TomcatParser() 36 | else: 37 | return "not found,waitting...." 38 | if line.startswith("#"): 39 | pass 40 | else: 41 | try: 42 | ng_line_parser.parse(line) 43 | if ng_line_parser.real_ip: 44 | ippara = parser_ip(ng_line_parser.real_ip) 45 | else: 46 | ippara = parser_ip(ng_line_parser.cdn_ip) 47 | 48 | if ng_line_parser.browser: 49 | parses = parser_ua(ng_line_parser.browser) 50 | mydict = { 51 | "status":ng_line_parser.response_status, 52 | "cdn_ip":ng_line_parser.real_ip, 53 | "real_ip":ng_line_parser.cdn_ip, 54 | "access_time":ng_line_parser.access_time, 55 | "method":ng_line_parser.method, 56 | "url":ng_line_parser.request_url, 57 | "urldecode":urldecode(ng_line_parser.request_url), 58 | "referer":ng_line_parser.reference_url, 59 | "body_bytes":ng_line_parser.bbytes, 60 | "user_agent":ng_line_parser.browser, 61 | "ua_value":parses['ua'], 62 | "dev_type":parses['dev'], 63 | "dev_value":parses['type'], 64 | "spider":parses['spider'], 65 | "addr":ippara['address'], 66 | "city":ippara['city'], 67 | "jingdu":ippara['jingdu'], 68 | "weidu":ippara['weidu'], 69 | "country":ippara['country'], 70 | "subdivision":ippara['subdivision'], 71 | } 72 | else: 73 | mydict = { 74 | "status":ng_line_parser.response_status, 75 | "cdn_ip":ng_line_parser.cdn_ip, 76 | "real_ip":ng_line_parser.real_ip, 77 | "access_time":ng_line_parser.access_time, 78 | "method":ng_line_parser.method, 79 | "url":ng_line_parser.request_url, 80 | "urldecode":urldecode(ng_line_parser.request_url), 81 | "referer":ng_line_parser.reference_url, 82 | "body_bytes":ng_line_parser.bbytes, 83 | "user_agent":ng_line_parser.browser, 84 | "addr":ippara['address'], 85 | "city":ippara['city'], 86 | "jingdu":ippara['jingdu'], 87 | "weidu":ippara['weidu'], 88 | "country":ippara['country'], 89 | "subdivision":ippara['subdivision'], 90 | } 91 | resu = check_rule(ng_line_parser.request_url) 92 | if resu: 93 | if len(resu)==1: 94 | mydict["rule"]= resu[0] 95 | else: 96 | mydict["rule"]= resu 97 | else: 98 | mydict["rule"]= False 99 | ''' 100 | try: 101 | push_msg(json.dumps(mydict)) 102 | except Exception as e: 103 | try: 104 | del mydict["url"] 105 | del mydict["urldecode"] 106 | mydict["url"] = str(ng_line_parser.request_url).decode('gbk', 'ignore').encode('utf-8', 'ignore') 107 | mydict["urldecode"] = str(ng_line_parser.request_url).decode('gbk', 'ignore').encode('utf-8', 'ignore') 108 | 
push_msg(json.dumps(mydict)) 109 | except Exception as why: 110 | print why 111 | print mydict 112 | pass 113 | ''' 114 | return json.dumps(mydict) 115 | except Exception as why: 116 | print why 117 | pass 118 | 119 | def push_para(xtype,logfile): 120 | ''' 121 | 对每一个文件进行处理 122 | ''' 123 | with open(logfile, 'r') as f: 124 | #p = new_threadpool(3, calucator) 125 | for index, line in enumerate(f): 126 | print para(line,xtype) 127 | 128 | def main(xtype,log_name): 129 | ''' 130 | 检测文件的类型 131 | ''' 132 | xcheck = check_file(log_name) 133 | if not xcheck: 134 | print "file not found" 135 | return 136 | else: 137 | if xcheck == "directory": 138 | files = get_file(log_name) 139 | for xfile in files: 140 | push_para(xtype,log_name+"/"+xfile) 141 | else: 142 | push_para(xtype,log_name) 143 | 144 | def get_file(mypath): 145 | ''' 146 | 获取文件夹下面的全部文件 147 | ''' 148 | from os import listdir 149 | from os.path import isfile, join 150 | onlyfiles = [ f for f in listdir(mypath) if isfile(join(mypath,f)) ] 151 | return onlyfiles 152 | 153 | def init_parser(): 154 | usage = "Usage: %prog --type IIS|Apache|Tomcat|Nginx --file file|directory" 155 | parser = OptionParser(usage=usage, description="log parser ") 156 | parser.add_option("--type", type="str", dest="type", help="chose which log type") 157 | parser.add_option("--file", type="str", dest="file", default=None,help="chose file or directory") 158 | return parser 159 | 160 | if __name__ == '__main__': 161 | #main('./log/access.log.9') 162 | parser = init_parser() 163 | option, _ = parser.parse_args() 164 | logtype = str(option.type).lower() 165 | logfile = option.file 166 | if not logfile or not logtype: 167 | parser.print_help() 168 | 169 | main(logtype,logfile) 170 | -------------------------------------------------------------------------------- /module/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/0xa-saline/Logpara/b0be8febb7a9287157a119d5eb7b1dc8bc988a29/module/__init__.py -------------------------------------------------------------------------------- /module/apache_parser.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | #-*- coding:utf-8 -*- 3 | 4 | import datetime 5 | import re,urllib 6 | 7 | request_re= re.compile(r'(?P(GET|POST|HEAD|DELETE|PUT|OPTIONS)?)\s+(?P.*?)\s+(?P.*)$') 8 | log_line_re = re.compile(r'(?P((\d{1,3}\.){3}\d{1,3})+) - - (\[(?P\S+)\s+\S+\])\s+\"(?P(.*?))\"\s+(?P([1-9]\d*))\s+(?P(.*?))\s+\"(?P.*?)\"\s+\"(?P.*?)\"') 9 | logline_re = re.compile(r'(?P((\d{1,3}\.){3}\d{1,3})+) - - (\[(?P\S+)\s+\S+\])\s+\"(?P(.*?))\"\s+(?P([1-9]\d*))\s+(?P(.*?))') 10 | 11 | """ 12 | 192.168.0.23 - - [19/Aug/2017:05:33:54 +0200] "GET /drupal/templates/blue/js/default.js HTTP/1.1" 404 402 "http://192.168.0.102/drupal/templates/blue/js/default.js" "Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2785.143 Safari/537.36" 13 | ip time method url status body referer ua 14 | """ 15 | class ApahceParser(object): 16 | """将 Apache 日志解析成多个字段""" 17 | 18 | def __init__(self): 19 | self._cdn_ip = '' # CDN请求IP 20 | self._access_time = '' # 请求时间 21 | self._request_url = '' # 请求的URL 22 | self._reference_url = '' # 外链URL 23 | self._response_status = '' # NG 响应状态码 24 | self._browser = '' # 用户使用的浏览器 25 | self._real_ip = '' # 用户真实IP 26 | self._mthod = '' # 请求方式 27 | self._bbytes = '' # 内容大小 28 | 29 | def parse(self, line): 30 | """通过传入的一行数据进行解析 31 | """ 32 | processed = 
log_line_re.search(line) 33 | if processed: 34 | #ip_time_tmp = line_item[2].strip().split() 35 | self.cdn_ip = processed.group('remote_host') # 服务器IP 36 | self.access_time = processed.group('date_time')# 请求发起的时间 37 | request = processed.group('request') 38 | request_ur = request_re.search(request) 39 | if request_ur: 40 | self.method = request_ur.group('request_method') # 请求方式 41 | self.request_url = request_ur.group('request_uri') 42 | self.response_status = processed.group('status') # NG 响应状态码 43 | self.bbytes = processed.group('body_bytes_sent') # 浏览所用时间 44 | if self.bbytes == "-": 45 | self.bbytes = "" 46 | self.browser = processed.group('http_user_agent') 47 | self.reference_url = processed.group('http_referer') # 外链URL 48 | ''' 49 | self.port = line_item[6] 50 | self.reference_url = "" # 外链URL 51 | self.real_ip = line_item[8].strip() 52 | 53 | # 用户使用的浏览器 54 | 55 | ''' 56 | else: 57 | processed = logline_re.search(line) 58 | if processed: 59 | self.cdn_ip = processed.group('remote_host') # 服务器IP 60 | self.access_time = processed.group('date_time')# 请求发起的时间 61 | request = processed.group('request') 62 | request_ur = request_re.search(request) 63 | if request_ur: 64 | self.method = request_ur.group('request_method') # 请求方式 65 | self.request_url = request_ur.group('request_uri') 66 | self.response_status = processed.group('status') # NG 响应状态码 67 | self.bbytes = processed.group('body_bytes_sent') # 浏览所用时间 68 | if self.bbytes == "-": 69 | self.bbytes = "" 70 | 71 | def to_dict(self): 72 | """将属性(@property)的转化为dict输出 73 | """ 74 | propertys = {} 75 | propertys['real_ip'] = self.real_ip 76 | propertys['cdn_ip'] = self.cdn_ip 77 | propertys['method'] = self.method 78 | propertys['access_time'] = self.access_time 79 | propertys['request_url'] = self.request_url 80 | propertys['reference_url'] = self.reference_url 81 | propertys['response_status'] = self.response_status 82 | propertys['browser'] = self.browser 83 | propertys['bbytes'] = self.bbytes 84 | return propertys 85 | 86 | @property 87 | def real_ip(self): 88 | return self._real_ip 89 | 90 | @real_ip.setter 91 | def real_ip(self, real_ip): 92 | self._real_ip = real_ip.split(', ')[0] 93 | 94 | @property 95 | def browser(self): 96 | return self._browser 97 | 98 | @browser.setter 99 | def browser(self, browser): 100 | self._browser = browser.replace('+',' ') 101 | 102 | @property 103 | def response_status(self): 104 | return self._response_status 105 | 106 | @response_status.setter 107 | def response_status(self, response_status): 108 | self._response_status = response_status 109 | 110 | @property 111 | def reference_url(self): 112 | return self._reference_url 113 | 114 | @reference_url.setter 115 | def reference_url(self, reference_url): 116 | """解析外链URL 117 | 只需要解析后的域名, 如: 118 | 传入: http://www.ttmark.com/diannao/2014/11/04/470.html 119 | 解析成: www.ttmark.com 120 | """ 121 | proto, rest = urllib.splittype(reference_url) 122 | res, rest = urllib.splithost(rest) 123 | if not res: 124 | self._reference_url = '-' 125 | else: 126 | self._reference_url = res 127 | 128 | @property 129 | def request_url(self): 130 | return self._request_url 131 | 132 | @request_url.setter 133 | def request_url(self, request_url): 134 | """ 135 | 解析请求的URL 136 | 只需要解析后的URL路径不需要参数, 如: 137 | 传入: /wp-admin/admin-ajax.php?postviews_id=1348 138 | 解析成: /wp-admin/admin-ajax.php 139 | 140 | proto, rest = urllib.splittype(request_url) 141 | url_path, url_param = urllib.splitquery(rest) 142 | 143 | if url_path.startswith('/tag/'): 144 | url_path = '/tag/' 145 | """ 146 | 
self._request_url = request_url 147 | 148 | @property 149 | def access_time(self): 150 | return str(self._access_time) 151 | 152 | @access_time.setter 153 | def access_time(self, access_time): 154 | # Apache log 解析日志格式 155 | #input_datetime_format = '%d/%b/%Y:%H:%M:%S' 156 | input_datetime_format = '%d/%b/%Y:%H:%M:%S' 157 | self._access_time = datetime.datetime.strptime( access_time,input_datetime_format) 158 | 159 | @property 160 | def cdn_ip(self): 161 | return self._cdn_ip 162 | 163 | @cdn_ip.setter 164 | def cdn_ip(self, cdn_ip): 165 | self._cdn_ip = cdn_ip 166 | 167 | if __name__ == '__main__': 168 | pass 169 | -------------------------------------------------------------------------------- /module/iis_parser.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | #-*- coding:utf-8 -*- 3 | 4 | import datetime 5 | import urllib 6 | ''' 7 | date time 日期/时间 8 | s-ip 服务器IP 9 | cs-method 方法 10 | cs-uri-stem 请求访问的页面 11 | cs-uri-query 访问的查询字符串 12 | s-port 服务器端口 13 | cs-username 14 | c-ip 客户端IP 15 | cs(User-Agent) 用户代理 16 | sc-status 协议返回状态 17 | sc-substatus HTTP子协议的状态 18 | sc-win32-status Win32® 状态 19 | time-taken 所用时间 20 | ''' 21 | class IISLineParser(object): 22 | """将 Nginx 日志解析成多个字段""" 23 | 24 | def __init__(self): 25 | self._cdn_ip = '' # CDN请求IP 26 | self._access_time = '' # 请求时间 27 | self._request_url = '' # 请求的URL 28 | self._reference_url = '' # 外链URL 29 | self._response_status = '' # NG 响应状态码 30 | self._browser = '' # 用户使用的浏览器 31 | self._real_ip = '' # 用户真实IP 32 | self._mthod = '' # 请求方式 33 | self._bbytes = '' # 内容大小 34 | 35 | def parse(self, line): 36 | """通过传入的一行数据进行解析 37 | """ 38 | line_item = line.strip().split(' ') 39 | 40 | #ip_time_tmp = line_item[2].strip().split() 41 | self.cdn_ip = line_item[2] # 服务器IP 42 | self.access_time = str(line_item[0]+" "+ line_item[1])# 请求发起的时间 43 | self.method = line_item[3].strip() # 请求方式 44 | if line_item[5].strip() == "-": 45 | self.request_url = line_item[4].strip() # 请求的URL 46 | else: 47 | self.request_url = line_item[4].strip()+"?"+ line_item[5].strip() 48 | self.port = line_item[6] 49 | self.reference_url = "" # 外链URL 50 | self.real_ip = line_item[8].strip() 51 | self.response_status = line_item[10].strip() # NG 响应状态码 52 | self.browser = line_item[9].strip() # 用户使用的浏览器 53 | self.bbytes = line_item[13].strip() # 浏览所用时间 54 | 55 | def to_dict(self): 56 | """将属性(@property)的转化为dict输出 57 | """ 58 | propertys = {} 59 | 60 | propertys['real_ip'] = self.real_ip 61 | propertys['ser_ip'] = self.cdn_ip 62 | propertys['method'] = self.method 63 | propertys['access_time'] = self.access_time 64 | propertys['request_url'] = self.request_url 65 | propertys['reference_url'] = self.reference_url 66 | propertys['response_status'] = self.response_status 67 | propertys['browser'] = self.browser 68 | propertys['bbytes'] = self.bbytes 69 | return propertys 70 | 71 | @property 72 | def real_ip(self): 73 | return self._real_ip 74 | 75 | @real_ip.setter 76 | def real_ip(self, real_ip): 77 | self._real_ip = real_ip.split(', ')[0] 78 | 79 | @property 80 | def browser(self): 81 | return self._browser 82 | 83 | @browser.setter 84 | def browser(self, browser): 85 | self._browser = browser.replace('+',' ') 86 | 87 | @property 88 | def response_status(self): 89 | return self._response_status 90 | 91 | @response_status.setter 92 | def response_status(self, response_status): 93 | self._response_status = response_status 94 | 95 | @property 96 | def reference_url(self): 97 | return self._reference_url 98 | 99 | 
@reference_url.setter 100 | def reference_url(self, reference_url): 101 | """解析外链URL 102 | 只需要解析后的域名, 如: 103 | 传入: http://www.ttmark.com/diannao/2014/11/04/470.html 104 | 解析成: www.ttmark.com 105 | """ 106 | proto, rest = urllib.splittype(reference_url) 107 | res, rest = urllib.splithost(rest) 108 | if not res: 109 | self._reference_url = '-' 110 | else: 111 | self._reference_url = res 112 | 113 | @property 114 | def request_url(self): 115 | return self._request_url 116 | 117 | @request_url.setter 118 | def request_url(self, request_url): 119 | """解析请求的URL 120 | 只需要解析后的URL路径不需要参数, 如: 121 | 传入: /wp-admin/admin-ajax.php?postviews_id=1348 122 | 解析成: /wp-admin/admin-ajax.php 123 | 124 | proto, rest = urllib.splittype(request_url) 125 | url_path, url_param = urllib.splitquery(rest) 126 | 127 | if url_path.startswith('/tag/'): 128 | url_path = '/tag/' 129 | """ 130 | self._request_url = request_url 131 | 132 | @property 133 | def access_time(self): 134 | return str(self._access_time) 135 | 136 | @access_time.setter 137 | def access_time(self, access_time): 138 | # IIS log 解析日志格式 139 | #input_datetime_format = '%d/%b/%Y:%H:%M:%S' 140 | input_datetime_format = '%Y-%m-%d %H:%M:%S' 141 | self._access_time = datetime.datetime.strptime( access_time,input_datetime_format) 142 | 143 | @property 144 | def cdn_ip(self): 145 | return self._cdn_ip 146 | 147 | @cdn_ip.setter 148 | def cdn_ip(self, cdn_ip): 149 | self._cdn_ip = cdn_ip 150 | 151 | if __name__ == '__main__': 152 | pass 153 | -------------------------------------------------------------------------------- /module/nginx_parser.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | #-*- coding:utf-8 -*- 3 | import datetime 4 | import urllib 5 | 6 | class NgLineParser(object): 7 | """将 Nginx 日志解析成多个字段""" 8 | 9 | def __init__(self): 10 | self._cdn_ip = '' # CDN请求IP 11 | self._access_time = '' # 请求时间 12 | self._request_url = '' # 请求的URL 13 | self._reference_url = '' # 外链URL 14 | self._response_status = '' # NG 响应状态码 15 | self._browser = '' # 用户使用的浏览器 16 | self._real_ip = '' # 用户真实IP 17 | self._mthod = '' # 请求方式 18 | self._bbytes = '' # 内容大小 19 | 20 | def parse(self, line): 21 | """通过传入的一行数据进行解析 22 | """ 23 | line_item = line.strip().split('"') 24 | if len(line_item) > 9: # 由于日志有改变需要删除一些元素 25 | del line_item[1] 26 | del line_item[1] 27 | 28 | # 获取临时的 CDN IP 和 访问文件 29 | ip_time_tmp = line_item[0].strip().split() 30 | if len(line_item)>7: 31 | self.real_ip = line_item[7] # 用户真实IP 32 | else: 33 | self.real_ip = '' 34 | self.cdn_ip = ip_time_tmp[0] # CDN请求IP 35 | self.access_time = ip_time_tmp[3].lstrip('[') # 请求时间 36 | self.method = line_item[1].strip().split()[0] # 请求方式 37 | self.request_url = line_item[1].strip().split()[1] # 请求的URL 38 | self.reference_url = line_item[3].strip() # 外链URL 39 | self.response_status = line_item[2].strip().split()[0] # NG 响应状态码 40 | self.bbytes = line_item[2].strip().split()[1] # NG 响应状态码 41 | self.browser = line_item[5].strip() # 用户使用的浏览器 42 | 43 | def to_dict(self): 44 | """将属性(@property)的转化为dict输出 45 | """ 46 | propertys = {} 47 | 48 | propertys['real_ip'] = self.real_ip 49 | propertys['cdn_ip'] = self.cdn_ip 50 | propertys['method'] = self.method 51 | propertys['access_time'] = self.access_time 52 | propertys['request_url'] = self.request_url 53 | propertys['reference_url'] = self.reference_url 54 | propertys['response_status'] = self.response_status 55 | propertys['browser'] = self.browser 56 | propertys['bbytes'] = self.bbytes 57 | return propertys 58 | 59 | 
def parser_ua(self,ua_string): 60 | info = {} 61 | info['spider'] = False 62 | msg = ua_parse(ua_string) 63 | 64 | if msg.is_pc: 65 | info['dev'] = 'PC' 66 | elif msg.is_tablet: 67 | info['dev'] = 'Pad' 68 | elif msg.is_mobile: 69 | info['dev'] = 'MObile' 70 | else: 71 | info['dev'] = 'Unknow' 72 | 73 | if msg.is_bot: 74 | info['spider'] = True 75 | info["type"] = msg.os.family+' '+str(msg.os.version_string) 76 | 77 | info["ua"] = msg.browser.family+' '+str(msg.browser.version_string) 78 | return info 79 | 80 | 81 | @property 82 | def real_ip(self): 83 | return self._real_ip 84 | 85 | @real_ip.setter 86 | def real_ip(self, real_ip): 87 | self._real_ip = real_ip.split(', ')[0] 88 | 89 | @property 90 | def browser(self): 91 | return self._browser 92 | 93 | @browser.setter 94 | def browser(self, browser): 95 | self._browser = browser 96 | 97 | @property 98 | def response_status(self): 99 | return self._response_status 100 | 101 | @response_status.setter 102 | def response_status(self, response_status): 103 | self._response_status = response_status 104 | 105 | @property 106 | def reference_url(self): 107 | return self._reference_url 108 | 109 | @reference_url.setter 110 | def reference_url(self, reference_url): 111 | """解析外链URL 112 | 只需要解析后的域名, 如: 113 | 传入: http://www.ttmark.com/diannao/2014/11/04/470.html 114 | 解析成: www.ttmark.com 115 | """ 116 | proto, rest = urllib.splittype(reference_url) 117 | res, rest = urllib.splithost(rest) 118 | if not res: 119 | self._reference_url = '-' 120 | else: 121 | self._reference_url = res 122 | 123 | @property 124 | def request_url(self): 125 | return self._request_url 126 | 127 | @request_url.setter 128 | def request_url(self, request_url): 129 | """解析请求的URL 130 | 只需要解析后的URL路径不需要参数, 如: 131 | 传入: /wp-admin/admin-ajax.php?postviews_id=1348 132 | 解析成: /wp-admin/admin-ajax.php 133 | 134 | proto, rest = urllib.splittype(request_url) 135 | url_path, url_param = urllib.splitquery(rest) 136 | 137 | if url_path.startswith('/tag/'): 138 | url_path = '/tag/' 139 | """ 140 | self._request_url = request_url 141 | 142 | @property 143 | def access_time(self): 144 | return str(self._access_time) 145 | 146 | @access_time.setter 147 | def access_time(self, access_time): 148 | # Nginx log 解析日志格式 149 | input_datetime_format = '%d/%b/%Y:%H:%M:%S' 150 | self._access_time = datetime.datetime.strptime( access_time,input_datetime_format) 151 | 152 | @property 153 | def cdn_ip(self): 154 | return self._cdn_ip 155 | 156 | @cdn_ip.setter 157 | def cdn_ip(self, cdn_ip): 158 | self._cdn_ip = cdn_ip 159 | 160 | 161 | if __name__ == '__main__': 162 | pass 163 | -------------------------------------------------------------------------------- /module/tomcat_parser.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | #-*- coding:utf-8 -*- 3 | 4 | import datetime 5 | import urllib,json 6 | from common.convert import urldecode,htmlunescape 7 | try: 8 | from user_agents import parse as ua_parse 9 | except: 10 | print "try to pip install pyyaml ua-parser user-agents" 11 | 12 | class TomcatParser(object): 13 | """将 Nginx 日志解析成多个字段""" 14 | 15 | def __init__(self): 16 | self._cdn_ip = '' # CDN请求IP 17 | self._access_time = '' # 请求时间 18 | self._request_url = '' # 请求的URL 19 | self._reference_url = '' # 外链URL 20 | self._response_status = '' # NG 响应状态码 21 | self._browser = '' # 用户使用的浏览器 22 | self._real_ip = '' # 用户真实IP 23 | self._mthod = '' # 请求方式 24 | self._bbytes = '' # 内容大小 25 | 26 | def parse(self, line): 27 | """ 28 | 通过传入的一行数据进行解析 29 | 
""" 30 | line_item = line.strip().split() 31 | # 获取临时的 CDN IP 和 访问文件 32 | self.real_ip = line_item[0] # 请求IP 33 | self.access_time = line_item[3].replace("[","") # 请求时间 34 | self.method = line_item[5].strip('"') # 请求方式 35 | self.request_url = line_item[6] # 请求的URL 36 | self.response_status = line_item[8] # NG 响应状态码 37 | self.bbytes = line_item[9] # NG 响应状态码 38 | 39 | def to_dict(self): 40 | """ 41 | 将属性(@property)的转化为dict输出 42 | """ 43 | propertys = {} 44 | 45 | propertys['real_ip'] = self.real_ip 46 | propertys['cdn_ip'] = self.cdn_ip 47 | propertys['method'] = self.method 48 | propertys['access_time'] = self.access_time 49 | propertys['request_url'] = self.request_url 50 | propertys['reference_url'] = self.reference_url 51 | propertys['response_status'] = self.response_status 52 | propertys['browser'] = self.browser 53 | propertys['bbytes'] = self.bbytes 54 | return propertys 55 | 56 | def parser_ua(self,ua_string): 57 | info = {} 58 | info['spider'] = False 59 | msg = ua_parse(ua_string) 60 | 61 | if msg.is_pc: 62 | info['dev'] = 'PC' 63 | elif msg.is_tablet: 64 | info['dev'] = 'Pad' 65 | elif msg.is_mobile: 66 | info['dev'] = 'MObile' 67 | else: 68 | info['dev'] = 'Unknow' 69 | 70 | if msg.is_bot: 71 | info['spider'] = True 72 | info["type"] = msg.os.family+' '+str(msg.os.version_string) 73 | 74 | info["ua"] = msg.browser.family+' '+str(msg.browser.version_string) 75 | return info 76 | 77 | 78 | @property 79 | def real_ip(self): 80 | return self._real_ip 81 | 82 | @real_ip.setter 83 | def real_ip(self, real_ip): 84 | self._real_ip = real_ip.split(', ')[0] 85 | 86 | @property 87 | def browser(self): 88 | return self._browser 89 | 90 | @browser.setter 91 | def browser(self, browser): 92 | self._browser = browser 93 | 94 | @property 95 | def response_status(self): 96 | return self._response_status 97 | 98 | @response_status.setter 99 | def response_status(self, response_status): 100 | self._response_status = response_status 101 | 102 | @property 103 | def reference_url(self): 104 | return self._reference_url 105 | 106 | @reference_url.setter 107 | def reference_url(self, reference_url): 108 | """ 109 | 解析外链URL 110 | 只需要解析后的域名, 如: 111 | 传入: http://www.ttmark.com/diannao/2014/11/04/470.html 112 | 解析成: www.ttmark.com 113 | """ 114 | proto, rest = urllib.splittype(reference_url) 115 | res, rest = urllib.splithost(rest) 116 | if not res: 117 | self._reference_url = '-' 118 | else: 119 | self._reference_url = res 120 | 121 | @property 122 | def request_url(self): 123 | return self._request_url 124 | 125 | @request_url.setter 126 | def request_url(self, request_url): 127 | """ 128 | 解析请求的URL 129 | 只需要解析后的URL路径不需要参数, 如: 130 | 传入: /wp-admin/admin-ajax.php?postviews_id=1348 131 | 解析成: /wp-admin/admin-ajax.php 132 | 133 | proto, rest = urllib.splittype(request_url) 134 | url_path, url_param = urllib.splitquery(rest) 135 | 136 | if url_path.startswith('/tag/'): 137 | url_path = '/tag/' 138 | """ 139 | self._request_url = request_url 140 | 141 | @property 142 | def access_time(self): 143 | return str(self._access_time) 144 | 145 | @access_time.setter 146 | def access_time(self, access_time): 147 | # Nginx log 解析日志格式 148 | input_datetime_format = '%d/%b/%Y:%H:%M:%S' 149 | self._access_time = datetime.datetime.strptime( access_time,input_datetime_format) 150 | 151 | @property 152 | def cdn_ip(self): 153 | return self._cdn_ip 154 | 155 | @cdn_ip.setter 156 | def cdn_ip(self, cdn_ip): 157 | self._cdn_ip = cdn_ip 158 | --------------------------------------------------------------------------------