├── GeoIPData
│   ├── GeoIPASNum.dat
│   ├── GeoLite2-City.mmdb
│   └── GeoLiteCity.dat
├── README.MD
├── common
│   ├── GeoIPUtils.py
│   ├── __init__.py
│   ├── convert.py
│   ├── logger.py
│   ├── rule.json
│   ├── threadpool.py
│   └── units.py
├── main.py
└── module
    ├── __init__.py
    ├── apache_parser.py
    ├── iis_parser.py
    ├── nginx_parser.py
    └── tomcat_parser.py

/GeoIPData/GeoIPASNum.dat:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/0xa-saline/Logpara/b0be8febb7a9287157a119d5eb7b1dc8bc988a29/GeoIPData/GeoIPASNum.dat
--------------------------------------------------------------------------------
/GeoIPData/GeoLite2-City.mmdb:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/0xa-saline/Logpara/b0be8febb7a9287157a119d5eb7b1dc8bc988a29/GeoIPData/GeoLite2-City.mmdb
--------------------------------------------------------------------------------
/GeoIPData/GeoLiteCity.dat:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/0xa-saline/Logpara/b0be8febb7a9287157a119d5eb7b1dc8bc988a29/GeoIPData/GeoLiteCity.dat
--------------------------------------------------------------------------------
/README.MD:
--------------------------------------------------------------------------------
1 | ## Logpara
2 | > A rough demo that parses and processes common web server logs.
3 | 
4 | [![Python 2.7](https://img.shields.io/badge/python-2.7-yellow.svg)](https://www.python.org/) [![License](https://img.shields.io/badge/license-GPLv2-red.svg)](https://github.com/0xa-saline/Logpara/blob/master/GPL-2.0)
5 | 
6 | #### Goals
7 | - Parse each requested URL and flag whether it matches common attack patterns
8 | - Enrich each visiting IP with longitude/latitude and its physical location
9 | - Parse each visiting User-Agent to identify the device, the browser family, and whether it is a crawler
10 | - Parse every log entry into storage for RELK processing
11 | 
12 | 
13 | ### TO DO
14 | - Process and visualize the logs stored in elasticsearch
15 | 
16 | 
17 | ### Usage
18 | - Before use, edit common/units.py
19 | ```
20 | redis_host = '192.168.87.222'
21 | redis_port = 6379
22 | redis_pass = 'cft67ygv'
23 | redis_db = 0
24 | redis_key = 'logstash:redis'
25 | ```
26 | 
27 | - Run
28 | 
29 | ```
30 | Usage: main.py --type IIS|Apache|Tomcat|Nginx --file file|directory
31 | 
32 | log parser
33 | 
34 | Options:
35 |   -h, --help   show this help message and exit
36 |   --type=TYPE  chose which log type
37 |   --file=FILE  chose file or directory
38 | ```
39 | 
40 | What is RELK
41 | ----
42 | * elasticsearch
43 | * logstash
44 | * kibana
45 | * redis
46 | 
47 | 
48 | RELK is simply the initials of the programs above; spelling them all out every time gets tiresome, so this shorthand is used instead.
49 | 
50 | Server requirements
51 | ----
52 | ```
53 | one machine running elk+redis, referred to below as the ELK machine
54 | ```
55 | 
56 | RELK machine setup
57 | ----
58 | 
59 | ##### Set a password for redis
60 | 
61 | ##### Edit the ElasticSearch config `sudo vim /etc/elasticsearch/elasticsearch.yml`, search for `network.host`, and change it as follows
62 | ```
63 | network.host: localhost
64 | ```
65 | 
66 | ##### Add a Logstash config `sudo vim /etc/logstash/conf.d/config.conf`
67 | ```
68 | input {
69 |     redis {
70 |         host => '127.0.0.1'
71 |         password => 'password'
72 |         data_type => 'list'
73 |         key => 'logstash:redis'
74 |     }
75 | }
76 | output {
77 |     elasticsearch { hosts => localhost }
78 |     stdout { codec => rubydebug }
79 | }
80 | ```
81 | 
82 | ##### Run Logstash
83 | ```
84 | sudo nohup /opt/logstash/bin/logstash -f /etc/logstash/conf.d/ &
85 | ```
86 | 
87 | ##### Edit the Kibana config `sudo vim /opt/kibana/config/kibana.yml`, search for `server.host`, and change it as follows:
88 | ```
89 | server.host: "0.0.0.0"
90 | ```
91 | 
92 | ##### Restart each of the services above after changing its configuration.
93 | 
94 | 
95 | Workflow
96 | 
97 | ## FAQ
98 | - Logs in non-standard formats are not recognized?
99 | That is inconvenient, but you will have to adjust the regular expressions yourself to match them.
100 | - No weblogic/oracle log module?
101 | There is no test environment for those at the moment.
102 | 
103 | 
104 | ## Disclaimer
105 | This log-analysis project is intended for learning and for auditing the security of your own systems only; any other use is prohibited.
106 | 
--------------------------------------------------------------------------------
/common/GeoIPUtils.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | #-*- coding:utf-8 -*-
3 | import pygeoip, geoip2.database, os, re
4 | 
5 | class GeoIPUtil():
6 |     def __init__(self):
7 |         self.geocity = pygeoip.GeoIP(os.path.join(os.path.split(os.path.realpath(__file__))[0], '../GeoIPData', 'GeoLiteCity.dat'))
8 |         self.geoasn = pygeoip.GeoIP(os.path.join(os.path.split(os.path.realpath(__file__))[0], '../GeoIPData', 'GeoIPASNum.dat'))
9 |         self.as_re = re.compile('AS(?P<num>\d+)(?: (?P<name>.+))?')
10 | 
11 |     def get_AS_info_by_ip(self, ip):
12 |         asn = self.geoasn.asn_by_addr(ip)
13 |         if asn != None:
14 |             (asnum, asname) = self.as_re.match(asn).groups()
15 |         else:
16 |             (asnum, asname) = (0, 'None')
17 |         return (asnum, asname)
18 | 
19 |     def get_lat_alt(self, ip):
20 |         loc = self.geocity.record_by_name(ip)
21 |         if loc is None:
22 |             return None
23 |         return [loc['longitude'], loc['latitude']]
24 | 
25 | class GeoIP2Util():
26 |     def __init__(self):
27 |         self.reader = geoip2.database.Reader(os.path.join(os.path.split(os.path.realpath(__file__))[0], '../GeoIPData', 'GeoLite2-City.mmdb'))
28 | 
29 |     def get_lat_alt(self, ip):
30 |         response = self.reader.city(ip)
31 |         if response is None:
32 |             return None
33 |         return [response.location.longitude, response.location.latitude]
34 | 
35 |     def get_ip_location(self, ip):
36 |         response = self.reader.city(ip)
37 | 
38 |         if response.country.names.has_key('zh-CN'):
39 |             country = response.country.names['zh-CN']
40 |         else:
41 |             country = response.country.name
42 |         if response.subdivisions.most_specific.names.has_key('zh-CN'):
43 |             subdivision = response.subdivisions.most_specific.names['zh-CN']
44 |         else:
45 |             subdivision = response.subdivisions.most_specific.name
46 |         if response.city.names.has_key('zh-CN'):
47 |             city = response.city.names['zh-CN']
48 |         else:
49 |             city = response.city.name
50 |         return (country, subdivision, city)
51 | 
52 | if __name__ == '__main__':
53 |     gi = GeoIPUtil()
54 |     longitude, latitude = gi.get_lat_alt('220.181.171.119')
55 |     print latitude, ",", longitude
56 |     gi = GeoIP2Util()
57 |     #longitude1,latitude1 = gi.get_lat_alt('220.181.171.119')
58 |     #print longitude1,latitude1
59 |     country, subdivision, city = gi.get_ip_location('220.181.171.119')
60 |     print country, subdivision, city
61 | 
--------------------------------------------------------------------------------
/common/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/0xa-saline/Logpara/b0be8febb7a9287157a119d5eb7b1dc8bc988a29/common/__init__.py
--------------------------------------------------------------------------------
/common/convert.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # from sqlmap
3 | try:
4 |     import cPickle as pickle
5 | except:
6 |     import pickle
7 | finally:
8 |     import pickle as picklePy
9 | import urllib
10 | import base64
11 | import json
12 | import re
13 | import StringIO
14 | import sys
15 | import subprocess
16 | 
17 | # System variables
18 | IS_WIN = subprocess.mswindows
19 | # Encoding used for Unicode data
20 | UNICODE_ENCODING = "utf8"
21 | 
22 | # Chars which can be used as failsafe values in case of too long URL encoding value
23 | URLENCODE_FAILSAFE_CHARS = '()|,'
24 | 
25 | # Table used for
Base64 conversion in WordPress hash cracking routine 26 | 27 | class PLACE: 28 | GET = "GET" 29 | POST = "POST" 30 | URI = "URI" 31 | COOKIE = "Cookie" 32 | USER_AGENT = "User-Agent" 33 | REFERER = "Referer" 34 | HOST = "Host" 35 | CUSTOM_POST = "(custom) POST" 36 | CUSTOM_HEADER = "(custom) HEADER" 37 | 38 | def base64decode(value): 39 | """ 40 | Decodes string value from Base64 to plain format 41 | 42 | >>> base64decode('Zm9vYmFy') 43 | 'foobar' 44 | """ 45 | 46 | return base64.b64decode(value) 47 | 48 | def base64encode(value): 49 | """ 50 | Encodes string value from plain to Base64 format 51 | 52 | >>> base64encode('foobar') 53 | 'Zm9vYmFy' 54 | """ 55 | 56 | return base64.b64encode(value) 57 | 58 | def base64pickle(value): 59 | """ 60 | Serializes (with pickle) and encodes to Base64 format supplied (binary) value 61 | 62 | >>> base64pickle('foobar') 63 | 'gAJVBmZvb2JhcnEBLg==' 64 | """ 65 | 66 | retVal = None 67 | 68 | try: 69 | retVal = base64encode(pickle.dumps(value, pickle.HIGHEST_PROTOCOL)) 70 | except: 71 | warnMsg = "problem occurred while serializing " 72 | warnMsg += "instance of a type '%s'" % type(value) 73 | singleTimeWarnMessage(warnMsg) 74 | 75 | try: 76 | retVal = base64encode(pickle.dumps(value)) 77 | except: 78 | retVal = base64encode(pickle.dumps(str(value), pickle.HIGHEST_PROTOCOL)) 79 | 80 | return retVal 81 | 82 | def base64unpickle(value, unsafe=False): 83 | """ 84 | Decodes value from Base64 to plain format and deserializes (with pickle) its content 85 | 86 | >>> base64unpickle('gAJVBmZvb2JhcnEBLg==') 87 | 'foobar' 88 | """ 89 | 90 | retVal = None 91 | 92 | def _(self): 93 | if len(self.stack) > 1: 94 | func = self.stack[-2] 95 | if func not in PICKLE_REDUCE_WHITELIST: 96 | raise Exception, "abusing reduce() is bad, Mkay!" 
97 | self.load_reduce() 98 | 99 | def loads(str): 100 | f = StringIO.StringIO(str) 101 | if unsafe: 102 | unpickler = picklePy.Unpickler(f) 103 | unpickler.dispatch[picklePy.REDUCE] = _ 104 | else: 105 | unpickler = pickle.Unpickler(f) 106 | return unpickler.load() 107 | 108 | try: 109 | retVal = loads(base64decode(value)) 110 | except TypeError: 111 | retVal = loads(base64decode(bytes(value))) 112 | 113 | return retVal 114 | 115 | def hexdecode(value): 116 | """ 117 | Decodes string value from hex to plain format 118 | 119 | >>> hexdecode('666f6f626172') 120 | 'foobar' 121 | """ 122 | 123 | value = value.lower() 124 | return (value[2:] if value.startswith("0x") else value).decode("hex") 125 | 126 | def hexencode(value): 127 | """ 128 | Encodes string value from plain to hex format 129 | 130 | >>> hexencode('foobar') 131 | '666f6f626172' 132 | """ 133 | 134 | return utf8encode(value).encode("hex") 135 | 136 | def unicodeencode(value, encoding=None): 137 | """ 138 | Returns 8-bit string representation of the supplied unicode value 139 | 140 | >>> unicodeencode(u'foobar') 141 | 'foobar' 142 | """ 143 | 144 | retVal = value 145 | if isinstance(value, unicode): 146 | try: 147 | retVal = value.encode(encoding or UNICODE_ENCODING) 148 | except UnicodeEncodeError: 149 | retVal = value.encode(UNICODE_ENCODING, "replace") 150 | return retVal 151 | 152 | 153 | def unicode_encode(value, encoding=None): 154 | """ 155 | Return 8-bit string representation of the supplied unicode value: 156 | 157 | >>> unicode_encode(u'test') 158 | 'test' 159 | """ 160 | 161 | ret_val = value 162 | if isinstance(value, unicode): 163 | try: 164 | ret_val = value.encode(encoding or UNICODE_ENCODING) 165 | except UnicodeEncodeError: 166 | ret_val = value.encode(UNICODE_ENCODING, "replace") 167 | return ret_val 168 | 169 | 170 | def utf8encode(value): 171 | return unicode_encode(value, "utf-8") 172 | 173 | 174 | def utf8decode(value): 175 | """ 176 | Returns UTF-8 representation of the supplied 8-bit string representation 177 | 178 | >>> utf8decode('foobar') 179 | u'foobar' 180 | """ 181 | 182 | return value.decode("utf-8") 183 | 184 | 185 | def urldecode(value, encoding=None): 186 | """ 187 | URL decodes given value 188 | >>> urldecode('AND%201%3E%282%2B3%29%23', convall=True) 189 | u'AND 1>(2+3)#' 190 | """ 191 | result = None 192 | 193 | if value: 194 | try: 195 | # for cases like T%C3%BCrk%C3%A7e 196 | value = str(value) 197 | except ValueError: 198 | pass 199 | finally: 200 | result = urllib.unquote_plus(value) 201 | 202 | if isinstance(result, str): 203 | result = unicode(result, encoding or UNICODE_ENCODING, errors="replace") 204 | 205 | return result 206 | 207 | 208 | def urlencode(value, safe="%&=", convall=False, limit=False): 209 | """ 210 | URL encodes given value 211 | >>> urlencode('AND 1>(2+3)#') 212 | 'AND%201%3E%282%2B3%29%23' 213 | """ 214 | count = 0 215 | result = None 216 | 217 | if value is None: 218 | return result 219 | 220 | if convall or safe is None: 221 | safe = "" 222 | 223 | # corner case when character % really needs to be 224 | # encoded (when not representing url encoded char) 225 | if all(map(lambda x: '%' in x, [safe, value])): 226 | value = re.sub("%(?![0-9a-fA-F]{2})", "%25", value, re.DOTALL | re.IGNORECASE) 227 | 228 | while True: 229 | result = urllib.quote(utf8_encode(value), safe) 230 | 231 | if limit and len(result) > URLENCODE_CHAR_LIMIT: 232 | if count >= len(URLENCODE_FAILSAFE_CHARS): 233 | break 234 | 235 | while count < len(URLENCODE_FAILSAFE_CHARS): 236 | safe += 
URLENCODE_FAILSAFE_CHARS[count] 237 | count += 1 238 | if safe[-1] in value: 239 | break 240 | else: 241 | break 242 | 243 | return result 244 | 245 | def htmlunescape(value): 246 | """ 247 | Returns (basic conversion) HTML unescaped value 248 | 249 | >>> htmlunescape('a<b') 250 | 'a'), ('"', '"'), (' ', ' '), ('&', '&')) 256 | retVal = reduce(lambda x, y: x.replace(y[0], y[1]), codes, retVal) 257 | try: 258 | retVal = re.sub(r"&#x([^ ;]+);", lambda match: unichr(int(match.group(1), 16)), retVal) 259 | except ValueError: 260 | pass 261 | return retVal 262 | 263 | def singleTimeWarnMessage(message): # Cross-linked function 264 | sys.stdout.write(message) 265 | sys.stdout.write("\n") 266 | sys.stdout.flush() 267 | 268 | def stdoutencode(data): 269 | retVal = None 270 | 271 | try: 272 | data = data or "" 273 | 274 | # Reference: http://bugs.python.org/issue1602 275 | if IS_WIN: 276 | output = data.encode(sys.stdout.encoding, "replace") 277 | 278 | if '?' in output and '?' not in data: 279 | warnMsg = "cannot properly display Unicode characters " 280 | warnMsg += "inside Windows OS command prompt " 281 | warnMsg += "(http://bugs.python.org/issue1602). All " 282 | warnMsg += "unhandled occurances will result in " 283 | warnMsg += "replacement with '?' character. Please, find " 284 | warnMsg += "proper character representation inside " 285 | warnMsg += "corresponding output files. " 286 | singleTimeWarnMessage(warnMsg) 287 | 288 | retVal = output 289 | else: 290 | retVal = data.encode(sys.stdout.encoding) 291 | except: 292 | retVal = data.encode(UNICODE_ENCODING) if isinstance(data, unicode) else data 293 | 294 | return retVal 295 | 296 | def jsonize(data): 297 | """ 298 | Returns JSON serialized data 299 | 300 | >>> jsonize({'foo':'bar'}) 301 | '{\\n "foo": "bar"\\n}' 302 | """ 303 | 304 | return json.dumps(data, sort_keys=False, indent=4) 305 | 306 | def dejsonize(data): 307 | """ 308 | Returns JSON deserialized data 309 | 310 | >>> dejsonize('{\\n "foo": "bar"\\n}') 311 | {u'foo': u'bar'} 312 | """ 313 | 314 | return json.loads(data) 315 | 316 | 317 | def to_param_dict(params): 318 | """a=1&b=2 to {'a':1,'b':2}""" 319 | param_dict = {} 320 | if not params: 321 | return param_dict 322 | try: 323 | split_params = params.split('&') 324 | for element in split_params: 325 | elem = element.split("=") 326 | if len(elem) >= 2: 327 | parameter = elem[0].replace(" ", "") 328 | value = "=".join(elem[1:]) 329 | param_dict[parameter] = value 330 | except: 331 | pass 332 | 333 | return param_dict 334 | 335 | def to_param_str(param_dict): 336 | """{'a':1,'b':2} to a=1&b=2""" 337 | params = '&'.join([k + '=' + v for k, v in param_dict.items()]) 338 | return params 339 | 340 | if __name__ == '__main__': 341 | url = '<?xml version="1.0" encoding="UTF-8"?><collection><element ID="UserName" Type="String"><![CDATA[test]]></element><element ID="Password" Type="String"><![CDATA[1234]]></element><element ID="VerifyCode" Type="String"><![CDATA[3kv3]]></element><element ID="LoginImg" Type="String"><![CDATA[]]></element><element ID="Cancel" Type="String"><![CDATA[]]></element></collection>' 342 | print htmlunescape(url) -------------------------------------------------------------------------------- /common/logger.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | # 4 | # Author: fuck@0day5.com 5 | # 6 | import time 7 | import ctypes,sys 8 | import platform 9 | 10 | def get_current_isostr(): 11 | iso = 
time.strftime("%Y-%m-%dT%H:%M:%S", time.localtime()) 12 | return iso 13 | 14 | if platform.system()=='Linux' or platform.system()=='Darwin': 15 | class colors: 16 | BLACK = '\033[0;30m' 17 | DARK_GRAY = '\033[1;30m' 18 | LIGHT_GRAY = '\033[0;37m' 19 | BLUE = '\033[0;34m' 20 | LIGHT_BLUE = '\033[1;34m' 21 | GREEN = '\033[0;32m' 22 | LIGHT_GREEN = '\033[1;32m' 23 | CYAN = '\033[0;36m' 24 | LIGHT_CYAN = '\033[1;36m' 25 | RED = '\033[0;31m' 26 | LIGHT_RED = '\033[1;31m' 27 | PURPLE = '\033[0;35m' 28 | LIGHT_PURPLE = '\033[1;35m' 29 | BROWN = '\033[0;33m' 30 | YELLOW = '\033[1;33m' 31 | WHITE = '\033[1;37m' 32 | DEFAULT_COLOR = '\033[00m' 33 | RED_BOLD = '\033[01;31m' 34 | ENDC = '\033[0m' 35 | 36 | def print_error(mess): 37 | mess=mess.strip('\r\n') 38 | print colors.RED+get_current_isostr() + mess + colors.ENDC 39 | 40 | def print_warm(mess): 41 | mess=mess.strip('\r\n') 42 | print colors.LIGHT_PURPLE + get_current_isostr()+ mess+ colors.ENDC 43 | 44 | def print_debug(mess): 45 | mess=mess.strip('\r\n') 46 | print colors.GREEN + get_current_isostr()+ mess + colors.ENDC 47 | 48 | 49 | if platform.system()=='Windows': 50 | STD_INPUT_HANDLE = -10 51 | STD_OUTPUT_HANDLE = -11 52 | STD_ERROR_HANDLE = -12 53 | 54 | FOREGROUND_BLACK = 0x0 55 | FOREGROUND_BLUE = 0x01 # text color contains blue. 56 | FOREGROUND_GREEN = 0x02 # text color contains green. 57 | FOREGROUND_RED = 0x04 # text color contains red. 58 | 59 | FOREGROUND_INTENSITY = 0x08 # text color is intensified. 60 | BACKGROUND_BLUE = 0x10 # background color contains blue. 61 | BACKGROUND_GREEN = 0x20 # background color contains green. 62 | BACKGROUND_RED = 0x40 # background color contains red. 63 | BACKGROUND_INTENSITY = 0x80 # background color is intensified. 64 | 65 | 66 | std_out_handle = ctypes.windll.kernel32.GetStdHandle(STD_OUTPUT_HANDLE) 67 | 68 | def set_cmd_text_color(color, handle=std_out_handle): 69 | Bool = ctypes.windll.kernel32.SetConsoleTextAttribute(handle, color) 70 | return Bool 71 | 72 | def resetColor(): 73 | set_cmd_text_color(FOREGROUND_RED | FOREGROUND_GREEN | FOREGROUND_BLUE) 74 | 75 | def print_error(mess): 76 | set_cmd_text_color(FOREGROUND_RED | FOREGROUND_INTENSITY) 77 | sys.stdout.write("%s %s\n" % (get_current_isostr(), msg)) 78 | resetColor() 79 | 80 | def print_warm(mess): 81 | set_cmd_text_color(FOREGROUND_YELLOW | FOREGROUND_BLUE| FOREGROUND_INTENSITY) 82 | sys.stdout.write("%s %s\n" % (get_current_isostr(), msg)) 83 | resetColor() 84 | 85 | def print_debug(mess): 86 | set_cmd_text_color(FOREGROUND_GREEN | FOREGROUND_INTENSITY) 87 | sys.stdout.write("%s %s\n" % (get_current_isostr(), msg)) 88 | resetColor() 89 | 90 | 91 | if __name__ == '__main__': 92 | mess = "hello,world" 93 | print_error(mess) 94 | print_debug(mess) 95 | print_warm(mess) -------------------------------------------------------------------------------- /common/rule.json: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "patterns": [ 4 | { 5 | "part": "etc.*?issue|etc.*?hosts|etc.*?passwd|windows/win.ini|proc\\/version|WEB-INF\\/web.xml", 6 | "type": "regex" 7 | }, 8 | { 9 | "part":"\\/hosts|\\.bash_history|bashrc|config\\[root_dir]\\=|appserv_root=|path\\[docroot]\\=|GALLERY_BASEDIR\\=|_SERVER\\[DOCUMENT_ROOT]|_CONF\\[path]|mosConfig_absolute_path\\=", 10 | "type":"match" 11 | } 12 | ], 13 | "tag": "file_include", 14 | "level": "high" 15 | }, 16 | { 17 | "patterns": [ 18 | { 19 | "part": 
"\\<.+javascript:window\\[.{1}\\\\x|<.*=(&#\\d+?;?)+?>|\\<.*(data|src)=data:text\\/html.*>|\\b(alert\\(|confirm\\(|expression\\(|s*script\\(|prompt\\(|benchmark\\s*?\\(.*\\))", 20 | "type": "regex" 21 | } 22 | ], 23 | "tag": "XSS", 24 | "level": "medium" 25 | }, 26 | { 27 | "patterns": [ 28 | { 29 | "part": "nslookup|whoami|set|dir|net", 30 | "type": "match" 31 | } 32 | ], 33 | "tag": "command", 34 | "level": "high" 35 | }, 36 | { 37 | "patterns": [ 38 | { 39 | "part": "(sleep\\s*?\\(.*\\)|\\b(group_)?concat[\\s\\/\\*]*?\\([^\\)]+?\\)|\bcase[\\s\/\\*]*?when[\\s\/\\*]*?\\([^\\)]+?\\)|load_file\\s*?\\()|<[a-z]+?\\b[^>]*?\\bon([a-z]{4,})\\s*?=|^\\+\\/v(8|9)|\\b(and|or|OR)\\b\\s*?([\\(\\)'\"\\d]+?=[\\(\\)'\"\\d]+?|[\\(\\)'\"a-zA-Z]+?=[\\(\\)'\"a-zA-Z]+?|>|<|\\s+?[\\w]+?\\s+?\\bin\\b\\s*?\\(|\\blike\\b\\s+?[\"'])|\\/\\*.*\\*\\/|\\bEXEC\\b|UNION.+?SELECT\\s*(\\(.+\\)\\s*|@{1,2}.+?\\s*|\\s+?.+?|(`|'|\").*?(`|'|\")\\s*)|UPDATE\\s*(\\(.+\\)\\s*|@{1,2}.+?\\s*|\\s+?.+?|(`|'|\").*?(`|'|\")\\s*)SET|INSERT\\s+INTO.+?VALUES|(SELECT|DELETE)@{0,2}(\\(.+\\)|\\s+?.+?\\s+?|(`|'|\").*?(`|'|\"))FROM(\\(.+\\)|\\s+?.+?|(`|'|\").*?(`|'|\"))|(CREATE|ALTER|DROP|TRUNCATE)\\s+(TABLE|DATABASE)", 40 | "type": "regex" 41 | }, 42 | { 43 | "part": "(convert\\s*?\\(int,CHAR\\(|\\(CONVERT\\s*?\\(INT|\\b(and|or|OR|xor|XOR|AND|/**/)\\b", 44 | "type": "regex" 45 | }, 46 | { 47 | "part": "\\(SELECT.*FROM\\)|\\(union.*SELECT\\)|\\(select.*load_file\\)", 48 | "type": "regex" 49 | }, 50 | { 51 | "part": "SELECT.*concat\\(", 52 | "type": "regex" 53 | }, 54 | { 55 | "part": "pg_sleep|benchmark\\(|if\\(|shutdown", 56 | "type": "match" 57 | } 58 | ], 59 | "tag": "sqlinject", 60 | "level": "high" 61 | }, 62 | { 63 | "patterns": [ 64 | { 65 | "part": "_memberAccess", 66 | "type": "match" 67 | }, 68 | { 69 | "part": "debug.*expression", 70 | "type": "regex" 71 | }, 72 | { 73 | "part": "_memberAccess.*java.lang.Runtime", 74 | "type": "regex" 75 | } 76 | ], 77 | "tag": "struts2", 78 | "level": "high" 79 | }, 80 | { 81 | "patterns": [ 82 | { 83 | "part": "assert\\(|eval\\(|phpinfo\\(|echo\\(|print\\(|var_dump\\(|print_r\\(|Execute\\(|Response.Write\\(|z0=GB2312|z0=UTF-8|\\${new java.lang", 84 | "type": "regex" 85 | } 86 | ], 87 | "tag": "code_excute", 88 | "level": "high" 89 | }, 90 | { 91 | "patterns": [ 92 | { 93 | "part": "shell|allow_url_include|auto_prepend_file|php://input", 94 | "type": "match" 95 | } 96 | ], 97 | "tag": "code_rce", 98 | "level": "high" 99 | }, 100 | { 101 | "patterns": [ 102 | { 103 | "part": "java.lang.String.*println", 104 | "type": "regex" 105 | }, 106 | { 107 | "part": "java.lang.ProcessBuilder", 108 | "type": "match" 109 | } 110 | ], 111 | "tag": "java expression", 112 | "level": "high" 113 | }, 114 | { 115 | "patterns": [ 116 | { 117 | "part": ".git/config", 118 | "type": "match" 119 | } 120 | ], 121 | "tag": "git", 122 | "level": "high" 123 | }, 124 | { 125 | "patterns": [ 126 | { 127 | "part": ".DS_Store", 128 | "type": "match" 129 | } 130 | ], 131 | "tag": "DS_Store", 132 | "level": "high" 133 | }, 134 | { 135 | "patterns": [ 136 | { 137 | "part": ".svn\\\/(all-wcprops|all-wcpropss|entries|trunk)", 138 | "type": "match" 139 | } 140 | ], 141 | "tag": "svn", 142 | "level": "high" 143 | }, 144 | { 145 | "patterns": [ 146 | { 147 | "part": "vul_webscan", 148 | "type": "match" 149 | } 150 | ], 151 | "tag": "360Webscan", 152 | "level": "low" 153 | }, 154 | { 155 | "patterns": [ 156 | { 157 | "part": "dbappsecurity|dbappsec|dbapp|\"%d5\\'|%21(()%26%26%21%7c*%7c*%7c|(()))", 158 | "type": "match" 159 | } 160 | ], 161 
| "tag": "anhengWebscan", 162 | "level": "low" 163 | }, 164 | { 165 | "patterns": [ 166 | { 167 | "part": "vulnweb.com|acunetix|bxss.me|injected_by_wvs|wvstest", 168 | "type": "match" 169 | } 170 | ], 171 | "tag": "AWVS", 172 | "level": "low" 173 | }, 174 | { 175 | "patterns": [ 176 | { 177 | "part": "crossdomain.xml", 178 | "type": "match" 179 | } 180 | ], 181 | "tag": "crossdomain.xml", 182 | "level": "low" 183 | }, 184 | { 185 | "patterns": [ 186 | { 187 | "part": "CVS/ROOT", 188 | "type": "match" 189 | } 190 | ], 191 | "tag": "cvs_root", 192 | "level": "low" 193 | }, 194 | { 195 | "patterns": [ 196 | { 197 | "part": "\\/axis\\/services", 198 | "type": "match" 199 | } 200 | ], 201 | "tag": "Aixs", 202 | "level": "low" 203 | }, 204 | { 205 | "patterns": [ 206 | { 207 | "part": "\\/server-status", 208 | "type": "match" 209 | } 210 | ], 211 | "tag": "Apache status", 212 | "level": "low" 213 | }, 214 | { 215 | "patterns": [ 216 | { 217 | "part": "~.aspx", 218 | "type": "match" 219 | } 220 | ], 221 | "tag": "IIS short", 222 | "level": "low" 223 | }, 224 | { 225 | "patterns": [ 226 | { 227 | "part": "services\\/listServices", 228 | "type": "match" 229 | } 230 | ], 231 | "tag": "services scan", 232 | "level": "low" 233 | } 234 | ] 235 | -------------------------------------------------------------------------------- /common/threadpool.py: -------------------------------------------------------------------------------- 1 | # coding:utf-8 2 | # 模拟一个进城池 线程池,可以向里面添加任务, 3 | 4 | import threading 5 | import time 6 | import traceback 7 | import logger 8 | import Queue 9 | import random 10 | 11 | class new_threadpool: 12 | 13 | def __init__(self,threadnum,func_scan,Isjoin = False): 14 | self.thread_count = self.thread_nums = threadnum 15 | self.scan_count_lock = threading.Lock() 16 | self.thread_count_lock = threading.Lock() 17 | self.load_lock = threading.Lock() 18 | self.scan_count = 0 19 | self.isContinue = True 20 | self.func_scan = func_scan 21 | self.queue = Queue.Queue() 22 | self.isjoin = Isjoin 23 | 24 | def push(self,payload): 25 | self.queue.put(payload) 26 | 27 | def changeScanCount(self,num): 28 | self.scan_count_lock.acquire() 29 | self.scan_count += num 30 | self.scan_count_lock.release() 31 | 32 | def changeThreadCount(self,num): 33 | self.thread_count_lock.acquire() 34 | self.thread_count += num 35 | self.thread_count_lock.release() 36 | 37 | def run(self): 38 | th = [] 39 | for i in range(self.thread_nums): 40 | t = threading.Thread(target=self.scan) 41 | t.setDaemon(True) 42 | t.start() 43 | th.append(t) 44 | 45 | # It can quit with Ctrl-C 46 | if self.isjoin: 47 | for tt in th: 48 | tt.join() 49 | else: 50 | while 1: 51 | if self.thread_count > 0 and self.isContinue: 52 | time.sleep(0.01) 53 | else: 54 | break 55 | 56 | def stop(self): 57 | self.load_lock.acquire() 58 | self.isContinue = False 59 | self.load_lock.release() 60 | 61 | def scan(self): 62 | while 1: 63 | self.load_lock.acquire() 64 | if self.queue.qsize() > 0 and self.isContinue: 65 | payload = self.queue.get() 66 | 67 | self.load_lock.release() 68 | else: 69 | self.load_lock.release() 70 | break 71 | try: 72 | # 在执行时报错如果不被处理,线程会停止并退出 73 | self.func_scan(payload) 74 | time.sleep(0.3) 75 | except KeyboardInterrupt: 76 | self.isContinue = False 77 | raise KeyboardInterrupt 78 | except Exception: 79 | errmsg = traceback.format_exc() 80 | self.isContinue = False 81 | print_error(errmsg) 82 | 83 | self.changeThreadCount(-1) 84 | 85 | 86 | if __name__ == '__main__': 87 | def calucator(args): 88 | num,numt = args 89 | print 
numt 90 | i = random.randint(1, 100) 91 | u = num 92 | a = i * u 93 | if (a % 6 == 0): 94 | for x in range(5): 95 | print "new thread",x 96 | #p.push(x) 97 | 98 | p = new_threadpool(3, calucator) 99 | for i in range(20): 100 | args=(i,i+1,) 101 | p.push(args) 102 | p.run() 103 | 104 | -------------------------------------------------------------------------------- /common/units.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | #-*- coding:utf-8 -*- 3 | import redis 4 | import os,re,json 5 | from GeoIPUtils import GeoIPUtil,GeoIP2Util 6 | from convert import urldecode,htmlunescape 7 | try: 8 | from user_agents import parse as ua_parse 9 | except: 10 | print "try to pip install pyyaml ua-parser user-agents" 11 | 12 | import sys 13 | default_encoding = 'utf-8' 14 | if sys.getdefaultencoding() != default_encoding: 15 | reload(sys) 16 | sys.setdefaultencoding(default_encoding) 17 | 18 | EXCLUDE_EXTENSIONS = ("ico","3ds", "3g2", "3gp", "7z", "DS_Store", "a", "aac", "adp", "ai", "aif", "aiff", "apk", "ar", "asf", "au", "avi", "bak", "bin", "bk", "bmp", "btif", "bz2", "cab", "caf", "cgm", "cmx", "cpio", "cr2", "dat", "deb", "djvu", "dll", "dmg", "dmp", "dng", "doc", "docx", "dot", "dotx", "dra", "dsk", "dts", "dtshd", "dvb", "dwg", "dxf", "ear", "ecelp4800", "ecelp7470", "ecelp9600", "egg", "eol", "eot", "epub", "exe", "f4v", "fbs", "fh", "fla", "flac", "fli", "flv", "fpx", "fst", "fvt", "g3", "gif", "gz", "h261", "h263", "h264", "ico", "ief", "image", "img", "ipa", "iso", "jar", "jpeg", "jpg", "jpgv", "jpm", "jxr", "ktx", "lvp", "lz", "lzma", "lzo", "m3u", "m4a", "m4v", "mar", "mdi", "mid", "mj2", "mka", "mkv", "mmr", "mng", "mov", "movie", "mp3", "mp4", "mp4a", "mpeg", "mpg", "mpga", "mxu", "nef", "npx", "o", "oga", "ogg", "ogv", "otf", "pbm", "pcx", "pdf", "pea", "pgm", "pic", "png", "pnm", "ppm", "pps", "ppt", "pptx", "ps", "psd", "pya", "pyc", "pyo", "pyv", "qt", "rar", "ras", "raw", "rgb", "rip", "rlc", "rz", "s3m", "s7z", "scm", "scpt", "sgi", "shar", "sil", "smv", "so", "sub", "swf", "tar", "tbz2", "tga", "tgz", "tif", "tiff", "tlz", "ts", "ttf", "uvh", "uvi", "uvm", "uvp", "uvs", "uvu", "viv", "vob", "war", "wav", "wax", "wbmp", "wdp", "weba", "webm", "webp", "whl", "wm", "wma", "wmv", "wmx", "woff", "woff2", "wvx", "xbm", "xif", "xls", "xlsx", "xlt", "xm", "xpi", "xpm", "xwd", "xz", "z", "zip", "zipx") 19 | 20 | def push_msg(msg): 21 | ''' 22 | redis连接池 23 | ''' 24 | redis_host = '192.168.87.222' 25 | redis_port = 6379 26 | redis_pass = 'cft67ygv' 27 | redis_db = 0 28 | redis_key = 'logstash:redis' 29 | try: 30 | #r = redis.StrictRedis(host=redis_host, port=redis_port, db=redis_db, password=redis_pass) 31 | pool = redis.ConnectionPool(host=redis_host, port=redis_port, db=redis_db, password=redis_pass) 32 | r = redis.StrictRedis(connection_pool=pool) 33 | r.rpush(redis_key, msg) 34 | except Exception as e: 35 | print str(e),msg 36 | 37 | def is_intranet(ip): 38 | """ 39 | 匹配内网ip地址 40 | """ 41 | ret = ip.split('.') 42 | if not len(ret) == 4: 43 | return True 44 | if ret[0] == '10': 45 | return True 46 | if ret[0] == '127' and ret[1] == '0': 47 | return True 48 | if ret[0] == '172' and 16 <= int(ret[1]) <= 32: 49 | return True 50 | if ret[0] == '192' and ret[1] == '168': 51 | return True 52 | return False 53 | 54 | def parser_ua(ua_string): 55 | ''' 56 | 解析user-agent 57 | ''' 58 | info = {} 59 | info['spider'] = False 60 | try: 61 | msg = ua_parse(ua_string) 62 | 63 | if msg.is_pc: 64 | info['dev'] = 'PC' 65 | elif 
msg.is_tablet: 66 | info['dev'] = 'Pad' 67 | elif msg.is_mobile: 68 | info['dev'] = 'MObile' 69 | else: 70 | info['dev'] = 'Unknow' 71 | 72 | if msg.is_bot: 73 | info['spider'] = True 74 | info["type"] = msg.os.family+' '+str(msg.os.version_string) 75 | 76 | info["ua"] = msg.browser.family+' '+str(msg.browser.version_string) 77 | return info 78 | except Exception as e: 79 | return info 80 | 81 | info = {} 82 | addr = "" 83 | g1 = GeoIPUtil() 84 | g2 = GeoIP2Util() 85 | if is_intranet(ipaddr): 86 | info["address"] = "局域网内地址" 87 | info['weidu'] = info['jingdu'] = info['country'] = info['subdivision'] =info['city'] ="" 88 | else: 89 | try: 90 | longitude,latitude = g1.get_lat_alt(ipaddr) 91 | info['weidu'] = latitude 92 | info['jingdu'] = longitude 93 | country, subdivision, city = g2.get_ip_location(ipaddr) 94 | if country == u"中国": 95 | if not subdivision and not city: 96 | addr = country 97 | elif subdivision.find(u'市')==-1 and subdivision not in [u'上海'] and city: 98 | addr = subdivision.strip(u'省')+u"省\t"+city 99 | elif subdivision and not city: 100 | addr = country +' '+ subdivision 101 | else: 102 | addr = subdivision+"\t"+city 103 | else: 104 | if subdivision in [u'台北市',u'新北市',u'基隆市',u'新竹市',u'嘉义市',u'台中市',u'台南市',u'高雄市',u'屏东市']: 105 | subdivision = "台湾省"+' '+subdivision 106 | if country in [u'香港',u'澳门'] and subdivision: 107 | subdivision = country +' '+ subdivision 108 | country = '中国' 109 | elif country in [u'香港',u'澳门'] and not subdivision: 110 | subdivision = country 111 | country = '中国' 112 | 113 | if not subdivision and not city: 114 | addr = country 115 | elif subdivision and not city: 116 | addr = country +' '+ subdivision 117 | else: 118 | addr = country.replace(u'台湾',u'中国') +"\t"+subdivision+"\t"+city 119 | if addr: 120 | info["address"] = addr 121 | 122 | info["country"] = country.replace(u'台湾',u'中国') 123 | info["subdivision"]= subdivision 124 | info["city"] = city 125 | except Exception as e: 126 | print str(e) 127 | return info 128 | 129 | def check_rule(file_data): 130 | ''' 131 | 匹配规则 132 | ''' 133 | rule = '' 134 | results = [] 135 | try: 136 | file_data = urldecode(file_data) 137 | except: 138 | try: 139 | file_data = htmlunescape(file_data) 140 | except: 141 | file_data = file_data 142 | 143 | try: 144 | default_conf_path = os.path.abspath(os.path.dirname(__file__)) + "/" 145 | flist = default_conf_path+"rule.json" 146 | patterns_list = json.load(file(flist)) 147 | for patterns in patterns_list: 148 | sensitive = True 149 | for pattern in patterns['patterns']: 150 | if pattern['type'] == 'match': 151 | re_pattern = re.compile(pattern['part'], re.IGNORECASE | re.DOTALL | re.MULTILINE) 152 | re_result = re.findall(pattern['part'], file_data) 153 | if not re_result: 154 | sensitive = False 155 | break 156 | elif pattern['type'] == 'regex': 157 | re_pattern = re.compile(str(pattern['part']), re.IGNORECASE | re.DOTALL | re.MULTILINE) 158 | if re_pattern.search(file_data) == None: 159 | sensitive = False 160 | break 161 | 162 | if sensitive: 163 | results.append({ 164 | 'tag': patterns['tag'], 165 | 'level': patterns['level'] 166 | }) 167 | except Exception as e: 168 | pass 169 | return results 170 | 171 | if __name__ == '__main__': 172 | print json.dumps(parser_ip('111.122.172.163')) 173 | print json.dumps(parser_ip('220.181.171.119')) 174 | print json.dumps(parser_ip('192.168.199.233')) 175 | 176 | url = "/drupal/?destination=node/8153%23comment-form&q=../../../../../../../../../../WEB-INF/web.xml" 177 | print check_rule(url) 178 | ua = "Mozilla/5.0 (Windows NT 6.1; WOW64) 
AppleWebKit/537.21 (KHTML, like Gecko) Chrome/41.0.2228.0 Safari/537.21" 179 | print parser_ua(ua) 180 | 181 | -------------------------------------------------------------------------------- /main.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | #-*- coding:utf-8 -*- 3 | 4 | from module.apache_parser import ApahceParser 5 | from module.iis_parser import IISLineParser 6 | from module.nginx_parser import NgLineParser 7 | from module.tomcat_parser import TomcatParser 8 | from common.threadpool import new_threadpool 9 | from optparse import OptionParser 10 | from common.units import * 11 | 12 | def check_file(path): 13 | ''' 14 | 检测文件是目录还是文件 15 | ''' 16 | import os 17 | if os.path.isdir(path): 18 | return "directory" 19 | elif os.path.isfile(path): 20 | return "file" 21 | else: 22 | return False 23 | 24 | def para(line,xtype): 25 | ''' 26 | 对日志类型进程分类处理 27 | ''' 28 | if xtype == "apache": 29 | ng_line_parser = ApahceParser() 30 | elif xtype == "iis": 31 | ng_line_parser = IISLineParser() 32 | elif xtype == "nginx": 33 | ng_line_parser = NgLineParser() 34 | elif xtype == "tomcat": 35 | ng_line_parser = TomcatParser() 36 | else: 37 | return "not found,waitting...." 38 | if line.startswith("#"): 39 | pass 40 | else: 41 | try: 42 | ng_line_parser.parse(line) 43 | if ng_line_parser.real_ip: 44 | ippara = parser_ip(ng_line_parser.real_ip) 45 | else: 46 | ippara = parser_ip(ng_line_parser.cdn_ip) 47 | 48 | if ng_line_parser.browser: 49 | parses = parser_ua(ng_line_parser.browser) 50 | mydict = { 51 | "status":ng_line_parser.response_status, 52 | "cdn_ip":ng_line_parser.real_ip, 53 | "real_ip":ng_line_parser.cdn_ip, 54 | "access_time":ng_line_parser.access_time, 55 | "method":ng_line_parser.method, 56 | "url":ng_line_parser.request_url, 57 | "urldecode":urldecode(ng_line_parser.request_url), 58 | "referer":ng_line_parser.reference_url, 59 | "body_bytes":ng_line_parser.bbytes, 60 | "user_agent":ng_line_parser.browser, 61 | "ua_value":parses['ua'], 62 | "dev_type":parses['dev'], 63 | "dev_value":parses['type'], 64 | "spider":parses['spider'], 65 | "addr":ippara['address'], 66 | "city":ippara['city'], 67 | "jingdu":ippara['jingdu'], 68 | "weidu":ippara['weidu'], 69 | "country":ippara['country'], 70 | "subdivision":ippara['subdivision'], 71 | } 72 | else: 73 | mydict = { 74 | "status":ng_line_parser.response_status, 75 | "cdn_ip":ng_line_parser.cdn_ip, 76 | "real_ip":ng_line_parser.real_ip, 77 | "access_time":ng_line_parser.access_time, 78 | "method":ng_line_parser.method, 79 | "url":ng_line_parser.request_url, 80 | "urldecode":urldecode(ng_line_parser.request_url), 81 | "referer":ng_line_parser.reference_url, 82 | "body_bytes":ng_line_parser.bbytes, 83 | "user_agent":ng_line_parser.browser, 84 | "addr":ippara['address'], 85 | "city":ippara['city'], 86 | "jingdu":ippara['jingdu'], 87 | "weidu":ippara['weidu'], 88 | "country":ippara['country'], 89 | "subdivision":ippara['subdivision'], 90 | } 91 | resu = check_rule(ng_line_parser.request_url) 92 | if resu: 93 | if len(resu)==1: 94 | mydict["rule"]= resu[0] 95 | else: 96 | mydict["rule"]= resu 97 | else: 98 | mydict["rule"]= False 99 | ''' 100 | try: 101 | push_msg(json.dumps(mydict)) 102 | except Exception as e: 103 | try: 104 | del mydict["url"] 105 | del mydict["urldecode"] 106 | mydict["url"] = str(ng_line_parser.request_url).decode('gbk', 'ignore').encode('utf-8', 'ignore') 107 | mydict["urldecode"] = str(ng_line_parser.request_url).decode('gbk', 'ignore').encode('utf-8', 'ignore') 108 | 
push_msg(json.dumps(mydict)) 109 | except Exception as why: 110 | print why 111 | print mydict 112 | pass 113 | ''' 114 | return json.dumps(mydict) 115 | except Exception as why: 116 | print why 117 | pass 118 | 119 | def push_para(xtype,logfile): 120 | ''' 121 | 对每一个文件进行处理 122 | ''' 123 | with open(logfile, 'r') as f: 124 | #p = new_threadpool(3, calucator) 125 | for index, line in enumerate(f): 126 | print para(line,xtype) 127 | 128 | def main(xtype,log_name): 129 | ''' 130 | 检测文件的类型 131 | ''' 132 | xcheck = check_file(log_name) 133 | if not xcheck: 134 | print "file not found" 135 | return 136 | else: 137 | if xcheck == "directory": 138 | files = get_file(log_name) 139 | for xfile in files: 140 | push_para(xtype,log_name+"/"+xfile) 141 | else: 142 | push_para(xtype,log_name) 143 | 144 | def get_file(mypath): 145 | ''' 146 | 获取文件夹下面的全部文件 147 | ''' 148 | from os import listdir 149 | from os.path import isfile, join 150 | onlyfiles = [ f for f in listdir(mypath) if isfile(join(mypath,f)) ] 151 | return onlyfiles 152 | 153 | def init_parser(): 154 | usage = "Usage: %prog --type IIS|Apache|Tomcat|Nginx --file file|directory" 155 | parser = OptionParser(usage=usage, description="log parser ") 156 | parser.add_option("--type", type="str", dest="type", help="chose which log type") 157 | parser.add_option("--file", type="str", dest="file", default=None,help="chose file or directory") 158 | return parser 159 | 160 | if __name__ == '__main__': 161 | #main('./log/access.log.9') 162 | parser = init_parser() 163 | option, _ = parser.parse_args() 164 | logtype = str(option.type).lower() 165 | logfile = option.file 166 | if not logfile or not logtype: 167 | parser.print_help() 168 | 169 | main(logtype,logfile) 170 | -------------------------------------------------------------------------------- /module/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/0xa-saline/Logpara/b0be8febb7a9287157a119d5eb7b1dc8bc988a29/module/__init__.py -------------------------------------------------------------------------------- /module/apache_parser.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | #-*- coding:utf-8 -*- 3 | 4 | import datetime 5 | import re,urllib 6 | 7 | request_re= re.compile(r'(?P(GET|POST|HEAD|DELETE|PUT|OPTIONS)?)\s+(?P.*?)\s+(?P.*)$') 8 | log_line_re = re.compile(r'(?P((\d{1,3}\.){3}\d{1,3})+) - - (\[(?P\S+)\s+\S+\])\s+\"(?P(.*?))\"\s+(?P([1-9]\d*))\s+(?P(.*?))\s+\"(?P.*?)\"\s+\"(?P.*?)\"') 9 | logline_re = re.compile(r'(?P((\d{1,3}\.){3}\d{1,3})+) - - (\[(?P\S+)\s+\S+\])\s+\"(?P(.*?))\"\s+(?P([1-9]\d*))\s+(?P(.*?))') 10 | 11 | """ 12 | 192.168.0.23 - - [19/Aug/2017:05:33:54 +0200] "GET /drupal/templates/blue/js/default.js HTTP/1.1" 404 402 "http://192.168.0.102/drupal/templates/blue/js/default.js" "Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2785.143 Safari/537.36" 13 | ip time method url status body referer ua 14 | """ 15 | class ApahceParser(object): 16 | """将 Apache 日志解析成多个字段""" 17 | 18 | def __init__(self): 19 | self._cdn_ip = '' # CDN请求IP 20 | self._access_time = '' # 请求时间 21 | self._request_url = '' # 请求的URL 22 | self._reference_url = '' # 外链URL 23 | self._response_status = '' # NG 响应状态码 24 | self._browser = '' # 用户使用的浏览器 25 | self._real_ip = '' # 用户真实IP 26 | self._mthod = '' # 请求方式 27 | self._bbytes = '' # 内容大小 28 | 29 | def parse(self, line): 30 | """通过传入的一行数据进行解析 31 | """ 32 | processed = 
log_line_re.search(line) 33 | if processed: 34 | #ip_time_tmp = line_item[2].strip().split() 35 | self.cdn_ip = processed.group('remote_host') # 服务器IP 36 | self.access_time = processed.group('date_time')# 请求发起的时间 37 | request = processed.group('request') 38 | request_ur = request_re.search(request) 39 | if request_ur: 40 | self.method = request_ur.group('request_method') # 请求方式 41 | self.request_url = request_ur.group('request_uri') 42 | self.response_status = processed.group('status') # NG 响应状态码 43 | self.bbytes = processed.group('body_bytes_sent') # 浏览所用时间 44 | if self.bbytes == "-": 45 | self.bbytes = "" 46 | self.browser = processed.group('http_user_agent') 47 | self.reference_url = processed.group('http_referer') # 外链URL 48 | ''' 49 | self.port = line_item[6] 50 | self.reference_url = "" # 外链URL 51 | self.real_ip = line_item[8].strip() 52 | 53 | # 用户使用的浏览器 54 | 55 | ''' 56 | else: 57 | processed = logline_re.search(line) 58 | if processed: 59 | self.cdn_ip = processed.group('remote_host') # 服务器IP 60 | self.access_time = processed.group('date_time')# 请求发起的时间 61 | request = processed.group('request') 62 | request_ur = request_re.search(request) 63 | if request_ur: 64 | self.method = request_ur.group('request_method') # 请求方式 65 | self.request_url = request_ur.group('request_uri') 66 | self.response_status = processed.group('status') # NG 响应状态码 67 | self.bbytes = processed.group('body_bytes_sent') # 浏览所用时间 68 | if self.bbytes == "-": 69 | self.bbytes = "" 70 | 71 | def to_dict(self): 72 | """将属性(@property)的转化为dict输出 73 | """ 74 | propertys = {} 75 | propertys['real_ip'] = self.real_ip 76 | propertys['cdn_ip'] = self.cdn_ip 77 | propertys['method'] = self.method 78 | propertys['access_time'] = self.access_time 79 | propertys['request_url'] = self.request_url 80 | propertys['reference_url'] = self.reference_url 81 | propertys['response_status'] = self.response_status 82 | propertys['browser'] = self.browser 83 | propertys['bbytes'] = self.bbytes 84 | return propertys 85 | 86 | @property 87 | def real_ip(self): 88 | return self._real_ip 89 | 90 | @real_ip.setter 91 | def real_ip(self, real_ip): 92 | self._real_ip = real_ip.split(', ')[0] 93 | 94 | @property 95 | def browser(self): 96 | return self._browser 97 | 98 | @browser.setter 99 | def browser(self, browser): 100 | self._browser = browser.replace('+',' ') 101 | 102 | @property 103 | def response_status(self): 104 | return self._response_status 105 | 106 | @response_status.setter 107 | def response_status(self, response_status): 108 | self._response_status = response_status 109 | 110 | @property 111 | def reference_url(self): 112 | return self._reference_url 113 | 114 | @reference_url.setter 115 | def reference_url(self, reference_url): 116 | """解析外链URL 117 | 只需要解析后的域名, 如: 118 | 传入: http://www.ttmark.com/diannao/2014/11/04/470.html 119 | 解析成: www.ttmark.com 120 | """ 121 | proto, rest = urllib.splittype(reference_url) 122 | res, rest = urllib.splithost(rest) 123 | if not res: 124 | self._reference_url = '-' 125 | else: 126 | self._reference_url = res 127 | 128 | @property 129 | def request_url(self): 130 | return self._request_url 131 | 132 | @request_url.setter 133 | def request_url(self, request_url): 134 | """ 135 | 解析请求的URL 136 | 只需要解析后的URL路径不需要参数, 如: 137 | 传入: /wp-admin/admin-ajax.php?postviews_id=1348 138 | 解析成: /wp-admin/admin-ajax.php 139 | 140 | proto, rest = urllib.splittype(request_url) 141 | url_path, url_param = urllib.splitquery(rest) 142 | 143 | if url_path.startswith('/tag/'): 144 | url_path = '/tag/' 145 | """ 146 | 
self._request_url = request_url 147 | 148 | @property 149 | def access_time(self): 150 | return str(self._access_time) 151 | 152 | @access_time.setter 153 | def access_time(self, access_time): 154 | # Apache log 解析日志格式 155 | #input_datetime_format = '%d/%b/%Y:%H:%M:%S' 156 | input_datetime_format = '%d/%b/%Y:%H:%M:%S' 157 | self._access_time = datetime.datetime.strptime( access_time,input_datetime_format) 158 | 159 | @property 160 | def cdn_ip(self): 161 | return self._cdn_ip 162 | 163 | @cdn_ip.setter 164 | def cdn_ip(self, cdn_ip): 165 | self._cdn_ip = cdn_ip 166 | 167 | if __name__ == '__main__': 168 | pass 169 | -------------------------------------------------------------------------------- /module/iis_parser.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | #-*- coding:utf-8 -*- 3 | 4 | import datetime 5 | import urllib 6 | ''' 7 | date time 日期/时间 8 | s-ip 服务器IP 9 | cs-method 方法 10 | cs-uri-stem 请求访问的页面 11 | cs-uri-query 访问的查询字符串 12 | s-port 服务器端口 13 | cs-username 14 | c-ip 客户端IP 15 | cs(User-Agent) 用户代理 16 | sc-status 协议返回状态 17 | sc-substatus HTTP子协议的状态 18 | sc-win32-status Win32® 状态 19 | time-taken 所用时间 20 | ''' 21 | class IISLineParser(object): 22 | """将 Nginx 日志解析成多个字段""" 23 | 24 | def __init__(self): 25 | self._cdn_ip = '' # CDN请求IP 26 | self._access_time = '' # 请求时间 27 | self._request_url = '' # 请求的URL 28 | self._reference_url = '' # 外链URL 29 | self._response_status = '' # NG 响应状态码 30 | self._browser = '' # 用户使用的浏览器 31 | self._real_ip = '' # 用户真实IP 32 | self._mthod = '' # 请求方式 33 | self._bbytes = '' # 内容大小 34 | 35 | def parse(self, line): 36 | """通过传入的一行数据进行解析 37 | """ 38 | line_item = line.strip().split(' ') 39 | 40 | #ip_time_tmp = line_item[2].strip().split() 41 | self.cdn_ip = line_item[2] # 服务器IP 42 | self.access_time = str(line_item[0]+" "+ line_item[1])# 请求发起的时间 43 | self.method = line_item[3].strip() # 请求方式 44 | if line_item[5].strip() == "-": 45 | self.request_url = line_item[4].strip() # 请求的URL 46 | else: 47 | self.request_url = line_item[4].strip()+"?"+ line_item[5].strip() 48 | self.port = line_item[6] 49 | self.reference_url = "" # 外链URL 50 | self.real_ip = line_item[8].strip() 51 | self.response_status = line_item[10].strip() # NG 响应状态码 52 | self.browser = line_item[9].strip() # 用户使用的浏览器 53 | self.bbytes = line_item[13].strip() # 浏览所用时间 54 | 55 | def to_dict(self): 56 | """将属性(@property)的转化为dict输出 57 | """ 58 | propertys = {} 59 | 60 | propertys['real_ip'] = self.real_ip 61 | propertys['ser_ip'] = self.cdn_ip 62 | propertys['method'] = self.method 63 | propertys['access_time'] = self.access_time 64 | propertys['request_url'] = self.request_url 65 | propertys['reference_url'] = self.reference_url 66 | propertys['response_status'] = self.response_status 67 | propertys['browser'] = self.browser 68 | propertys['bbytes'] = self.bbytes 69 | return propertys 70 | 71 | @property 72 | def real_ip(self): 73 | return self._real_ip 74 | 75 | @real_ip.setter 76 | def real_ip(self, real_ip): 77 | self._real_ip = real_ip.split(', ')[0] 78 | 79 | @property 80 | def browser(self): 81 | return self._browser 82 | 83 | @browser.setter 84 | def browser(self, browser): 85 | self._browser = browser.replace('+',' ') 86 | 87 | @property 88 | def response_status(self): 89 | return self._response_status 90 | 91 | @response_status.setter 92 | def response_status(self, response_status): 93 | self._response_status = response_status 94 | 95 | @property 96 | def reference_url(self): 97 | return self._reference_url 98 | 99 | 
@reference_url.setter 100 | def reference_url(self, reference_url): 101 | """解析外链URL 102 | 只需要解析后的域名, 如: 103 | 传入: http://www.ttmark.com/diannao/2014/11/04/470.html 104 | 解析成: www.ttmark.com 105 | """ 106 | proto, rest = urllib.splittype(reference_url) 107 | res, rest = urllib.splithost(rest) 108 | if not res: 109 | self._reference_url = '-' 110 | else: 111 | self._reference_url = res 112 | 113 | @property 114 | def request_url(self): 115 | return self._request_url 116 | 117 | @request_url.setter 118 | def request_url(self, request_url): 119 | """解析请求的URL 120 | 只需要解析后的URL路径不需要参数, 如: 121 | 传入: /wp-admin/admin-ajax.php?postviews_id=1348 122 | 解析成: /wp-admin/admin-ajax.php 123 | 124 | proto, rest = urllib.splittype(request_url) 125 | url_path, url_param = urllib.splitquery(rest) 126 | 127 | if url_path.startswith('/tag/'): 128 | url_path = '/tag/' 129 | """ 130 | self._request_url = request_url 131 | 132 | @property 133 | def access_time(self): 134 | return str(self._access_time) 135 | 136 | @access_time.setter 137 | def access_time(self, access_time): 138 | # IIS log 解析日志格式 139 | #input_datetime_format = '%d/%b/%Y:%H:%M:%S' 140 | input_datetime_format = '%Y-%m-%d %H:%M:%S' 141 | self._access_time = datetime.datetime.strptime( access_time,input_datetime_format) 142 | 143 | @property 144 | def cdn_ip(self): 145 | return self._cdn_ip 146 | 147 | @cdn_ip.setter 148 | def cdn_ip(self, cdn_ip): 149 | self._cdn_ip = cdn_ip 150 | 151 | if __name__ == '__main__': 152 | pass 153 | -------------------------------------------------------------------------------- /module/nginx_parser.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | #-*- coding:utf-8 -*- 3 | import datetime 4 | import urllib 5 | 6 | class NgLineParser(object): 7 | """将 Nginx 日志解析成多个字段""" 8 | 9 | def __init__(self): 10 | self._cdn_ip = '' # CDN请求IP 11 | self._access_time = '' # 请求时间 12 | self._request_url = '' # 请求的URL 13 | self._reference_url = '' # 外链URL 14 | self._response_status = '' # NG 响应状态码 15 | self._browser = '' # 用户使用的浏览器 16 | self._real_ip = '' # 用户真实IP 17 | self._mthod = '' # 请求方式 18 | self._bbytes = '' # 内容大小 19 | 20 | def parse(self, line): 21 | """通过传入的一行数据进行解析 22 | """ 23 | line_item = line.strip().split('"') 24 | if len(line_item) > 9: # 由于日志有改变需要删除一些元素 25 | del line_item[1] 26 | del line_item[1] 27 | 28 | # 获取临时的 CDN IP 和 访问文件 29 | ip_time_tmp = line_item[0].strip().split() 30 | if len(line_item)>7: 31 | self.real_ip = line_item[7] # 用户真实IP 32 | else: 33 | self.real_ip = '' 34 | self.cdn_ip = ip_time_tmp[0] # CDN请求IP 35 | self.access_time = ip_time_tmp[3].lstrip('[') # 请求时间 36 | self.method = line_item[1].strip().split()[0] # 请求方式 37 | self.request_url = line_item[1].strip().split()[1] # 请求的URL 38 | self.reference_url = line_item[3].strip() # 外链URL 39 | self.response_status = line_item[2].strip().split()[0] # NG 响应状态码 40 | self.bbytes = line_item[2].strip().split()[1] # NG 响应状态码 41 | self.browser = line_item[5].strip() # 用户使用的浏览器 42 | 43 | def to_dict(self): 44 | """将属性(@property)的转化为dict输出 45 | """ 46 | propertys = {} 47 | 48 | propertys['real_ip'] = self.real_ip 49 | propertys['cdn_ip'] = self.cdn_ip 50 | propertys['method'] = self.method 51 | propertys['access_time'] = self.access_time 52 | propertys['request_url'] = self.request_url 53 | propertys['reference_url'] = self.reference_url 54 | propertys['response_status'] = self.response_status 55 | propertys['browser'] = self.browser 56 | propertys['bbytes'] = self.bbytes 57 | return propertys 58 | 59 | 
def parser_ua(self,ua_string): 60 | info = {} 61 | info['spider'] = False 62 | msg = ua_parse(ua_string) 63 | 64 | if msg.is_pc: 65 | info['dev'] = 'PC' 66 | elif msg.is_tablet: 67 | info['dev'] = 'Pad' 68 | elif msg.is_mobile: 69 | info['dev'] = 'MObile' 70 | else: 71 | info['dev'] = 'Unknow' 72 | 73 | if msg.is_bot: 74 | info['spider'] = True 75 | info["type"] = msg.os.family+' '+str(msg.os.version_string) 76 | 77 | info["ua"] = msg.browser.family+' '+str(msg.browser.version_string) 78 | return info 79 | 80 | 81 | @property 82 | def real_ip(self): 83 | return self._real_ip 84 | 85 | @real_ip.setter 86 | def real_ip(self, real_ip): 87 | self._real_ip = real_ip.split(', ')[0] 88 | 89 | @property 90 | def browser(self): 91 | return self._browser 92 | 93 | @browser.setter 94 | def browser(self, browser): 95 | self._browser = browser 96 | 97 | @property 98 | def response_status(self): 99 | return self._response_status 100 | 101 | @response_status.setter 102 | def response_status(self, response_status): 103 | self._response_status = response_status 104 | 105 | @property 106 | def reference_url(self): 107 | return self._reference_url 108 | 109 | @reference_url.setter 110 | def reference_url(self, reference_url): 111 | """解析外链URL 112 | 只需要解析后的域名, 如: 113 | 传入: http://www.ttmark.com/diannao/2014/11/04/470.html 114 | 解析成: www.ttmark.com 115 | """ 116 | proto, rest = urllib.splittype(reference_url) 117 | res, rest = urllib.splithost(rest) 118 | if not res: 119 | self._reference_url = '-' 120 | else: 121 | self._reference_url = res 122 | 123 | @property 124 | def request_url(self): 125 | return self._request_url 126 | 127 | @request_url.setter 128 | def request_url(self, request_url): 129 | """解析请求的URL 130 | 只需要解析后的URL路径不需要参数, 如: 131 | 传入: /wp-admin/admin-ajax.php?postviews_id=1348 132 | 解析成: /wp-admin/admin-ajax.php 133 | 134 | proto, rest = urllib.splittype(request_url) 135 | url_path, url_param = urllib.splitquery(rest) 136 | 137 | if url_path.startswith('/tag/'): 138 | url_path = '/tag/' 139 | """ 140 | self._request_url = request_url 141 | 142 | @property 143 | def access_time(self): 144 | return str(self._access_time) 145 | 146 | @access_time.setter 147 | def access_time(self, access_time): 148 | # Nginx log 解析日志格式 149 | input_datetime_format = '%d/%b/%Y:%H:%M:%S' 150 | self._access_time = datetime.datetime.strptime( access_time,input_datetime_format) 151 | 152 | @property 153 | def cdn_ip(self): 154 | return self._cdn_ip 155 | 156 | @cdn_ip.setter 157 | def cdn_ip(self, cdn_ip): 158 | self._cdn_ip = cdn_ip 159 | 160 | 161 | if __name__ == '__main__': 162 | pass 163 | -------------------------------------------------------------------------------- /module/tomcat_parser.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | #-*- coding:utf-8 -*- 3 | 4 | import datetime 5 | import urllib,json 6 | from common.convert import urldecode,htmlunescape 7 | try: 8 | from user_agents import parse as ua_parse 9 | except: 10 | print "try to pip install pyyaml ua-parser user-agents" 11 | 12 | class TomcatParser(object): 13 | """将 Nginx 日志解析成多个字段""" 14 | 15 | def __init__(self): 16 | self._cdn_ip = '' # CDN请求IP 17 | self._access_time = '' # 请求时间 18 | self._request_url = '' # 请求的URL 19 | self._reference_url = '' # 外链URL 20 | self._response_status = '' # NG 响应状态码 21 | self._browser = '' # 用户使用的浏览器 22 | self._real_ip = '' # 用户真实IP 23 | self._mthod = '' # 请求方式 24 | self._bbytes = '' # 内容大小 25 | 26 | def parse(self, line): 27 | """ 28 | 通过传入的一行数据进行解析 29 | 
""" 30 | line_item = line.strip().split() 31 | # 获取临时的 CDN IP 和 访问文件 32 | self.real_ip = line_item[0] # 请求IP 33 | self.access_time = line_item[3].replace("[","") # 请求时间 34 | self.method = line_item[5].strip('"') # 请求方式 35 | self.request_url = line_item[6] # 请求的URL 36 | self.response_status = line_item[8] # NG 响应状态码 37 | self.bbytes = line_item[9] # NG 响应状态码 38 | 39 | def to_dict(self): 40 | """ 41 | 将属性(@property)的转化为dict输出 42 | """ 43 | propertys = {} 44 | 45 | propertys['real_ip'] = self.real_ip 46 | propertys['cdn_ip'] = self.cdn_ip 47 | propertys['method'] = self.method 48 | propertys['access_time'] = self.access_time 49 | propertys['request_url'] = self.request_url 50 | propertys['reference_url'] = self.reference_url 51 | propertys['response_status'] = self.response_status 52 | propertys['browser'] = self.browser 53 | propertys['bbytes'] = self.bbytes 54 | return propertys 55 | 56 | def parser_ua(self,ua_string): 57 | info = {} 58 | info['spider'] = False 59 | msg = ua_parse(ua_string) 60 | 61 | if msg.is_pc: 62 | info['dev'] = 'PC' 63 | elif msg.is_tablet: 64 | info['dev'] = 'Pad' 65 | elif msg.is_mobile: 66 | info['dev'] = 'MObile' 67 | else: 68 | info['dev'] = 'Unknow' 69 | 70 | if msg.is_bot: 71 | info['spider'] = True 72 | info["type"] = msg.os.family+' '+str(msg.os.version_string) 73 | 74 | info["ua"] = msg.browser.family+' '+str(msg.browser.version_string) 75 | return info 76 | 77 | 78 | @property 79 | def real_ip(self): 80 | return self._real_ip 81 | 82 | @real_ip.setter 83 | def real_ip(self, real_ip): 84 | self._real_ip = real_ip.split(', ')[0] 85 | 86 | @property 87 | def browser(self): 88 | return self._browser 89 | 90 | @browser.setter 91 | def browser(self, browser): 92 | self._browser = browser 93 | 94 | @property 95 | def response_status(self): 96 | return self._response_status 97 | 98 | @response_status.setter 99 | def response_status(self, response_status): 100 | self._response_status = response_status 101 | 102 | @property 103 | def reference_url(self): 104 | return self._reference_url 105 | 106 | @reference_url.setter 107 | def reference_url(self, reference_url): 108 | """ 109 | 解析外链URL 110 | 只需要解析后的域名, 如: 111 | 传入: http://www.ttmark.com/diannao/2014/11/04/470.html 112 | 解析成: www.ttmark.com 113 | """ 114 | proto, rest = urllib.splittype(reference_url) 115 | res, rest = urllib.splithost(rest) 116 | if not res: 117 | self._reference_url = '-' 118 | else: 119 | self._reference_url = res 120 | 121 | @property 122 | def request_url(self): 123 | return self._request_url 124 | 125 | @request_url.setter 126 | def request_url(self, request_url): 127 | """ 128 | 解析请求的URL 129 | 只需要解析后的URL路径不需要参数, 如: 130 | 传入: /wp-admin/admin-ajax.php?postviews_id=1348 131 | 解析成: /wp-admin/admin-ajax.php 132 | 133 | proto, rest = urllib.splittype(request_url) 134 | url_path, url_param = urllib.splitquery(rest) 135 | 136 | if url_path.startswith('/tag/'): 137 | url_path = '/tag/' 138 | """ 139 | self._request_url = request_url 140 | 141 | @property 142 | def access_time(self): 143 | return str(self._access_time) 144 | 145 | @access_time.setter 146 | def access_time(self, access_time): 147 | # Nginx log 解析日志格式 148 | input_datetime_format = '%d/%b/%Y:%H:%M:%S' 149 | self._access_time = datetime.datetime.strptime( access_time,input_datetime_format) 150 | 151 | @property 152 | def cdn_ip(self): 153 | return self._cdn_ip 154 | 155 | @cdn_ip.setter 156 | def cdn_ip(self, cdn_ip): 157 | self._cdn_ip = cdn_ip 158 | --------------------------------------------------------------------------------