# -*- encoding: utf-8 -*-
"""
Python library for parsing Apache access logs.

You can clone this repository as 'apache' and place it anywhere you want.

    $ git clone git@github.com:basuke/Apache-Access-Log-Parse-Library-for-Python.git apache

Written by basuke@kanshin.com
Copyright 2010 Kanshin!, Inc.
http://corp.kanshin.com/
"""

import datetime
import time
import re
import subprocess


def parse(*paths):
    """
    Parse every given access log and return the matching entries.

    Each path is opened with openLogFile(), so local and remote
    (``host:path``) files as well as ``.gz`` files are accepted.

    Returns a SuperList of Log objects, in the order the lines were read.
    Lines that do not match PATTERN are skipped silently.
    """
    logs = SuperList()

    for path in paths:
        fp = openLogFile(path)
        try:
            # Iterate the file object directly; xreadlines() is Python 2 only.
            for line in fp:
                log = parseLogLine(line)
                if log is not None:
                    logs.append(log)
        finally:
            # Always release the file handle / pipe, even if reading fails.
            fp.close()

    return logs


def openLogFile(path):
    """
    Open a local or remote log file and return a file-like object.

    A remote file is specified the way scp does it (``host:path``) and is
    read by running ``cat`` (or ``zcat`` for ``.gz`` files) through ``ssh``.
    A local ``.gz`` file is read through ``zcat``; any other local path is
    opened directly in text mode.

    sample:
        /home/kanshin/logs/access_log
        aji.kanshin.com:/var/log/httpd/logs/access_log
    """
    args = ()

    if ':' in path:
        # Split on the first colon only, so a remote path that itself
        # contains ':' no longer raises ValueError while unpacking.
        host, path = path.split(':', 1)
        exe = 'zcat' if path.endswith('.gz') else 'cat'
        args = ('ssh', host, exe, path)
    elif path.endswith('.gz'):
        args = ('zcat', path)

    if args:
        # universal_newlines=True makes the pipe yield text lines, which is
        # what PATTERN (a str pattern) needs on Python 3 as well.
        p = subprocess.Popen(
            args=args,
            shell=False,
            stdout=subprocess.PIPE,
            universal_newlines=True,
        )
        return p.stdout

    return open(path, 'r')


def parseLogLine(line):
    """
    Parse one line of an access log.

    Returns a Log object, or None when the line does not match PATTERN.
    """
    match = PATTERN.match(line)
    if match is None:
        return None

    return Log(*match.groups())


class SuperList(list):
    """
    A list with a few helpers for filtering and grouping its items.
    """

    def len(self):
        """Return the number of items in the list."""
        return len(self)

    def filter(self, cmp=lambda item: item):
        """
        Return a new SuperList holding the items for which cmp(item) is true.

        With the default predicate, the truthy items are kept.
        """
        return SuperList(item for item in self if cmp(item))

    def group(self, attr, fget=lambda val: val):
        """
        Group the items by the value of one of their attributes.

        attr -- name of the attribute read from every item.
        fget -- optional function applied to the attribute value to
                produce the grouping key.

        Returns a dict mapping each key to a SuperList of the items that
        produced it, in their original order.
        """
        result = dict()
        for item in self:
            key = fget(getattr(item, attr))
            result.setdefault(key, SuperList()).append(item)

        return result


class Log(object):
    """
    Object representing one access log entry.
    """

    ipaddr = ""
    user = ""
    timestampStr = ""
    _timestampTZ = None      # cache for the timestampTZ property
    _timestampTuple = None   # cache for the timestampTuple property
    method = ""
    resource = ""
    protocol = ""
    statusCode = ""
    bytes = 0
    referer = ""
    userAgent = ""

    def __init__(self, *cols):
        """
        Build a Log from the columns captured by PATTERN.

        cols -- ipaddr, ident, user, timestamp, method, resource, protocol,
                status code, bytes and, optionally, referer and user agent.
                The ident column (cols[1]) is not stored. With fewer than
                nine columns the class-level defaults are left in place.
        """
        if len(cols) > 8:
            self.ipaddr = cols[0]
            self.user = cols[2]
            self.timestampStr = cols[3]
            self.method = cols[4]
            self.resource = cols[5]
            self.protocol = cols[6]
            self.statusCode = int(cols[7])
            try:
                self.bytes = int(cols[8])
            except ValueError:
                # The size column is '-' when no size was logged.
                self.bytes = 0

        if len(cols) > 10:
            # The referer / user agent groups are None when the line does
            # not carry them; keep the empty-string defaults in that case.
            self.referer = cols[9] or ""
            self.userAgent = cols[10] or ""

    @property
    def timestampTuple(self):
        """
        The timestamp as a time.struct_time, parsed once and then cached.

        The trailing six characters of timestampStr (a space and the
        timezone offset) are dropped before parsing.
        """
        if self._timestampTuple is None:
            self._timestampTuple = time.strptime(
                self.timestampStr[:-6], "%d/%b/%Y:%H:%M:%S")

        return self._timestampTuple

    @property
    def timestampTZ(self):
        """The last five characters of timestampStr, e.g. '+0900'."""
        if self._timestampTZ is None:
            self._timestampTZ = self.timestampStr[-5:]

        return self._timestampTZ

    year = property(fget=lambda s: s.timestampTuple[0])
    month = property(fget=lambda s: s.timestampTuple[1])
    day = property(fget=lambda s: s.timestampTuple[2])
    hour = property(fget=lambda s: s.timestampTuple[3])
    minute = property(fget=lambda s: s.timestampTuple[4])
    second = property(fget=lambda s: s.timestampTuple[5])
    # NOTE: time.mktime() interprets the tuple as local time; the offset
    # available through timestampTZ is not applied here.
    epoch = property(fget=lambda s: time.mktime(s.timestampTuple))
    timestamp = property(
        fget=lambda s: datetime.datetime(*s.timestampTuple[0:6]))

    def toTuple(self):
        """
        Return the entry as an 11-tuple of strings in log column order.

        The ident column is always '-', and a byte count of zero is
        rendered as '-'.
        """
        return (
            self.ipaddr,
            '-',
            self.user,
            self.timestampStr,
            self.method,
            self.resource,
            self.protocol,
            str(self.statusCode),
            str(self.bytes) if self.bytes > 0 else '-',
            self.referer,
            self.userAgent,
        )

    def __str__(self):
        """Format the entry as an access log line with referer and agent."""
        return '%s %s %s [%s] "%s %s %s" %s %s "%s" "%s"' % self.toTuple()

    def __repr__(self):
        return 'Log' + repr(self.toTuple())


# Pattern for one access log line. The trailing referer / user agent pair
# is optional, so lines without those two columns are accepted as well.
PATTERN = re.compile(
    r"""
    ^
    ([0-9]{1,3}\.[0-9]{1,3}\.[0-9]{1,3}\.[0-9]{1,3})    # ipaddr
    \s
    ([^ ]{1,})                                          # ident
    \s
    ([^ ]{1,}|\-)                                       # user
    \s
    \[([0-9]{2}\/[A-Za-z]{3}\/[0-9]{1,4}:[0-9]{1,2}:[0-9]{1,2}:[0-9]{1,2}
    \s
    [+\-][0-9]{4})\]                                    # timestamp
    \s
    "([A-Z ]+)                                          # method
    \s
    ([^"]*)                                             # resource
    \s
    ([^"]*)"                                            # protocol
    \s
    ([0-9]{3})                                          # status code
    \s
    ([0-9]{1,}|\-)                                      # bytes
    (?:
        \s
        "([^"]*|\-)"                                    # referer
        \s
        "([^"]+)"                                       # user agent
    )?
    $
    """, re.VERBOSE)