├── README.md
├── ua.awk
└── filter.awk


/README.md:
--------------------------------------------------------------------------------
Web server log file analysis & filtering
========================================

v1.2; Oct 2012
Ben Carpenter
http://www.bencarpenter.co.uk/awk-for-apache-nginx-logs

This awk script processes lines from a log format that matches the 'combined'
log format often used by the Apache and Nginx web servers. If your log file
format is different, amend accordingly, but for reference this is the
combined format this script expects by default:

%h %l %u %t "%r" %>s %b "%{Referer}i" "%{User-agent}i"

%h   Remote host
%l   Remote logname (ignored)
%u   Remote user (ignored)
%t   Date and time of the request
%r   First line of the request, typically "GET /something HTTP/1.1"
%>s  Status
%b   Size of response in bytes

It tries to be efficient with resources, so there are minimal progress
messages and no system commands in the main loop other than writing to a file
based on the status code. The output files are written in a simplified
tab-separated format, error-corrected for some strange things like spaces in
URLs and double quotes around the userid. This revised format is easier to
pass reliably through other awk scripts when filtering for specific data,
etc. The file format is:

IP, Date/Time, Method, URL, Status, Size, Referer, User Agent

You should be able to send a large (>1GB) amount of log data through this
script quite comfortably. This works well for me, but the usual clauses apply
(use it at your own risk, etc.). Bug reports and suggestions for improvements
are very welcome.
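As a quick illustration of that downstream filtering, and assuming filter.awk
has already been run so that a per-status output file such as
http-status-404.log exists, a short awk one-liner over the tab-separated
columns above (IP is field 1) will count which addresses are generating the
most 404s:

awk -F'\t' '{ hits[$1]++ } END { for(ip in hits) print hits[ip], ip }' \
    http-status-404.log | sort -rn

The same pattern works for any other column, for example field 4 (URL) or
field 8 (User Agent).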
--------------------------------------------------------------------------------
/ua.awk:
--------------------------------------------------------------------------------
##
# Web server log file analysis & filtering - anonymised user-agent data
#
# v1.0; Sep 2015
# Ben Carpenter
# https://github.com/lapsedtheorist/awk-for-apache-nginx-logs
#
# This awk script processes lines from a log format that matches the
# 'combined' log often used by the Apache and Nginx web servers. If your log
# file format is different, amend accordingly, but for reference this is the
# combined format this script expects by default:
#
# %h %l %u %t "%r" %>s %b "%{Referer}i" "%{User-agent}i"
#
# %h   Remote host
# %l   Remote logname (ignored)
# %u   Remote user (ignored)
# %t   Date and time of the request
# %r   First line of the request, typically "GET /something HTTP/1.1"
# %>s  Status
# %b   Size of response in bytes
#
# It tries to be efficient with resources, so there are minimal progress
# messages and no system commands in the main loop other than writing to the
# output file. The output file is written in a simplified tab-separated
# format, error-corrected for some strange things like double quotes around
# the userid. This revised format is easier to pass reliably through other
# awk scripts when filtering for specific data, etc. The file format is:
#
# Date/Time, Method, Status, User Agent
#
# You should be able to send a large (>1GB) amount of log data through this
# script quite comfortably. This works well for me, but the usual clauses
# apply (use it at your own risk, etc.). Bug reports and suggestions for
# improvements are very welcome.
##
BEGIN {
    FS="( \"|\" )"
    intro="Processing..."
    printf "%s", intro
}

{
    split($1, a, " ")
    ip=a[1]
    # It seems some browsers/bots set the 'user' part to a blank string,
    # double quoted, which can foul our detection of the status code unless
    # we explicitly look for it
    if($2!="") {
        datetime=a[4]" "a[5]
        request=$2
        referer=$4
        useragent=$5
        split($3, c, " ")
        code=c[1]
        size=c[2]
    } else {
        split($3, b, " ")
        datetime=b[2]" "b[3]
        request=$4
        referer=$6
        useragent=$7
        split($5, c, " ")
        code=c[1]
        size=c[2]
    }
    total=NR
    if(match(code, /^[0-9]+$/)==0) {
        # This status code, whatever it is, isn't a number so let's set it to
        # UNKNOWN so it's obvious in the analysis that this is a dud
        code="UNKNOWN"
    }

    # Analyse the request
    n=split(request, detail, " ")
    method=detail[1]
    if(match(method, /^[A-Z]+$/)==0) {
        # This request method, whatever it is, doesn't 'look like' a request
        # method, so let's set it to UNKNOWN so it's obvious in the analysis
        # that this is a dud
        method="UNKNOWN"
    }

    # Create a condensed file format containing UA data with a few extra
    # helpful, but still non-identifying, pieces of info
    file="http-anonymous-ua-full.log"
    printf "%s\t%s\t%d\t%s\n", \
        datetime, method, code, useragent > file
}

END {
    for(l=0; l<length(intro); l++) {
        printf "\b"
    }
    printf "Processed %d lines\n", total
}
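
# Usage sketch, assuming a combined-format access log called access.log:
#
#   awk -f ua.awk access.log
#
# This writes http-anonymous-ua-full.log, and the user-agent column (field 4
# of the tab-separated output) can then be summarised with something like:
#
#   awk -F'\t' '{ counts[$4]++ } END { for(ua in counts) print counts[ua], ua }' \
#       http-anonymous-ua-full.log | sort -rn | head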
--------------------------------------------------------------------------------
/filter.awk:
--------------------------------------------------------------------------------
##
# Web server log file analysis & filtering
#
# v1.2; Oct 2012
# Ben Carpenter
# http://www.bencarpenter.co.uk/awk-for-apache-nginx-logs
#
# This awk script processes lines from a log format that matches the
# 'combined' log often used by the Apache and Nginx web servers. If your log
# file format is different, amend accordingly, but for reference this is the
# combined format this script expects by default:
#
# %h %l %u %t "%r" %>s %b "%{Referer}i" "%{User-agent}i"
#
# %h   Remote host
# %l   Remote logname (ignored)
# %u   Remote user (ignored)
# %t   Date and time of the request
# %r   First line of the request, typically "GET /something HTTP/1.1"
# %>s  Status
# %b   Size of response in bytes
#
# It tries to be efficient with resources, so there are minimal progress
# messages and no system commands in the main loop other than writing to a
# file based on the status code. The output files are written in a simplified
# tab-separated format, error-corrected for some strange things like spaces
# in URLs and double quotes around the userid. This revised format is easier
# to pass reliably through other awk scripts when filtering for specific
# data, etc. The file format is:
#
# IP, Date/Time, Method, URL, Status, Size, Referer, User Agent
#
# You should be able to send a large (>1GB) amount of log data through this
# script quite comfortably. This works well for me, but the usual clauses
# apply (use it at your own risk, etc.). Bug reports and suggestions for
# improvements are very welcome.
##
BEGIN {
    FS="( \"|\" )"
    intro="Processing..."
    printf "%s", intro
}

{
    split($1, a, " ")
    ip=a[1]
    # It seems some browsers/bots set the 'user' part to a blank string,
    # double quoted, which can foul our detection of the status code unless
    # we explicitly look for it
    if($2!="") {
        datetime=a[4]" "a[5]
        request=$2
        referer=$4
        useragent=$5
        split($3, c, " ")
        code=c[1]
        size=c[2]
    } else {
        split($3, b, " ")
        datetime=b[2]" "b[3]
        request=$4
        referer=$6
        useragent=$7
        split($5, c, " ")
        code=c[1]
        size=c[2]
    }
    total=NR
    if(match(code, /^[0-9]+$/)==0) {
        # This status code, whatever it is, isn't a number so let's set it to
        # UNKNOWN so it's obvious in the analysis that this is a dud
        code="UNKNOWN"
    }
    statuses[code]++

    # Analyse the request
    n=split(request, detail, " ")
    method=detail[1]
    if(match(method, /^[A-Z]+$/)==0) {
        # This request method, whatever it is, doesn't 'look like' a request
        # method, so let's set it to UNKNOWN so it's obvious in the analysis
        # that this is a dud
        method="UNKNOWN"
    }
    methods[method]++

    # We want the URL, but we need to handle the case where the URL contains
    # one or more space characters, even though they shouldn't be there
    url=""
    for(i=2; i<n; i++) {
        if(url!="") {
            url=url" "
        }
        url=url detail[i]
    }

    # Create and add to a file for each status code
    file="http-status-"code".log"
    printf "%s\t%s\t%s\t%s\t%d\t%d\t%s\t%s\n", \
        ip, datetime, method, url, code, size, referer, useragent > file

    # Create and add to a file for each request method
    file="http-request-"method".log"
    printf "%s\t%s\t%s\t%s\t%d\t%d\t%s\t%s\n", \
        ip, datetime, method, url, code, size, referer, useragent > file
}

END {
    for(l=0; l<length(intro); l++) {
        printf "\b"
    }
    printf "Processed %d lines\n", total
    for(s in statuses) {
        printf "Status %s: %d\n", s, statuses[s]
    }
    for(m in methods) {
        printf "Method %s: %d\n", m, methods[m]
    }
}
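
# Usage sketch, assuming a combined-format access log called access.log:
#
#   awk -f filter.awk access.log
#
# Each status code and request method seen in the log ends up with its own
# http-status-<code>.log or http-request-<method>.log file (for example
# http-status-200.log and http-request-GET.log), all in the tab-separated
# format described above and ready to be filtered further with other awk
# one-liners.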