├── README.md
├── ua.awk
└── filter.awk


/README.md:
--------------------------------------------------------------------------------
Web server log file analysis & filtering
========================================

v1.2; Oct 2012
Ben Carpenter
http://www.bencarpenter.co.uk/awk-for-apache-nginx-logs

This awk script processes lines from a log format that matches the 'combined'
log format often used by the Apache and Nginx web servers. If your log file
format is different, amend accordingly, but for reference this is the
combined format this script expects by default:

%h %l %u %t "%r" %>s %b "%{Referer}i" "%{User-agent}i"

%h   Remote host
%l   Remote logname (ignored)
%u   Remote user (ignored)
%t   Date and time of the request
%r   First line of the request, typically "GET /something HTTP/1.1"
%>s  Status
%b   Size of response in bytes

It tries to be efficient with resources, so there are minimal progress
messages and no system commands in the main loop other than writing to a file
based on the status code. The output files are written in a simplified
tab-separated format, error-corrected for some strange things like spaces in
URLs and double quotes around the userid. This revised format is easier to
pass reliably through other awk scripts when filtering for specific data,
etc. The file format is:

IP, Date/Time, Method, URL, Status, Size, Referer, User Agent

You should be able to send a large (>1GB) amount of log data through this
script quite comfortably. This works well for me, but the usual clauses apply
(use it at your own risk, etc.). Bug reports and suggestions for improvements
are very welcome.
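As a quick illustration of that downstream filtering, and assuming filter.awk
has already been run so that a per-status output file such as
http-status-404.log exists, a short awk one-liner over the tab-separated
columns above (IP is field 1) will count which addresses are generating the
most 404s:

awk -F'\t' '{ hits[$1]++ } END { for(ip in hits) print hits[ip], ip }' \
    http-status-404.log | sort -rn

The same pattern works for any other column, for example field 4 (URL) or
field 8 (User Agent).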
--------------------------------------------------------------------------------
/ua.awk:
--------------------------------------------------------------------------------
##
# Web server log file analysis & filtering - anonymised user-agent data
#
# v1.0; Sep 2015
# Ben Carpenter
# https://github.com/lapsedtheorist/awk-for-apache-nginx-logs
#
# This awk script processes lines from a log format that matches the
# 'combined' log often used by the Apache and Nginx web servers. If your log
# file format is different, amend accordingly, but for reference this is the
# combined format this script expects by default:
#
# %h %l %u %t "%r" %>s %b "%{Referer}i" "%{User-agent}i"
#
# %h   Remote host
# %l   Remote logname (ignored)
# %u   Remote user (ignored)
# %t   Date and time of the request
# %r   First line of the request, typically "GET /something HTTP/1.1"
# %>s  Status
# %b   Size of response in bytes
#
# It tries to be efficient with resources, so there are minimal progress
# messages and no system commands in the main loop other than writing to the
# output file. The output file is written in a simplified tab-separated
# format, error-corrected for some strange things like double quotes around
# the userid. This revised format is easier to pass reliably through other
# awk scripts when filtering for specific data, etc. The file format is:
#
# Date/Time, Method, Status, User Agent
#
# You should be able to send a large (>1GB) amount of log data through this
# script quite comfortably. This works well for me, but the usual clauses
# apply (use it at your own risk, etc.). Bug reports and suggestions for
# improvements are very welcome.
##
BEGIN {
    FS="( \"|\" )"
    intro="Processing..."
    printf "%s", intro
}

{
    split($1, a, " ")
    ip=a[1]
    # It seems some browsers/bots set the 'user' part to a blank string,
    # double quoted, which can foul our detection of the status code unless
    # we explicitly look for it
    if($2!="") {
        datetime=a[4]" "a[5]
        request=$2
        referer=$4
        useragent=$5
        split($3, c, " ")
        code=c[1]
        size=c[2]
    } else {
        split($3, b, " ")
        datetime=b[2]" "b[3]
        request=$4
        referer=$6
        useragent=$7
        split($5, c, " ")
        code=c[1]
        size=c[2]
    }
    total=NR
    if(match(code, /^[0-9]+$/)==0) {
        # This status code, whatever it is, isn't a number so let's set it to
        # UNKNOWN so it's obvious in the analysis that this is a dud
        code="UNKNOWN"
    }

    # Analyse the request
    n=split(request, detail, " ")
    method=detail[1]
    if(match(method, /^[A-Z]+$/)==0) {
        # This request method, whatever it is, doesn't 'look like' a request
        # method, so let's set it to UNKNOWN so it's obvious in the analysis
        # that this is a dud
        method="UNKNOWN"
    }

    # Create a condensed file format containing UA data with a few extra
    # helpful, but still non-identifying, pieces of info
    file="http-anonymous-ua-full.log"
    printf "%s\t%s\t%d\t%s\n", \
        datetime, method, code, useragent > file
}

END {
    for(l=0; l<length(intro); l++) {
        printf "\b"
    }
    printf "Processed %d lines\n", total
}
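
# Usage sketch, assuming a combined-format access log called access.log:
#
#   awk -f ua.awk access.log
#
# This writes http-anonymous-ua-full.log, and the user-agent column (field 4
# of the tab-separated output) can then be summarised with something like:
#
#   awk -F'\t' '{ counts[$4]++ } END { for(ua in counts) print counts[ua], ua }' \
#       http-anonymous-ua-full.log | sort -rn | head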
--------------------------------------------------------------------------------
/filter.awk:
--------------------------------------------------------------------------------
##
# Web server log file analysis & filtering
#
# v1.2; Oct 2012
# Ben Carpenter
# http://www.bencarpenter.co.uk/awk-for-apache-nginx-logs
#
# This awk script processes lines from a log format that matches the
# 'combined' log often used by the Apache and Nginx web servers. If your log
# file format is different, amend accordingly, but for reference this is the
# combined format this script expects by default:
#
# %h %l %u %t "%r" %>s %b "%{Referer}i" "%{User-agent}i"
#
# %h   Remote host
# %l   Remote logname (ignored)
# %u   Remote user (ignored)
# %t   Date and time of the request
# %r   First line of the request, typically "GET /something HTTP/1.1"
# %>s  Status
# %b   Size of response in bytes
#
# It tries to be efficient with resources, so there are minimal progress
# messages and no system commands in the main loop other than writing to a
# file based on the status code. The output files are written in a simplified
# tab-separated format, error-corrected for some strange things like spaces
# in URLs and double quotes around the userid. This revised format is easier
# to pass reliably through other awk scripts when filtering for specific
# data, etc. The file format is:
#
# IP, Date/Time, Method, URL, Status, Size, Referer, User Agent
#
# You should be able to send a large (>1GB) amount of log data through this
# script quite comfortably. This works well for me, but the usual clauses
# apply (use it at your own risk, etc.). Bug reports and suggestions for
# improvements are very welcome.
##
BEGIN {
    FS="( \"|\" )"
    intro="Processing..."
    printf "%s", intro
}

{
    split($1, a, " ")
    ip=a[1]
    # It seems some browsers/bots set the 'user' part to a blank string,
    # double quoted, which can foul our detection of the status code unless
    # we explicitly look for it
    if($2!="") {
        datetime=a[4]" "a[5]
        request=$2
        referer=$4
        useragent=$5
        split($3, c, " ")
        code=c[1]
        size=c[2]
    } else {
        split($3, b, " ")
        datetime=b[2]" "b[3]
        request=$4
        referer=$6
        useragent=$7
        split($5, c, " ")
        code=c[1]
        size=c[2]
    }
    total=NR
    if(match(code, /^[0-9]+$/)==0) {
        # This status code, whatever it is, isn't a number so let's set it to
        # UNKNOWN so it's obvious in the analysis that this is a dud
        code="UNKNOWN"
    }
    statuses[code]++

    # Analyse the request
    n=split(request, detail, " ")
    method=detail[1]
    if(match(method, /^[A-Z]+$/)==0) {
        # This request method, whatever it is, doesn't 'look like' a request
        # method, so let's set it to UNKNOWN so it's obvious in the analysis
        # that this is a dud
        method="UNKNOWN"
    }
    methods[method]++

    # We want the URL, but we need to handle the case where the URL contains
    # one or more space characters, even though they shouldn't be there
    url=""
    for(i=2; i<n; i++) {
        if(url!="") {
            url=url" "
        }
        url=url detail[i]
    }

    # Create and add to a file for each status code
    file="http-status-"code".log"
    printf "%s\t%s\t%s\t%s\t%d\t%d\t%s\t%s\n", \
        ip, datetime, method, url, code, size, referer, useragent > file

    # Create and add to a file for each request method
    file="http-request-"method".log"
    printf "%s\t%s\t%s\t%s\t%d\t%d\t%s\t%s\n", \
        ip, datetime, method, url, code, size, referer, useragent > file
}

END {
    for(l=0; l<length(intro); l++) {
        printf "\b"
    }
    printf "Processed %d lines\n", total
    for(s in statuses) {
        printf "Status %s: %d\n", s, statuses[s]
    }
    for(m in methods) {
        printf "Method %s: %d\n", m, methods[m]
    }
}
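
# Usage sketch, assuming a combined-format access log called access.log:
#
#   awk -f filter.awk access.log
#
# Each status code and request method seen in the log ends up with its own
# http-status-<code>.log or http-request-<method>.log file (for example
# http-status-200.log and http-request-GET.log), all in the tab-separated
# format described above and ready to be filtered further with other awk
# one-liners.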