├── .github └── FUNDING.yml ├── .gitignore ├── README.md ├── build.sbt └── src ├── main └── scala │ ├── AccessLogParser.scala │ └── AccessLogRecord.scala └── test └── scala ├── AccessLogRecordSpec.scala └── SampleData.scala /.github/FUNDING.yml: -------------------------------------------------------------------------------- 1 | # These are supported funding model platforms 2 | 3 | ko_fi: alvin 4 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | bin/ 2 | project/ 3 | target/ 4 | .cache 5 | .classpath 6 | .project 7 | .settings 8 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # A Scala Apache Access Log Parser 2 | 3 | This project can be used to parse Apache access log records in JVM applications (Scala, 4 | Java, etc.) It is specifically written to work with "combined records", as that's 5 | the only access log format I've used since the 1990s. 6 | 7 | 8 | ## Discussion 9 | 10 | In short, I needed an Apache access log parser, and after looking at some other 11 | code, I decided to write my own. 
12 | 13 | 14 | ## Usage 15 | 16 | The API is in flux, but right now the usage starts like this: 17 | 18 | val rawRecord = """89.166.165.223 - - [21/Jul/2009:02:48:12 -0700] "GET /foo HTTP/1.1" 404 970 "-" "Mozilla/5.0 (Windows; U; Windows NT 5.1; de; rv:1.9.0.11) Firefox/3.0.11"""" 19 | 20 | val parser = new AccessLogParser 21 | val accessLogRecord = parser.parseRecord(rawRecord) // an Option[AccessLogRecord]; None if the record can't be parsed 22 | 23 | The `AccessLogRecord` class definition looks like this: 24 | 25 | case class AccessLogRecord ( 26 | clientIpAddress: String, // should be an ip address, but may also be the hostname if hostname-lookups are enabled 27 | rfc1413ClientIdentity: String, // typically '-' 28 | remoteUser: String, // typically '-' 29 | dateTime: String, // [day/month/year:hour:minute:second zone] 30 | request: String, // 'GET /foo ...' 31 | httpStatusCode: String, // 200, 404, etc. 32 | bytesSent: String, // may be '-' 33 | referer: String, // where the visitor came from 34 | userAgent: String // long string to represent the browser and OS 35 | ) 36 | 37 | In the test code you'll see that I use the parser like this: 38 | 39 | val parser = new AccessLogParser 40 | val rec = parser.parseRecord(rawRecord) 41 | it("the result should not be None") { 42 | assert(rec != None) 43 | } 44 | it("the individual fields should be right") { 45 | rec.foreach { r => 46 | assert(r.clientIpAddress == "66.249.70.10") 47 | assert(r.rfc1413ClientIdentity == "-") 48 | assert(r.remoteUser == "-") 49 | assert(r.dateTime == "[23/Feb/2014:03:21:59 -0700]") 50 | assert(r.request == "GET /blog/post/java/how-load-multiple-spring-context-files-standalone/ HTTP/1.0") 51 | assert(r.httpStatusCode == "301") 52 | assert(r.bytesSent == "-") 53 | assert(r.referer == "-") 54 | assert(r.userAgent == "Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)") 55 | } 56 | } 57 | 58 | If you don't like using the Option/Some/None pattern, I added a method named `parseRecordReturningNullObjectOnFailure` 59 | 
that returns a "Null Object" version of an `AccessLogRecord` instead of an Option. 60 | 61 | I also added some methods to parse the `date` and `request` fields, and I'll document those 62 | here on another day. You can see all of the current, up-to-date API by looking at the tests 63 | in the `AccessLogRecordSpec.scala` test file. 64 | 65 | 66 | ## Building 67 | 68 | This project is a typical Scala/SBT project, so just use commands like this: 69 | 70 | sbt compile 71 | sbt test 72 | sbt package 73 | 74 | 75 | ## More information 76 | 77 | I've added more documentation about this library at the following URLs. First, the basic documentation 78 | on this library is at this URL: 79 | 80 | * [My Scala Apache access log parser library](http://alvinalexander.com/scala/scala-apache-access-log-parser-library-java-jvm) 81 | 82 | Next, I've written two articles on how to use this library to analyze Apache access log records with 83 | Apache Spark and Scala: 84 | 85 | * [Analyzing Apache access logs with Spark and Scala](http://alvinalexander.com/scala/analyzing-apache-access-logs-files-spark-scala) 86 | * [Generating a list of URLs from Apache access log files, sorted by hit count, using Apache Spark (and Scala)](http://alvinalexander.com/scala/analyzing-apache-access-logs-files-spark-scala-part-2) 87 | 88 | For more information about yours truly: 89 | 90 | * [See my website](http://alvinalexander.com) 91 | * [Find me here on Twitter](https://twitter.com/alvinalexander) 92 | 93 | All the best, 94 | Alvin Alexander 95 | http://alvinalexander.com 96 | 97 | 98 | 99 | 100 | 101 | 102 | 103 | 104 | 105 | -------------------------------------------------------------------------------- /build.sbt: -------------------------------------------------------------------------------- 1 | name := "ScalaApacheAccessLogParser" 2 | 3 | version := "1.0" 4 | 5 | scalaVersion := "2.10.0" 6 | 7 | resolvers += "Typesafe Repository" at "http://repo.typesafe.com/typesafe/releases/" 8 | 9 | scalacOptions += 
"-deprecation" 10 | 11 | libraryDependencies += "org.scalatest" % "scalatest_2.10" % "2.1.0" % "test" 12 | 13 | -------------------------------------------------------------------------------- /src/main/scala/AccessLogParser.scala: -------------------------------------------------------------------------------- 1 | package com.alvinalexander.accesslogparser 2 | 3 | import java.util.regex.Pattern 4 | import java.text.SimpleDateFormat 5 | import java.util.Locale 6 | import scala.util.control.Exception._ 7 | import java.util.regex.Matcher 8 | import scala.util.{Try, Success, Failure} 9 | 10 | /** 11 | * A sample record: 12 | * 94.102.63.11 - - [21/Jul/2009:02:48:13 -0700] "GET / HTTP/1.1" 200 18209 "http://acme.com/foo.php" "Mozilla/4.0 (compatible; MSIE 5.01; Windows NT 5.0)" 13 | * 14 | * I put this code in the 'class' so (a) the pattern could be pre-compiled and (b) the user can create 15 | * multiple instances of this parser, in case they want to work in a multi-threaded way. 16 | * I don't know that this is necessary, but I think it is for this use case. 17 | * 18 | */ 19 | 20 | @SerialVersionUID(100L) 21 | class AccessLogParser extends Serializable { 22 | 23 | private val ddd = "\\d{1,3}" // at least 1 but not more than 3 times (possessive) 24 | private val ip = s"($ddd\\.$ddd\\.$ddd\\.$ddd)?" 
// like `123.456.7.89` 25 | private val client = "(\\S+)" // '\S' is 'non-whitespace character' 26 | private val user = "(\\S+)" 27 | private val dateTime = "(\\[.+?\\])" // like `[21/Jul/2009:02:48:13 -0700]` 28 | private val request = "\"(.*?)\"" // any number of any character, reluctant 29 | private val status = "(\\d{3})" 30 | private val bytes = "(\\S+)" // this can be a "-" 31 | private val referer = "\"(.*?)\"" 32 | private val agent = "\"(.*?)\"" 33 | private val regex = s"$ip $client $user $dateTime $request $status $bytes $referer $agent" 34 | private val p = Pattern.compile(regex) 35 | 36 | /** 37 | * note: group(0) is the entire record that was matched (skip it) 38 | * @param record Assumed to be an Apache access log combined record. 39 | * @return An AccessLogRecord instance wrapped in an Option. 40 | */ 41 | def parseRecord(record: String): Option[AccessLogRecord] = { 42 | val matcher = p.matcher(record) 43 | if (matcher.find) { 44 | Some(buildAccessLogRecord(matcher)) 45 | } else { 46 | None 47 | } 48 | } 49 | 50 | /** 51 | * Same as parseRecord, but returns a "Null Object" version of an AccessLogRecord 52 | * rather than an Option. 53 | * 54 | * @param record Assumed to be an Apache access log combined record. 55 | * @return An AccessLogRecord instance. This will be a "Null Object" version of an 56 | * AccessLogRecord if the parsing process fails. All fields in the Null Object 57 | * will be empty strings. 
58 | */ 59 | def parseRecordReturningNullObjectOnFailure(record: String): AccessLogRecord = { 60 | val matcher = p.matcher(record) 61 | if (matcher.find) { 62 | buildAccessLogRecord(matcher) 63 | } else { 64 | AccessLogParser.nullObjectAccessLogRecord 65 | } 66 | } 67 | 68 | private def buildAccessLogRecord(matcher: Matcher) = { 69 | AccessLogRecord( 70 | matcher.group(1), 71 | matcher.group(2), 72 | matcher.group(3), 73 | matcher.group(4), 74 | matcher.group(5), 75 | matcher.group(6), 76 | matcher.group(7), 77 | matcher.group(8), 78 | matcher.group(9)) 79 | } 80 | } 81 | 82 | /** 83 | * A sample record: 84 | * 94.102.63.11 - - [21/Jul/2009:02:48:13 -0700] "GET / HTTP/1.1" 200 18209 "http://acme.com/foo.php" "Mozilla/4.0 (compatible; MSIE 5.01; Windows NT 5.0)" 85 | */ 86 | object AccessLogParser { 87 | 88 | val nullObjectAccessLogRecord = AccessLogRecord("", "", "", "", "", "", "", "", "") 89 | 90 | /** 91 | * @param A String like "GET /the-uri-here HTTP/1.1" 92 | * @return A Tuple3(requestType, uri, httpVersion). requestType is GET, POST, etc. 93 | * 94 | * Returns a Tuple3 of three blank strings if the method fails. 95 | */ 96 | def parseRequestField(request: String): Option[Tuple3[String, String, String]] = { 97 | val arr = request.split(" ") 98 | if (arr.size == 3) Some((arr(0), arr(1), arr(2))) else None 99 | } 100 | 101 | /** 102 | * @param A String that looks like "[21/Jul/2009:02:48:13 -0700]" 103 | */ 104 | def parseDateField(field: String): Option[java.util.Date] = { 105 | val dateRegex = "\\[(.*?) 
.+]" 106 | val datePattern = Pattern.compile(dateRegex) 107 | val dateMatcher = datePattern.matcher(field) 108 | if (dateMatcher.find) { 109 | val dateString = dateMatcher.group(1) 110 | println("***** DATE STRING" + dateString) 111 | // HH is 0-23; kk is 1-24 112 | val dateFormat = new SimpleDateFormat("dd/MMM/yyyy:HH:mm:ss", Locale.ENGLISH) 113 | allCatch.opt(dateFormat.parse(dateString)) // return Option[Date] 114 | } else { 115 | None 116 | } 117 | } 118 | 119 | } 120 | 121 | 122 | 123 | 124 | -------------------------------------------------------------------------------- /src/main/scala/AccessLogRecord.scala: -------------------------------------------------------------------------------- 1 | package com.alvinalexander.accesslogparser 2 | 3 | /** 4 | * @see http://httpd.apache.org/docs/2.2/logs.html for details 5 | */ 6 | case class AccessLogRecord ( 7 | clientIpAddress: String, // should be an ip address, but may also be the hostname if hostname-lookups are enabled 8 | rfc1413ClientIdentity: String, // typically `-` 9 | remoteUser: String, // typically `-` 10 | dateTime: String, // [day/month/year:hour:minute:second zone] 11 | request: String, // `GET /foo ...` 12 | httpStatusCode: String, // 200, 404, etc. 
package com.alvinalexander.accesslogparser

import org.scalatest.FunSpec
import org.scalatest.BeforeAndAfter
import org.scalatest.GivenWhenThen
import java.util.Calendar

/**
 * Tests for `AccessLogParser`: full-record parsing (Option and Null Object variants) and
 * the request/date field helpers on the companion object. Input lines come from
 * `SampleCombinedAccessLogRecords`.
 */
class ApacheCombinedAccessLogRecordSpec extends FunSpec with BeforeAndAfter with GivenWhenThen {

  describe("Testing the first access log record ...") {
    it("the data fields should be correct") {
      Given("the first sample log record")
      val parser = new AccessLogParser
      val rec = parser.parseRecord(SampleCombinedAccessLogRecords.data(0))
      Then("parsing record(0) should not return None")
      // Assert before touching the contents, so a parse failure is reported as a failed
      // assertion rather than a NoSuchElementException.
      assert(rec.isDefined)
      rec.foreach { r =>
        And("the ip address should be correct")
        assert(r.clientIpAddress == "124.30.9.161")
        And("client identity")
        assert(r.rfc1413ClientIdentity == "-")
        And("remote user")
        assert(r.remoteUser == "-")
        And("date/time")
        assert(r.dateTime == "[21/Jul/2009:02:48:11 -0700]")
        And("request")
        assert(r.request == "GET /java/edu/pj/pj010004/pj010004.shtml HTTP/1.1")
        And("status code should be 200")
        assert(r.httpStatusCode == "200")
        And("bytes sent should be 16731")
        assert(r.bytesSent == "16731")
        And("referer")
        assert(r.referer == "http://www.google.co.in/search?hl=en&client=firefox-a&rlz=1R1GGGL_en___IN337&hs=F0W&q=reading+data+from+file+in+java&btnG=Search&meta=&aq=0&oq=reading+data+")
        And("user agent")
        assert(r.userAgent == "Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.9.0.11) Gecko/2009060215 Firefox/3.0.11 GTB5")
      }
    }
  }

  describe("Testing a second access log record ...") {
    val parser = new AccessLogParser
    val rec = parser.parseRecord(SampleCombinedAccessLogRecords.data(1))
    it("the result should not be None") {
      assert(rec.isDefined)
    }
    it("the individual fields should be right") {
      rec.foreach { r =>
        assert(r.clientIpAddress == "89.166.165.223")
        assert(r.rfc1413ClientIdentity == "-")
        assert(r.remoteUser == "-")
        assert(r.dateTime == "[21/Jul/2009:02:48:12 -0700]")
        assert(r.request == "GET /favicon.ico HTTP/1.1")
        assert(r.httpStatusCode == "404")
        assert(r.bytesSent == "970")
        assert(r.referer == "-")
        assert(r.userAgent == "Mozilla/5.0 (Windows; U; Windows NT 5.1; de; rv:1.9.0.11) Gecko/2009060215 Firefox/3.0.11")
      }
    }
  }

  describe("Trying to parse a record I used to fail on ...") {
    val parser = new AccessLogParser
    val rec = parser.parseRecord(SampleCombinedAccessLogRecords.badRecord(0))
    it("the result should not be None") {
      assert(rec.isDefined)
    }
    it("the individual fields should be right") {
      rec.foreach { r =>
        assert(r.clientIpAddress == "66.249.70.10")
        assert(r.rfc1413ClientIdentity == "-")
        assert(r.remoteUser == "-")
        assert(r.dateTime == "[23/Feb/2014:03:21:59 -0700]")
        assert(r.request == "GET /blog/post/java/how-load-multiple-spring-context-files-standalone/ HTTP/1.0")
        assert(r.httpStatusCode == "301")
        assert(r.bytesSent == "-")
        assert(r.referer == "-")
        assert(r.userAgent == "Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)")
      }
    }
  }

  describe("Testing the parseRecordReturningNullObjectOnFailure method with a valid record ...") {
    val parser = new AccessLogParser
    val rec = parser.parseRecordReturningNullObjectOnFailure(SampleCombinedAccessLogRecords.data(1))
    it("the result should not be null") {
      assert(rec != null)
    }
    it("the individual fields should be right") {
      assert(rec.clientIpAddress == "89.166.165.223")
      assert(rec.rfc1413ClientIdentity == "-")
      assert(rec.remoteUser == "-")
      assert(rec.dateTime == "[21/Jul/2009:02:48:12 -0700]")
      assert(rec.request == "GET /favicon.ico HTTP/1.1")
      assert(rec.httpStatusCode == "404")
      assert(rec.bytesSent == "970")
      assert(rec.referer == "-")
      assert(rec.userAgent == "Mozilla/5.0 (Windows; U; Windows NT 5.1; de; rv:1.9.0.11) Gecko/2009060215 Firefox/3.0.11")
    }
  }

  describe("Testing the parseRecordReturningNullObjectOnFailure method with an invalid record ...") {
    val parser = new AccessLogParser
    val rec = parser.parseRecordReturningNullObjectOnFailure("foo bar baz")
    it("the result should not be null") {
      assert(rec != null)
    }
    it("the individual fields should be blank strings") {
      assert(rec.clientIpAddress == "")
      assert(rec.rfc1413ClientIdentity == "")
      assert(rec.remoteUser == "")
      assert(rec.dateTime == "")
      assert(rec.request == "")
      assert(rec.httpStatusCode == "")
      assert(rec.bytesSent == "")
      assert(rec.referer == "")
      assert(rec.userAgent == "")
    }
  }

  describe("Parsing the request field ...") {
    it("a simple request should work") {
      val req = "GET /the-uri-here HTTP/1.1"
      val result = AccessLogParser.parseRequestField(req)
      assert(result.isDefined)
      result.foreach { case (requestType, uri, httpVersion) =>
        assert(requestType == "GET")
        assert(uri == "/the-uri-here")
        assert(httpVersion == "HTTP/1.1")
      }
    }
    // parseRequestField reports failure as None, not as a tuple of blank strings
    it("an invalid request should return None") {
      val req = "foobar"
      val result = AccessLogParser.parseRequestField(req)
      assert(result == None)
    }
  }

  describe("Parsing the date field ...") {
    it("a valid date field should work") {
      val date = AccessLogParser.parseDateField("[21/Jul/2009:02:48:13 -0700]")
      assert(date.isDefined)
      date.foreach { d =>
        // The parser ignores the zone offset, so compare in the default zone.
        val cal = Calendar.getInstance
        cal.setTimeInMillis(d.getTime)
        assert(cal.get(Calendar.YEAR) == 2009)
        assert(cal.get(Calendar.MONTH) == 6)         // 0-based
        assert(cal.get(Calendar.DAY_OF_MONTH) == 21)
        assert(cal.get(Calendar.HOUR_OF_DAY) == 2)   // the log field is 24-hour (HH)
        assert(cal.get(Calendar.MINUTE) == 48)
        assert(cal.get(Calendar.SECOND) == 13)
      }
    }
    it("an invalid date field should return None") {
      val date = AccessLogParser.parseDateField("[foo bar]")
      assert(date == None)
    }
  }

}

/**
 * Sample "combined" access log lines used by the spec. Each value is a multi-line string
 * split on newlines with the empty lines removed, giving one array element per record.
 */
object SampleCombinedAccessLogRecords {

  val data: Array[String] = """
124.30.9.161 - - [21/Jul/2009:02:48:11 -0700] "GET /java/edu/pj/pj010004/pj010004.shtml HTTP/1.1" 200 16731 "http://www.google.co.in/search?hl=en&client=firefox-a&rlz=1R1GGGL_en___IN337&hs=F0W&q=reading+data+from+file+in+java&btnG=Search&meta=&aq=0&oq=reading+data+" "Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.9.0.11) Gecko/2009060215 Firefox/3.0.11 GTB5"
89.166.165.223 - - [21/Jul/2009:02:48:12 -0700] "GET /favicon.ico HTTP/1.1" 404 970 "-" "Mozilla/5.0 (Windows; U; Windows NT 5.1; de; rv:1.9.0.11) Gecko/2009060215 Firefox/3.0.11"
94.102.63.11 - - [21/Jul/2009:02:48:13 -0700] "GET / HTTP/1.1" 200 18209 "http://www.developer.com/net/vb/article.php/3683331" "Mozilla/4.0 (compatible; MSIE 5.01; Windows NT 5.0)"
124.30.7.162 - - [21/Jul/2009:02:48:13 -0700] "GET /images/tline3.gif HTTP/1.1" 200 79 "http://www.devdaily.com/java/edu/pj/pj010004/pj010004.shtml" "Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.9.0.11) Gecko/2009060215 Firefox/3.0.11 GTB5"
122.165.54.17 - - [21/Jul/2009:02:48:12 -0700] "GET /java/java_oo/ HTTP/1.1" 200 32579 "http://www.google.co.in/search?hl=en&q=OO+with+java+standalone+example&btnG=Search&meta=&aq=f&oq=" "Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.9.0.7) Gecko/2009021910 Firefox/3.0.7"
217.32.108.226 - - [21/Jul/2009:02:48:13 -0700] "GET /blog/post/perl/checking-testing-perl-module-in-inc-include-path/ HTTP/1.1" 200 18417 "http://www.devdaily.com/blog/post/perl/perl-error-cant-locate-module-in-inc/" "Mozilla/5.0 (X11; U; SunOS i86pc; en-US; rv:1.7) Gecko/20070606"
122.165.54.17 - - [21/Jul/2009:02:48:15 -0700] "GET /java/java_oo/java_oo.css HTTP/1.1" 200 1235 "http://www.devdaily.com/java/java_oo/" "Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.9.0.7) Gecko/2009021910 Firefox/3.0.7"
""".split("\n").filter(_.nonEmpty)

  // A record with "-" for bytes sent
  val badRecord: Array[String] = """
66.249.70.10 - - [23/Feb/2014:03:21:59 -0700] "GET /blog/post/java/how-load-multiple-spring-context-files-standalone/ HTTP/1.0" 301 - "-" "Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)"
""".split("\n").filter(_.nonEmpty)

}