├── project
    ├── build.properties
    └── plugins.sbt
├── version.sbt
├── .dockerignore
├── gnaf-ui
    ├── html
    │   ├── loading.gif
    │   ├── index.html
    │   └── index.css
    └── README.md
├── gnaf-util
    ├── README.md
    ├── build.sbt
    ├── src
    │   └── main
    │   │   └── scala
    │   │       └── au
    │   │           └── csiro
    │   │               └── data61
    │   │                   └── gnaf
    │   │                       └── util
    │   │                           ├── Util.scala
    │   │                           ├── Timer.scala
    │   │                           └── Gnaf.scala
    └── 3rd-party-licenses.html
├── gnaf-indexer
    ├── build.sbt
    ├── src
    │   ├── test
    │   │   └── resources
    │   │   │   └── logback-test.xml
    │   └── main
    │   │   ├── resources
    │   │       └── logback.xml
    │   │   └── scala
    │   │       └── au
    │   │           └── csiro
    │   │               └── data61
    │   │                   └── gnaf
    │   │                       └── indexer
    │   │                           └── Indexer.scala
    ├── README.md
    └── 3rd-party-licenses.html
├── gnaf-extractor
    ├── build.sbt
    ├── src
    │   └── main
    │   │   ├── resources
    │   │       ├── logback.xml
    │   │       └── application.conf
    │   │   ├── script
    │   │       ├── loadElasticsearch.sh
    │   │       ├── loadElasticsearch.js
    │   │       └── gnafMapping.json
    │   │   └── scala
    │   │       └── au
    │   │           └── csiro
    │   │               └── data61
    │   │                   └── gnaf
    │   │                       └── extractor
    │   │                           └── Extractor.scala
    ├── README.md
    └── 3rd-party-licenses.html
├── gnaf-test
    ├── build.sbt
    ├── src
    │   ├── main
    │   │   ├── resources
    │   │   │   ├── logback.xml
    │   │   │   └── application.conf
    │   │   └── script
    │   │   │   ├── diff.js
    │   │   │   ├── summary.js
    │   │   │   ├── run.sh
    │   │   │   ├── Maps.js
    │   │   │   ├── searchLucene.js
    │   │   │   └── searchEs.js
    │   └── test
    │   │   ├── resources
    │   │       └── logback-test.xml
    │   │   └── scala
    │   │       └── au
    │   │           └── csiro
    │   │               └── data61
    │   │                   └── gnaf
    │   │                       └── test
    │   │                           └── MainTest.scala
    ├── package.json
    ├── README.md
    └── 3rd-party-licenses.html
├── .gitignore
├── gnaf-db
    ├── src
    │   └── main
    │   │   └── script
    │   │       ├── constraint.sed
    │   │       └── createGnafDb.sh
    ├── build.sbt
    └── 3rd-party-licenses.html
├── gnaf-lucene
    ├── build.sbt
    ├── src
    │   ├── test
    │   │   ├── resources
    │   │   │   └── logback-test.xml
    │   │   └── scala
    │   │   │   └── au
    │   │   │       └── csiro
    │   │   │           └── data61
    │   │   │               └── gnaf
    │   │   │                   └── lucene
    │   │   │                       └── GnafLuceneTest.scala
    │   └── main
    │   │   └── scala
    │   │       └── au
    │   │           └── csiro
    │   │               └── data61
    │   │                   └── gnaf
    │   │                       └── lucene
    │   │                           ├── LuceneUtil.scala
    │   │                           └── GnafLucene.scala
    ├── 3rd-party-licenses.html
    └── README.md
├── gnaf-contrib
    ├── src
    │   ├── test
    │   │   └── resources
    │   │   │   └── logback-test.xml
    │   └── main
    │   │   ├── resources
    │   │       ├── logback.xml
    │   │       └── application.conf
    │   │   └── scala
    │   │       └── au
    │   │           └── csiro
    │   │               └── data61
    │   │                   └── gnaf
    │   │                       └── contrib
    │   │                           ├── db
    │   │                               └── ContribTables.scala
    │   │                           └── service
    │   │                               └── ContribService.scala
    ├── build.sbt
    ├── README.md
    └── 3rd-party-licenses.html
├── gnaf-search
    ├── src
    │   ├── test
    │   │   └── resources
    │   │   │   └── logback-test.xml
    │   └── main
    │   │   ├── resources
    │   │       ├── logback.xml
    │   │       └── application.conf
    │   │   └── scala
    │   │       └── au
    │   │           └── csiro
    │   │               └── data61
    │   │                   └── gnaf
    │   │                       └── search
    │   │                           └── Search.scala
    ├── README.md
    ├── build.sbt
    └── 3rd-party-licenses.html
├── gnaf-db-service
    ├── src
    │   ├── test
    │   │   └── resources
    │   │   │   └── logback-test.xml
    │   └── main
    │   │   ├── resources
    │   │       ├── logback.xml
    │   │       └── application.conf
    │   │   └── scala
    │   │       └── au
    │   │           └── csiro
    │   │               └── data61
    │   │                   └── gnaf
    │   │                       └── db
    │   │                           └── service
    │   │                               └── DbService.scala
    ├── build.sbt
    ├── README.md
    └── 3rd-party-licenses.html
├── src
    └── main
    │   └── script
    │       ├── checkupdates.sh
    │       └── run.sh
├── Dockerfile
├── LICENSE.txt
├── README.md
└── template.yaml


/project/build.properties:
--------------------------------------------------------------------------------
1 | sbt.version=0.13.12
2 | 


--------------------------------------------------------------------------------
/version.sbt:
--------------------------------------------------------------------------------
1 | version in ThisBuild := "1.1-SNAPSHOT"
2 | 


--------------------------------------------------------------------------------
/.dockerignore:
--------------------------------------------------------------------------------
1 | indexDir/
2 | gnaf-db/data/
3 | addresses.gz
4 | 


--------------------------------------------------------------------------------
/gnaf-ui/html/loading.gif:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/data61/gnaf/HEAD/gnaf-ui/html/loading.gif


--------------------------------------------------------------------------------
/gnaf-util/README.md:
--------------------------------------------------------------------------------
1 | # gnaf-util
2 | 
3 | ## Introduction
4 | 
5 | This project produces a library of common code used by the other gnaf sub-projects.
6 | 
7 | 


--------------------------------------------------------------------------------
/gnaf-indexer/build.sbt:
--------------------------------------------------------------------------------
1 | name := "gnaf-indexer"
2 | 
3 | libraryDependencies ++= Seq(
4 |   "com.github.scopt" %% "scopt" % "3.3.0",
5 |   "com.jsuereth" %% "scala-arm" % "2.0.0-M1"
6 |   )
7 | 


--------------------------------------------------------------------------------
/gnaf-extractor/build.sbt:
--------------------------------------------------------------------------------
1 | name := "gnaf-extractor"
2 | 
3 | libraryDependencies ++= Seq(
4 |   "com.github.scopt" %% "scopt" % "3.3.0",
5 |   "com.jsuereth" %% "scala-arm" % "2.0.0-M1"
6 |   )
7 | 


--------------------------------------------------------------------------------
/gnaf-test/build.sbt:
--------------------------------------------------------------------------------
1 | name := "gnaf-test"
2 | 
3 | libraryDependencies ++= Seq(
4 |   "com.github.scopt" %% "scopt" % "3.3.0",
5 |   "com.jsuereth" %% "scala-arm" % "2.0.0-M1",
6 |   "org.scalatest" %% "scalatest" % "3.0.0" % "test"
7 |   )
8 | 


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
 1 | *.log
 2 | *~
 3 | 
 4 | # sbt specific
 5 | target/
 6 | /project/project/
 7 | /project/target/
 8 | .cache-main
 9 | .cache-tests
10 | 
11 | # Scala-IDE specific
12 | bin/
13 | test-bin/
14 | .classpath
15 | .project
16 | .settings/
17 | .worksheet
18 | 
19 | 
20 | 


--------------------------------------------------------------------------------
/gnaf-db/src/main/script/constraint.sed:
--------------------------------------------------------------------------------
 1 | # --regexp-extended
 2 | /ALTER TABLE/ {
 3 |   h
 4 |   d
 5 | }
 6 | /CONSTRAINT/ {
 7 |   H
 8 |   s~ *CONSTRAINT ([A-Z0-9_)]+) .*~SELECT 'Adding constraint \1 ...' AS Progress, CURRENT_TIME() AS Time;\n~p
 9 |   g
10 | }
11 | 
12 | 
13 | 


--------------------------------------------------------------------------------
/gnaf-lucene/build.sbt:
--------------------------------------------------------------------------------
 1 | name := "gnaf-lucene"
 2 | 
 3 | libraryDependencies ++= Seq(
 4 |   "com.github.scopt" %% "scopt" % "3.3.0",
 5 |   "com.jsuereth" %% "scala-arm" % "2.0.0-M1",
 6 |   "org.scalatest" %% "scalatest" % "3.0.0" % "test"
 7 |   )
 8 | 
 9 | libraryDependencies ++= Seq(
10 |   "lucene-core",
11 |   "lucene-analyzers-common"
12 | ) map ("org.apache.lucene" % _ % "6.2.1")
13 | 


--------------------------------------------------------------------------------
/gnaf-db/build.sbt:
--------------------------------------------------------------------------------
 1 | name := "gnaf-db"
 2 | 
 3 | libraryDependencies ++= Seq(
 4 |   "com.h2database" % "h2" % "1.4.193" // or postgres or whatever,  % "runtime" should be enough, but sbt slick.codegen needs it on compile classpath
 5 |   )
 6 | 
 7 | libraryDependencies ++= Seq(
 8 |   "slick-codegen", // only needed when generating slick mapping
 9 |   "slick",
10 |   "slick-hikaricp"
11 | ) map ("com.typesafe.slick" %% _ % "3.1.1")
12 | 


--------------------------------------------------------------------------------
/gnaf-util/build.sbt:
--------------------------------------------------------------------------------
 1 | name := "gnaf-util"
 2 | 
 3 | libraryDependencies ++= Seq(
 4 |   "io.spray" %%  "spray-json" % "1.3.2",
 5 |   "com.typesafe.scala-logging" %% "scala-logging" % "3.1.0",
 6 |   "org.slf4j" % "slf4j-api" % "1.7.12",
 7 |   "ch.qos.logback" % "logback-classic" % "1.1.3"
 8 |   // "org.scala-lang" % "scala-reflect" % "2.11.8", // Multiple dependencies with the same organization/name but different versions. To avoid conflict, pick one version
 9 |   // "org.scala-lang.modules" %% "scala-xml" % "1.0.4" // as above
10 |   )
11 | 


--------------------------------------------------------------------------------
/gnaf-contrib/src/test/resources/logback-test.xml:
--------------------------------------------------------------------------------
 1 | <configuration>
 2 | 
 3 |   <appender name="LOG" class="ch.qos.logback.core.ConsoleAppender">
 4 |     <!-- encoders are assigned the type ch.qos.logback.classic.encoder.PatternLayoutEncoder by default -->
 5 |     <encoder>
 6 |       <pattern>%d{HH:mm:ss.SSS} [%thread] %-5level %logger{36} - %msg%n</pattern>
 7 |     </encoder>
 8 |   </appender>
 9 | 
10 |   <logger name="au.csiro.data61.gnaf.contrib" level="DEBUG" />
11 |   
12 |   <root level="DEBUG">
13 |     <appender-ref ref="LOG" />
14 |   </root>
15 | </configuration>
16 | 


--------------------------------------------------------------------------------
/gnaf-search/src/test/resources/logback-test.xml:
--------------------------------------------------------------------------------
 1 | <configuration>
 2 | 
 3 |   <appender name="LOG" class="ch.qos.logback.core.ConsoleAppender">
 4 | <!--   <appender name="LOG" class="ch.qos.logback.core.FileAppender"> -->
 5 | <!--     <file>gnaf-search.log</file> -->
 6 |     <encoder>
 7 |       <pattern>%d{HH:mm:ss.SSS} [%thread] %-5level %logger{36} - %msg%n</pattern>
 8 |     </encoder>
 9 |   </appender>
10 | 
11 |   <logger name="au.csiro.data61.gnaf" level="DEBUG" />
12 |   
13 |   <root level="INFO">
14 |     <appender-ref ref="LOG" />
15 |   </root>
16 | </configuration>
17 | 


--------------------------------------------------------------------------------
/gnaf-test/src/main/resources/logback.xml:
--------------------------------------------------------------------------------
 1 | <configuration>
 2 | 
 3 | <!--   <appender name="LOG" class="ch.qos.logback.core.ConsoleAppender"> -->
 4 |   <appender name="LOG" class="ch.qos.logback.core.FileAppender">
 5 |     <file>gnaf-test.log</file>
 6 |     <encoder>
 7 |       <pattern>%d{HH:mm:ss.SSS} [%thread] %-5level %logger{36} - %msg%n</pattern>
 8 |     </encoder>
 9 |   </appender>
10 | 
11 | <!--   <logger name="au.csiro.data61.gnaf.test" level="DEBUG" /> -->
12 |   
13 |   <root level="INFO">
14 |     <appender-ref ref="LOG" />
15 |   </root>
16 | </configuration>
17 | 


--------------------------------------------------------------------------------
/gnaf-db-service/src/test/resources/logback-test.xml:
--------------------------------------------------------------------------------
 1 | <configuration>
 2 | 
 3 |   <appender name="LOG" class="ch.qos.logback.core.ConsoleAppender">
 4 |     <!-- encoders are assigned the type ch.qos.logback.classic.encoder.PatternLayoutEncoder by default -->
 5 |     <encoder>
 6 |       <pattern>%d{HH:mm:ss.SSS} [%thread] %-5level %logger{36} - %msg%n</pattern>
 7 |     </encoder>
 8 |   </appender>
 9 | 
10 |   <logger name="au.csiro.data61.gnaf.db.service" level="DEBUG" />
11 |   
12 |   <root level="DEBUG">
13 |     <appender-ref ref="LOG" />
14 |   </root>
15 | </configuration>
16 | 


--------------------------------------------------------------------------------
/gnaf-test/src/test/resources/logback-test.xml:
--------------------------------------------------------------------------------
 1 | <configuration>
 2 | 
 3 |   <appender name="LOG" class="ch.qos.logback.core.ConsoleAppender">
 4 | <!--   <appender name="LOG" class="ch.qos.logback.core.FileAppender"> -->
 5 | <!--     <file>gnaf-test.log</file> -->
 6 |     <encoder>
 7 |       <pattern>%d{HH:mm:ss.SSS} [%thread] %-5level %logger{36} - %msg%n</pattern>
 8 |     </encoder>
 9 |   </appender>
10 | 
11 |   <logger name="au.csiro.data61.gnaf.test" level="DEBUG" />
12 |   
13 |   <root level="INFO">
14 |     <appender-ref ref="LOG" />
15 |   </root>
16 | </configuration>
17 | 


--------------------------------------------------------------------------------
/gnaf-db-service/build.sbt:
--------------------------------------------------------------------------------
 1 | name := "gnaf-db-service"
 2 | 
 3 | libraryDependencies ++= Seq(
 4 |   "ch.megard" %% "akka-http-cors" % "0.1.2",
 5 |   "com.github.swagger-akka-http" %% "swagger-akka-http" % "0.7.0", // adding swagger brings in all the horrible old javax.ws & Jackson dependencies!
 6 |   "io.swagger" % "swagger-annotations" % "1.5.9"
 7 |   )
 8 | 
 9 | libraryDependencies ++= Seq(
10 |   "akka-actor",
11 |   "akka-stream",
12 |   "akka-http-experimental",
13 |   "akka-http-spray-json-experimental",
14 |   "akka-http-testkit"
15 |   ) map ("com.typesafe.akka" %% _ % "2.4.3")
16 |  


--------------------------------------------------------------------------------
/gnaf-indexer/src/test/resources/logback-test.xml:
--------------------------------------------------------------------------------
 1 | <configuration>
 2 | 
 3 |   <appender name="LOG" class="ch.qos.logback.core.ConsoleAppender">
 4 | <!--   <appender name="LOG" class="ch.qos.logback.core.FileAppender"> -->
 5 | <!--     <file>gnaf-indexer.log</file> -->
 6 |     <encoder>
 7 |       <pattern>%d{HH:mm:ss.SSS} [%thread] %-5level %logger{36} - %msg%n</pattern>
 8 |     </encoder>
 9 |   </appender>
10 | 
11 |   <logger name="au.csiro.data61.gnaf.indexer" level="DEBUG" />
12 |   
13 |   <root level="INFO">
14 |     <appender-ref ref="LOG" />
15 |   </root>
16 | </configuration>
17 | 


--------------------------------------------------------------------------------
/gnaf-lucene/src/test/resources/logback-test.xml:
--------------------------------------------------------------------------------
 1 | <configuration>
 2 | 
 3 |   <appender name="LOG" class="ch.qos.logback.core.ConsoleAppender">
 4 | <!--   <appender name="LOG" class="ch.qos.logback.core.FileAppender"> -->
 5 | <!--     <file>gnaf-lucene.log</file> -->
 6 |     <encoder>
 7 |       <pattern>%d{HH:mm:ss.SSS} [%thread] %-5level %logger{36} - %msg%n</pattern>
 8 |     </encoder>
 9 |   </appender>
10 | 
11 |   <logger name="au.csiro.data61.gnaf.lucene" level="DEBUG" />
12 |   
13 |   <root level="INFO">
14 |     <appender-ref ref="LOG" />
15 |   </root>
16 | </configuration>
17 | 


--------------------------------------------------------------------------------
/gnaf-contrib/src/main/resources/logback.xml:
--------------------------------------------------------------------------------
 1 | <configuration>
 2 | 
 3 |   <appender name="LOG" class="ch.qos.logback.core.ConsoleAppender">
 4 | <!--   <appender name="LOG" class="ch.qos.logback.core.FileAppender"> -->
 5 | <!--     <file>gnaf-contrib.log</file> -->
 6 |     <encoder>
 7 |       <pattern>%d{HH:mm:ss.SSS} [%thread] %-5level %logger{36} - %msg%n</pattern>
 8 |     </encoder>
 9 |   </appender>
10 | 
11 | <!--   <logger name="au.csiro.data61.gnaf.contrib" level="DEBUG" /> -->
12 |   
13 |   <root level="INFO">
14 |     <appender-ref ref="LOG" />
15 |   </root>
16 | </configuration>
17 | 


--------------------------------------------------------------------------------
/gnaf-indexer/src/main/resources/logback.xml:
--------------------------------------------------------------------------------
 1 | <configuration>
 2 | 
 3 |   <appender name="LOG" class="ch.qos.logback.core.ConsoleAppender">
 4 | <!--   <appender name="LOG" class="ch.qos.logback.core.FileAppender"> -->
 5 | <!--     <file>gnaf-indexer.log</file> -->
 6 |     <encoder>
 7 |       <pattern>%d{HH:mm:ss.SSS} [%thread] %-5level %logger{36} - %msg%n</pattern>
 8 |     </encoder>
 9 |   </appender>
10 | 
11 | <!--   <logger name="au.csiro.data61.gnaf.indexer" level="DEBUG" /> -->
12 |   
13 |   <root level="INFO">
14 |     <appender-ref ref="LOG" />
15 |   </root>
16 | </configuration>
17 | 


--------------------------------------------------------------------------------
/gnaf-search/src/main/resources/logback.xml:
--------------------------------------------------------------------------------
 1 | <configuration>
 2 | 
 3 |   <appender name="LOG" class="ch.qos.logback.core.ConsoleAppender">
 4 | <!--   <appender name="LOG" class="ch.qos.logback.core.FileAppender"> -->
 5 | <!--     <file>gnaf-search.log</file> -->
 6 |     <encoder>
 7 |       <pattern>%d{HH:mm:ss.SSS} [%thread] %-5level %logger{36} - %msg%n</pattern>
 8 |     </encoder>
 9 |   </appender>
10 | 
11 | <!--   <logger name="au.csiro.data61.gnaf.search" level="DEBUG" /> -->
12 |   
13 |   <root level="INFO">
14 |     <appender-ref ref="LOG" />
15 |   </root>
16 | </configuration>
17 | 


--------------------------------------------------------------------------------
/gnaf-db-service/README.md:
--------------------------------------------------------------------------------
 1 | # gnaf-db-service
 2 | 
 3 | ## Introduction
 4 | This project provides a [Scala](http://scala-lang.org/) [RESTful](https://en.wikipedia.org/wiki/Representational_state_transfer) JSON
 5 | web service providing access to the G-NAF database.
 6 | 
 7 | This is a stand-alone webapp and does not run in a servlet container.
 8 | 
 9 | ## Configuration
10 | 
11 | Configuration is in [application.conf](src/main/resources/application.conf) and most settings can be overridden with environment variables.
12 | 
13 | ## Running and Usage
14 | 
15 | See `gnaf/src/main/script/run.sh`.
16 | 


--------------------------------------------------------------------------------
/gnaf-extractor/src/main/resources/logback.xml:
--------------------------------------------------------------------------------
 1 | <configuration>
 2 | 
 3 | <!--   <appender name="STDOUT" class="ch.qos.logback.core.ConsoleAppender"> -->
 4 |   <appender name="FILE" class="ch.qos.logback.core.FileAppender">
 5 |     <file>gnaf-extractor.log</file>
 6 |     <encoder>
 7 |       <pattern>%d{HH:mm:ss.SSS} [%thread] %-5level %logger{36} - %msg%n</pattern>
 8 |     </encoder>
 9 |   </appender>
10 | 
11 | <!--   <logger name="au.csiro.data61.gnaf.extractor" level="DEBUG" /> -->
12 |   
13 |   <root level="INFO">
14 |     <appender-ref ref="FILE" />
15 |   </root>
16 | </configuration>
17 | 


--------------------------------------------------------------------------------
/gnaf-db-service/src/main/resources/logback.xml:
--------------------------------------------------------------------------------
 1 | <configuration>
 2 | 
 3 |   <appender name="LOG" class="ch.qos.logback.core.ConsoleAppender">
 4 | <!--   <appender name="LOG" class="ch.qos.logback.core.FileAppender"> -->
 5 | <!--     <file>gnaf-db-service.log</file> -->
 6 |     <encoder>
 7 |       <pattern>%d{HH:mm:ss.SSS} [%thread] %-5level %logger{36} - %msg%n</pattern>
 8 |     </encoder>
 9 |   </appender>
10 | 
11 | <!--   <logger name="au.csiro.data61.gnaf.db.service" level="DEBUG" /> -->
12 |   
13 |   <root level="INFO">
14 |     <appender-ref ref="LOG" />
15 |   </root>
16 | </configuration>
17 | 


--------------------------------------------------------------------------------
/gnaf-search/README.md:
--------------------------------------------------------------------------------
 1 | # gnaf-search
 2 | 
 3 | ## Introduction
 4 | 
 5 | This project provides a JSON web service to search the [Lucene](https://lucene.apache.org/) index created by `gnaf-indexer`.
 6 | Users should note the [suggested preprocessing](../gnaf-lucene/README.md#suggested-preprocessing-for-client-applications) for
 7 | query strings.
 8 | 
 9 | ## Configuration
10 | 
11 | Configuration is in [application.conf](src/main/resources/application.conf) and most settings can be overridden with environment variables.
12 | Command line options take precedence over the above, use `--help` for details.
13 | 
14 | ## Running and Usage
15 | 
16 | See `gnaf/src/main/script/run.sh`.
17 | 


--------------------------------------------------------------------------------
/gnaf-test/package.json:
--------------------------------------------------------------------------------
 1 | {
 2 |   "name": "gnaf-test",
 3 |   "version": "1.0.0",
 4 |   "description": "## Introduction",
 5 |   "main": "src/main/script/searchLucene.js",
 6 |   "dependencies": {
 7 |     "fs": "^0.0.2",
 8 |     "request": "^2.74.0"
 9 |   },
10 |   "devDependencies": {},
11 |   "scripts": {
12 |     "test": "echo \"Error: no test specified\" && exit 1"
13 |   },
14 |   "repository": {
15 |     "type": "git",
16 |     "url": "git+https://github.com/data61/gnaf.git"
17 |   },
18 |   "author": "",
19 |   "license": "BSD-3-Clause",
20 |   "bugs": {
21 |     "url": "https://github.com/data61/gnaf/issues"
22 |   },
23 |   "homepage": "https://github.com/data61/gnaf#readme"
24 | }
25 | 


--------------------------------------------------------------------------------
/gnaf-search/src/main/resources/application.conf:
--------------------------------------------------------------------------------
 1 | akka {
 2 |   loglevel = INFO
 3 | }
 4 | 
 5 | gnafSearch {
 6 | 
 7 |   indexDir = "./indexDir"
 8 |   
 9 |   // validation limits
10 |   bulk = 50
11 |   numHits = 1000
12 |   fuzzyMinLength = 2
13 |   fuzzyMaxEdits = 2
14 |   fuzzyPrefixLength = 0
15 |   
16 |   interface = "0.0.0.0"
17 |   port = 9040
18 |   
19 |   indexDir = ${?GNAF_SEARCH_INDEX_DIR}
20 |   
21 |   numHits = ${?GNAF_SEARCH_NUM_HITS}
22 |   fuzzyMinLength = ${?GNAF_SEARCH_FUZZY_MIN_LENGTH}
23 |   fuzzyMaxEdits = ${?GNAF_SEARCH_FUZZY_MAX_EDITS}
24 |   fuzzyPrefixLength = ${?GNAF_SEARCH_FUZZY_PREFIX_LENGTH}
25 |   
26 |   interface = ${?GNAF_SEARCH_INTERFACE}
27 |   port = ${?GNAF_SEARCH_PORT}
28 | }
29 | 
30 | 


--------------------------------------------------------------------------------
/gnaf-indexer/README.md:
--------------------------------------------------------------------------------
 1 | # gnaf-indexer
 2 | 
 3 | ## Introduction
 4 | 
 5 | This project loads JSON address data from `gnaf-extractor` into a [Lucene](https://lucene.apache.org/) index.
 6 | Originally Elasticsearch was used, but it was found that significant tweaks to scoring were required for good results
 7 | and this was easiest achieved in raw Lucene (which also provided significant speed improvements).
 8 | 
 9 | ## Configuration
10 | 
11 | Configuration is in [application.conf](src/main/resources/application.conf) and most settings can be overridden with environment variables.
12 | The index directory can also be set with a command line option (overriding the above, use `--help` for details).
13 | 
14 | ## Running and Usage
15 | 
16 | See `gnaf/src/main/script/run.sh`.
17 | 


--------------------------------------------------------------------------------
/gnaf-ui/html/index.html:
--------------------------------------------------------------------------------
 1 | <!DOCTYPE html>
 2 | <html>
 3 | <head>
 4 |   <meta charset="UTF-8">
 5 |   <title>G-NAF</title>
 6 |   <link rel="icon" type="image/ico" href="http://www.csiro.au/themes/default/img/favicon.ico" />
 7 |   <link rel="stylesheet" type="text/css" href="https://code.jquery.com/ui/1.11.4/themes/smoothness/jquery-ui.css">
 8 |   <link rel="stylesheet" type="text/css" href="index.css">
 9 | </head>
10 | <body>
11 |   <h1>G-NAF</h1>
12 |   <div id="gnaf"></div>
13 |   <script type="text/javascript" src="https://code.jquery.com/jquery-2.2.2.min.js"></script>
14 |   <script type="text/javascript" src="https://code.jquery.com/ui/1.11.4/jquery-ui.min.js"></script>
15 |   <script type="text/javascript" src="index.js"></script>
16 |   <script type="text/javascript">$(document).ready(initGnaf);</script>
17 | </body>
18 | </html>
19 | 
20 | 
21 | 


--------------------------------------------------------------------------------
/project/plugins.sbt:
--------------------------------------------------------------------------------
 1 | // I got it saying: Source code has generated in /home/neil/sw/gnaf/target/scala-2.11/src_managed/main/au/com/data61/gnaf/db/Tables.scala
 2 | // but this file was not actually created, so I'm giving up on this plugin for now. 
 3 | // addSbtPlugin("com.github.tototoshi" % "sbt-slick-codegen" % "1.2.0")
 4 | 
 5 | // required by above
 6 | // libraryDependencies += "com.h2database" % "h2" % "1.4.191"
 7 | 
 8 | addSbtPlugin("io.spray" % "sbt-revolver" % "0.8.0")
 9 | 
10 | addSbtPlugin("com.typesafe.sbteclipse" % "sbteclipse-plugin" % "4.0.0")
11 | 
12 | addSbtPlugin("org.scala-sbt.plugins" % "sbt-onejar" % "0.8")
13 | 
14 | addSbtPlugin("net.virtual-void" % "sbt-dependency-graph" % "0.8.2")
15 | 
16 | addSbtPlugin("com.typesafe.sbt" % "sbt-license-report" % "1.1.0")
17 | 
18 | addSbtPlugin("com.github.gseitz" % "sbt-release" % "1.0.3")
19 | 


--------------------------------------------------------------------------------
/gnaf-test/src/test/scala/au/csiro/data61/gnaf/test/MainTest.scala:
--------------------------------------------------------------------------------
 1 | package au.csiro.data61.gnaf.test
 2 | 
 3 | import org.scalatest.FlatSpec
 4 | import org.scalatest.Matchers
 5 | import Main._
 6 | 
 7 | class MainTest extends FlatSpec with Matchers {
 8 |   
 9 |   val s = "some test string"
10 |   
11 |   val typo = "\\S{2}~".r
12 | 
13 |   "mkTypo" should "make one random typo and not in the first two chars of a word" in {
14 |     val s = Seq(Some("the quick brown fox"), None, Some("jumped over the lazy"), Some("fence"))
15 |     (0 to 100).foreach { _ =>
16 |       (s zip mkTypo(s)).count { case (a, b) =>
17 |         val notEq = a != b
18 |         if (notEq) {
19 |           log.debug(b.toString)
20 |           b.isDefined should be (true)
21 |           typo.findFirstIn(b.get).isDefined should be (true)
22 |         }
23 |         notEq
24 |       } should be(1)
25 |     }
26 |   }
27 |   
28 | }


--------------------------------------------------------------------------------
/gnaf-contrib/build.sbt:
--------------------------------------------------------------------------------
 1 | name := "gnaf-contrib"
 2 | 
 3 | libraryDependencies ++= Seq(
 4 |   "com.h2database" % "h2" % "1.4.193", // or postgres or whatever,  % "runtime" should be enough, but sbt slick.codegen needs it on compile classpath
 5 |   "ch.megard" %% "akka-http-cors" % "0.1.2",
 6 |   "com.github.swagger-akka-http" %% "swagger-akka-http" % "0.7.0", // adding swagger brings in all the horrible old javax.ws & Jackson dependencies!
 7 |   "io.swagger" % "swagger-annotations" % "1.5.9"
 8 |   )
 9 | 
10 | libraryDependencies ++= Seq(
11 |   "slick-codegen", // only needed when generating slick mapping
12 |   "slick",
13 |   "slick-hikaricp"
14 | ) map ("com.typesafe.slick" %% _ % "3.1.1")
15 | 
16 | libraryDependencies ++= Seq(
17 |   "akka-actor",
18 |   "akka-stream",
19 |   "akka-http-experimental",
20 |   "akka-http-spray-json-experimental",
21 |   "akka-http-testkit"
22 |   ) map ("com.typesafe.akka" %% _ % "2.4.3")
23 | 
24 |   


--------------------------------------------------------------------------------
/src/main/script/checkupdates.sh:
--------------------------------------------------------------------------------
 1 | #! /bin/bash
 2 | 
 3 | # This script simply checks the current data and determines whether there are updates that need to be applied to
 4 | # our production environment.
 5 | 
 6 | jsonUrl=http://www.data.gov.au/api/3/action/package_show?id=19432f89-dc3a-4ef3-b943-5326ef1dbecc
 7 | prodUrl=http://gnaf.nationalmap.nicta.com.au/v2/version
 8 | 
 9 | last_modified=$( curl -sL $jsonUrl | jq -r '.result.resources[] | select(.format == "ZIP") | .last_modified' )
10 | 
11 | existing_last_modified=$(curl -sL $prodUrl | jq -r '.["gnaf-version"]' || echo None_Found)
12 | 
13 | echo "Last modified date in production: $existing_last_modified";
14 | echo "Last modified date from data.gov.au: $last_modified";
15 | 
16 | if [[ "$last_modified" != "$existing_last_modified" ]]; then
17 |     echo "New data found!";
18 |     exit 0
19 | else
20 |     echo "No new data found, exiting with exit code 1";
21 |     exit 1;
22 | fi
23 | 


--------------------------------------------------------------------------------
/gnaf-ui/README.md:
--------------------------------------------------------------------------------
 1 | # gnaf-ui
 2 | 
 3 | ## Introduction
 4 | This project consists of static files providing a demonstration web user interface using Elasticsearch and the gnaf-service.
 5 | It uses ECMAScript 6 and so only runs in some modern browsers (Chrome, Firefox, Edge, not yet Safari).
 6 | 
 7 | ## Configuration
 8 | 
 9 | The function `initBaseUrl` in `index.js` determines the URLs used to access the servers depending on the protocol used to serve the webapp.
10 | If the `file:` protocol is used (`index.html` was opened as a file rather than from a web server) then then `http://localhost is used to access the servers.
11 | Otherwise the protocol and host used to serve the webapp is used.
12 | 
13 | ## Running and Usage
14 | 
15 | Cors access to servers isn't working from a `file:` URL.
16 | 
17 | To use python's simple web server to serve the UI over HTTP, run from the html directory: `python3 -m http.server`. Access the UI at: http://localhost:8000/.
18 | 


--------------------------------------------------------------------------------
/gnaf-util/src/main/scala/au/csiro/data61/gnaf/util/Util.scala:
--------------------------------------------------------------------------------
 1 | package au.csiro.data61.gnaf.util
 2 | 
 3 | import com.typesafe.scalalogging.Logger
 4 | import org.slf4j.LoggerFactory
 5 | 
 6 | object Util {
 7 |   def loader = getClass.getClassLoader // or Thread.currentThread.getContextClassLoader
 8 | 
 9 |   /** Get a Scala singleton Object.
10 |     * @param fqn object's fully qualified name
11 |     * @return object as type T
12 |     */
13 |   def getObject[T](fqn: String): T = {
14 |     val m = scala.reflect.runtime.universe.runtimeMirror(loader)
15 |     m.reflectModule(m.staticModule(fqn)).instance.asInstanceOf[T]
16 |   }
17 | 
18 |   /**
19 |    * It appears that configuring a logger name containing a '$' in logback.xml doesn't work, so convert Scala object names ending in '$' to use '.' instead.
20 |    */
21 |   def logName(c: Class[_]) = c.getName.replace('$', '.')
22 |   
23 |   def getLogger(c: Class[_]) = Logger(LoggerFactory.getLogger(logName(c)))
24 | }
25 | 


--------------------------------------------------------------------------------
/gnaf-db-service/src/main/resources/application.conf:
--------------------------------------------------------------------------------
 1 | gnafDb = {
 2 |   connectionPool = HikariCP // this is the default
 3 |   
 4 |   slickDriver = slick.driver.H2Driver
 5 |   url = "jdbc:h2:file:~/gnaf"
 6 |   driver = org.h2.Driver
 7 |   
 8 |   // slickDriver = slick.driver.PostgresDriver
 9 |   // url = "jdbc:postgresql://localhost/gnaf"
10 |   // driver = org.postgresql.Driver
11 |   
12 |   readOnly = true
13 |   user = "READONLY"
14 |   password = "READONLY"
15 |   numThreads       =  4
16 |   queueSize        =  100
17 |   maxConnections   =  10
18 |   minConnections   =  1
19 |   connectionTimeout = 10000
20 |   initializationFailFast = false
21 | 
22 |   slickDriver = ${?GNAF_SLICK_DRIVER} // optional override by environment variable
23 |   url = ${?GNAF_JDBC_URL}
24 |   driver = ${?GNAF_JDBC_DRIVER}  
25 |   user = ${?GNAF_JDBC_USER}  
26 |   password = ${?GNAF_JDBC_PASSWORD}  
27 | }
28 | 
29 | akka {
30 |   loglevel = INFO
31 | }
32 | 
33 | http {
34 |   interface = "0.0.0.0"
35 |   port = 9000
36 |   
37 |   interface = ${?GNAF_DB_SERVICE_INTERFACE}
38 |   port = ${?GNAF_DB_SERVICE_PORT}
39 | }
40 | 


--------------------------------------------------------------------------------
/gnaf-extractor/src/main/resources/application.conf:
--------------------------------------------------------------------------------
 1 | gnafDb = {
 2 |   connectionPool = HikariCP // this is the default
 3 |   
 4 |   slickDriver = slick.driver.H2Driver
 5 |   url = "jdbc:h2:file:~/gnaf;max_memory_rows=600000" // requires db admin rights
 6 |   driver = org.h2.Driver
 7 |   
 8 |   // slickDriver = slick.driver.PostgresDriver
 9 |   // url = "jdbc:postgresql://localhost/gnaf"
10 |   // driver = org.postgresql.Driver
11 |   
12 |   readOnly = true
13 |   user = "gnaf" // "READONLY"
14 |   password = "gnaf" // "READONLY"
15 |   numThreads       =  4
16 |   queueSize        =  600000
17 |   maxConnections   =  20
18 |   minConnections   =  4
19 |   connectionTimeout = 120000
20 |   initializationFailFast = false
21 | 
22 |   slickDriver = ${?GNAF_SLICK_DRIVER} // optional override by environment variable
23 |   url = ${?GNAF_JDBC_URL}
24 |   driver = ${?GNAF_JDBC_DRIVER}  
25 |   user = ${?GNAF_JDBC_USER}  
26 |   password = ${?GNAF_JDBC_PASSWORD}  
27 |   
28 |   localityTimeout = 60   // timeout in minutes for all queries for a locality  
29 |   allTimeout = 1000      // timeout in minutes for all queries  
30 | }
31 | 


--------------------------------------------------------------------------------
/Dockerfile:
--------------------------------------------------------------------------------
 1 | # Build
 2 | FROM ubuntu:16.04 as builder
 3 | 
 4 | WORKDIR /
 5 | 
 6 | RUN apt-get update
 7 | 
 8 | RUN apt-get -y install apt-transport-https
 9 | 
10 | RUN echo "deb https://dl.bintray.com/sbt/debian /" | tee -a /etc/apt/sources.list.d/sbt.list
11 | RUN echo "deb http://apt.postgresql.org/pub/repos/apt/ xenial-pgdg main"
12 | RUN apt-key adv --keyserver hkp://p80.pool.sks-keyservers.net:80 --recv-keys B97B0AFCAA1A47F044F244A07FCC7D46ACCC4CF8
13 | RUN apt-key adv --keyserver hkp://keyserver.ubuntu.com:80 --recv 2EE0EA64E40A89B84B2DF73499E82A75642AC823
14 | 
15 | RUN apt-get update
16 | 
17 | RUN apt-get -y install openjdk-8-jre sbt jq postgresql-client-9.5 wget curl zip time
18 | 
19 | ADD . /
20 | 
21 | RUN /bin/bash src/main/script/run.sh
22 | 
23 | # Run
24 | FROM openjdk:8-jre
25 | 
26 | WORKDIR /
27 | 
28 | COPY --from=builder /indexDir /indexDir
29 | COPY --from=builder /gnaf-search/target/scala-2.11/gnaf-search_2.11-1.1-SNAPSHOT-one-jar.jar /gnaf-search/target/scala-2.11/gnaf-search_2.11-1.1-SNAPSHOT-one-jar.jar
30 | 
31 | EXPOSE 9040
32 | 
33 | CMD ["java", "-jar", "/gnaf-search/target/scala-2.11/gnaf-search_2.11-1.1-SNAPSHOT-one-jar.jar"]
34 | 


--------------------------------------------------------------------------------
/gnaf-contrib/src/main/resources/application.conf:
--------------------------------------------------------------------------------
 1 | gnafContribDb = {
 2 |   connectionPool = HikariCP // this is the default
 3 |   
 4 |   slickDriver = slick.driver.H2Driver
 5 |   url = "jdbc:h2:file:~/gnafContrib"
 6 |   driver = org.h2.Driver
 7 |   
 8 |   // slickDriver = slick.driver.PostgresDriver
 9 |   // url = "jdbc:postgresql://localhost/gnafContrib"
10 |   // driver = org.postgresql.Driver
11 |   
12 |   user = "gnaf"
13 |   password = "gnaf"
14 |   numThreads       =  4
15 |   maxConnections   =  10      // 1 for each of above + 1 for each concurrently run Future + 2 spare
16 |   minConnections   =  1
17 |   connectionTimeout = 10000
18 |   initializationFailFast = false
19 | 
20 |   slickDriver = ${?GNAF_CONTRIB_SLICK_DRIVER} // optional override by environment variable
21 |   url = ${?GNAF_CONTRIB_JDBC_URL}
22 |   driver = ${?GNAF_CONTRIB_JDBC_DRIVER}  
23 |   user = ${?GNAF_CONTRIB_JDBC_USER}  
24 |   password = ${?GNAF_CONTRIB_JDBC_PASSWORD}  
25 | }
26 | 
27 | akka {
28 |   loglevel = INFO
29 | }
30 | 
31 | http {
32 |   interface = "0.0.0.0"
33 |   port = 9010
34 |   
35 |   interface = ${?GNAF_CONTRIB_SERVICE_INTERFACE}
36 |   port = ${?GNAF_CONTRIB_SERVICE_PORT}
37 | }
38 | 
39 | 


--------------------------------------------------------------------------------
/gnaf-test/src/main/resources/application.conf:
--------------------------------------------------------------------------------
 1 | // not sure if we should move this to gnaf-common
 2 | gnafDb = {
 3 |   connectionPool = HikariCP // this is the default
 4 |   
 5 |   slickDriver = slick.driver.H2Driver
 6 |   url = "jdbc:h2:file:~/gnaf" // ;max_memory_rows=100000 // requires db admin rights
 7 |   driver = org.h2.Driver
 8 |   
 9 |   // slickDriver = slick.driver.PostgresDriver
10 |   // url = "jdbc:postgresql://localhost/gnaf"
11 |   // driver = org.postgresql.Driver
12 |   
13 |   readOnly = true
14 |   user = "READONLY"
15 |   password = "READONLY"
16 |   numThreads       =  4
17 |   queueSize        =  987654  // failed with ~1000, so Future callbacks must go on this queue too
18 |   maxConnections   =  10      // 1 for each of above + 1 for each concurrently run Future + 2 spare
19 |   minConnections   =  1
20 |   connectionTimeout = 10000
21 |   initializationFailFast = false
22 | 
23 |   slickDriver = ${?GNAF_SLICK_DRIVER} // optional override by environment variable
24 |   url = ${?GNAF_JDBC_URL}
25 |   driver = ${?GNAF_JDBC_DRIVER}  
26 |   user = ${?GNAF_JDBC_USER}  
27 |   password = ${?GNAF_JDBC_PASSWORD}  
28 | }
29 | 
30 | gnafTest = {
31 |   sampleSize = 100
32 |   sampleSize = ${?GNAF_TEST_SAMPLE_SIZE}
33 | }


--------------------------------------------------------------------------------
/gnaf-test/src/main/script/diff.js:
--------------------------------------------------------------------------------
 1 | var fs = require('fs');
 2 | 
 3 | /**
 4 |  * Usage: node src/main/node/diff.js otherDir files ...
 5 |  */
 6 | 
 7 | // 0 -> node; 1 -> src/main/script/diff.js; 2 -> otherDir; 3 -> files ...
 8 | var otherDir = process.argv[2];
 9 | 
10 | var m = new Map();
11 | for (i = 3; i < process.argv.length; ++i) {
12 |   var stats = JSON.parse(fs.readFileSync(process.argv[i], "utf8"));
13 |   for (desc in stats.histogram) {
14 |     var o = stats.histogram[desc];
15 |     for (p in o) histAdd(m, p, o[p]);
16 |   }
17 |   stats = JSON.parse(fs.readFileSync(otherDir + '/' + process.argv[i], "utf8"));
18 |   for (desc in stats.histogram) {
19 |     var o = stats.histogram[desc];
20 |     for (p in o) histAdd(m, p, -o[p]);
21 |   }
22 | }
23 | 
24 | var sum = 0;
25 | for (i of m.values()) sum += i;
26 | console.log(JSON.stringify({ samples: sum, histogram: mapToArr(m) }));
27 | 
28 | /** Add v occurrences of k to a histogram map.
29 |  * 
30 |  * @param m histogram map: k -> occurrence count of k
31 |  * @param k key
32 |  * @param v new occurrences of k to add
33 |  */
34 | function histAdd(m, k, v) {
35 |   var n = m.get(k);
36 |   m.set(k, n ? n + v : v);
37 | }
38 | 
39 | function mapToArr(m) {
40 |   var a = [];
41 |   for (e of m) a.push(e);
42 |   return a;
43 | } 
44 | 


--------------------------------------------------------------------------------
/gnaf-search/build.sbt:
--------------------------------------------------------------------------------
 1 | name := "gnaf-search"
 2 | 
 3 | // resourceGenerators in Compile += Def.task {
 4 | //   val file = (resourceManaged in Compile).value / "demo" / "myapp.properties"
 5 | //   val contents = "name=%s\nversion=%s".format(name.value,version.value)
 6 | //   IO.write(file, contents)
 7 | //   Seq(file)
 8 | // }.taskValue
 9 | 
10 | // mappings in (Compile, packageBin) += {
11 | //   (resourceManaged in Compile).value / "demo" / "myapp.properties" -> "demo/myapp.properties"
12 | // }
13 | 
14 | mappings in (Compile, packageBin) += {
15 |   new File("gnaf-db/target/generated/version.json") -> "version.json"
16 | }
17 | 
18 | libraryDependencies ++= Seq(
19 |   "com.github.scopt" %% "scopt" % "3.3.0",
20 |   "com.jsuereth" %% "scala-arm" % "2.0.0-M1",
21 |   "ch.megard" %% "akka-http-cors" % "0.1.2",
22 |   "com.github.swagger-akka-http" %% "swagger-akka-http" % "0.7.0", // adding swagger brings in all the horrible old javax.ws & Jackson dependencies!
23 |   "io.swagger" % "swagger-annotations" % "1.5.9",
24 |   "org.scalatest" %% "scalatest" % "3.0.0" % "test"
25 |   )
26 | 
27 | libraryDependencies ++= Seq(
28 |   "akka-actor",
29 |   "akka-stream",
30 |   "akka-http-experimental",
31 |   "akka-http-spray-json-experimental",
32 |   "akka-http-testkit"
33 |   ) map ("com.typesafe.akka" %% _ % "2.4.3")
34 | 
35 |  


--------------------------------------------------------------------------------
/gnaf-extractor/src/main/script/loadElasticsearch.sh:
--------------------------------------------------------------------------------
 1 | #! /bin/bash
 2 | 
 3 | set -ex
 4 | GNAF=$PWD
 5 | 
 6 | DIR=tmp
 7 | # DIR=/srv/gnaf/data # for http://gnaf.it.csiro.au/ (no space in user home dir)
 8 | 
 9 | if false
10 | then
11 | 
12 | # run Scala program, takes about 25min with a SSD
13 | rm -f gnaf-indexer.log
14 | mkdir -p $DIR
15 | time java -Xmx3G -jar target/scala-2.11/gnaf-extractor_2.11-0.1-SNAPSHOT-one-jar.jar | gzip > $DIR/out.gz
16 | mv gnaf-indexer.log $DIR
17 | 
18 | fi
19 | 
20 | (
21 |   cd $DIR
22 |   
23 |   # transform output of Scala program to suit Elasticsearch 'bulk' API, takes about 15min with a SSD (was 32min using jq)
24 |   time zcat out.gz | node $GNAF/src/main/script/loadElasticsearch.js > bulk
25 | 
26 |   # split 'bulk' file into chunks not too big for a POST request
27 |   rm -f chunk-???
28 |   split -l10000 -a3 bulk chunk-
29 | )
30 | 
31 | # backup old index? (for cluster.name: neilsElasSrch set in elasticsearch.yml)
32 | # tar cvfz index1.tar.gz -C /var/lib/elasticsearch/neilsElasSrch/ nodes
33 | 
34 | # delete any old index
35 | curl -XDELETE 'localhost:9200/gnaf/'
36 | 
37 | # create new index with custom field mappings
38 | curl -XPUT 'localhost:9200/gnaf/' --data-binary @src/main/script/gnafMapping.json
39 | 
40 | # load the chunks using the Elasticsearch 'bulk' API, takes about 37min with a SSD
41 | time for i in $DIR/chunk-???
42 | do
43 |   echo $i
44 |   curl -s -XPOST localhost:9200/_bulk --data-binary @$i
45 | done
46 | 
47 | echo "all done"
48 | 
49 | 
50 | 


--------------------------------------------------------------------------------
/gnaf-util/3rd-party-licenses.html:
--------------------------------------------------------------------------------
1 | <html><head><title>gnaf-util-licenses</title></head><body><h1>gnaf-util-licenses</h1><table border="0" cellspacing="0" cellpading="1">
2 |       <thead><tr><th>Category</th><th>License</th><th>Dependency</th><th>Notes</th></tr></thead>
3 |     <tbody><tr><td>Apache&nbsp;</td><td><a href="http://www.apache.org/licenses/LICENSE-2.0.txt">Apache 2</a>&nbsp;</td><td>io.spray # spray-json_2.11 # 1.3.2&nbsp;</td><td></td></tr><tr><td>Apache&nbsp;</td><td><a href="http://www.apache.org/licenses/LICENSE-2.0.html">Apache 2.0 License</a>&nbsp;</td><td>com.typesafe.scala-logging # scala-logging_2.11 # 3.1.0&nbsp;</td><td></td></tr><tr><td>BSD&nbsp;</td><td><a href="http://www.scala-lang.org/license.html">BSD 3-Clause</a>&nbsp;</td><td>org.scala-lang # scala-library # 2.11.8&nbsp;</td><td></td></tr><tr><td>BSD&nbsp;</td><td><a href="http://www.scala-lang.org/license.html">BSD 3-Clause</a>&nbsp;</td><td>org.scala-lang # scala-reflect # 2.11.8&nbsp;</td><td></td></tr><tr><td>LGPL&nbsp;</td><td><a href="http://logback.qos.ch/license.html">EPL + GNU Lesser General Public License</a>&nbsp;</td><td>ch.qos.logback # logback-classic # 1.1.3&nbsp;</td><td></td></tr><tr><td>LGPL&nbsp;</td><td><a href="http://logback.qos.ch/license.html">EPL + GNU Lesser General Public License</a>&nbsp;</td><td>ch.qos.logback # logback-core # 1.1.3&nbsp;</td><td></td></tr><tr><td>MIT&nbsp;</td><td><a href="http://www.slf4j.org/license.html">MIT License</a>&nbsp;</td><td>org.slf4j # slf4j-api # 1.7.12&nbsp;</td><td></td></tr></tbody></table></body></html>


--------------------------------------------------------------------------------
/gnaf-test/src/main/script/summary.js:
--------------------------------------------------------------------------------
 1 | var fs = require('fs');
 2 | 
 3 | /**
 4 |  * Usage: node src/main/node/summary.js files ...
 5 |  * 
 6 |  * The test results (our input) are keyed by a description of the test data.
 7 |  * By default we sum the data for all descriptions except 'nofuzTypo', which is excluded because data
 8 |  * potentially containing typos should be searched with 'fuz'.
 9 |  * If a "-{desc}" precedes files then we sum only the descriptions matching this {desc}.  
10 |  */
11 | var argIdx = 2; // 0 -> node; 1 -> src/main/script/summary.js; 2 -> [-desc] files ...
12 | var descMatch = process.argv[argIdx].startsWith('-') ? process.argv[argIdx++].substring(1) : null;
13 | var descPred = descMatch ? desc => desc == descMatch : desc => desc != 'nofuzTypo';
14 | 
15 | var m = new Map();
16 | for (; argIdx < process.argv.length; ++argIdx) {
17 |   var stats = JSON.parse(fs.readFileSync(process.argv[argIdx], "utf8"));
18 |   for (desc in stats.histogram) {
19 |     if (descPred(desc)) {
20 |       var o = stats.histogram[desc];
21 |       for (p in o) histAdd(m, p, o[p]);
22 |     }
23 |   }
24 | }
25 | 
26 | var sum = 0;
27 | for (i of m.values()) sum += i;
28 | console.log(JSON.stringify({ samples: sum, histogram: mapToArr(m) }));
29 | 
30 | /** Add v occurrences of k to a histogram map.
31 |  * 
32 |  * @param m histogram map: k -> occurrence count of k
33 |  * @param k key
34 |  * @param v new occurrences of k to add
35 |  */
36 | function histAdd(m, k, v) {
37 |   var n = m.get(k);
38 |   m.set(k, n ? n + v : v);
39 | }
40 | 
41 | function mapToArr(m) {
42 |   var a = [];
43 |   for (e of m) a.push(e);
44 |   return a;
45 | } 
46 | 


--------------------------------------------------------------------------------
/gnaf-util/src/main/scala/au/csiro/data61/gnaf/util/Timer.scala:
--------------------------------------------------------------------------------
 1 | package au.csiro.data61.gnaf.util
 2 | 
 3 | /** Accumulates time since constructed or reset.
 4 |  */
 5 | class Timer {
 6 |   private var t0 = 0L      // start of currently measured time period
 7 |   private var elapsed = 0L // sum of previous time periods ended by stop/elapsedSecs
 8 | 
 9 |   reset
10 | 
11 |   def reset = {
12 |     elapsed = 0L
13 |     start
14 |   }
15 | 
16 |   /** `start` and `stop` need not be used - used to discard (not accumulate) the time between `stop` and `start`. */
17 |   def start = t0 = System.currentTimeMillis
18 | 
19 |   def stop = {
20 |     val t = System.currentTimeMillis
21 |     elapsed += (t - t0)
22 |     t0 = t // so subsequent `start` isn't required 
23 |   }
24 | 
25 |   /** Get accumulated seconds.
26 |    * 
27 |    *  Also does `stop`, so time between `elapsedSecs` and a subsequent `start` would not be accumulated.
28 |    */
29 |   def elapsedSecs: Float = {
30 |     stop
31 |     elapsed * 1e-3f
32 |   }
33 | 
34 | }
35 | 
36 | object Timer {
37 |   
38 |   private lazy val log = Util.getLogger(getClass)
39 |   
40 |   def apply() = new Timer()
41 |   
42 |   /** Log elapsed time as info.
43 |    *  
44 |    *  Usage:
45 |    *  {{{
46 |    *  val a: A = timed("it took {} secs") {
47 |    *     ...
48 |    *     new A()
49 |    *  }
50 |    *  }}}
51 |    *  
52 |    *  @param msg contains "{}" which is replaced by the elapsed time in secs
53 |    *  @param action thunk to execute and time
54 |    */
55 |   def timed[T](msg: String)(action: => T) = {
56 |     val t = Timer()
57 |     val x = action
58 |     log.info(msg, t.elapsedSecs.toString)
59 |     x
60 |   }
61 | }


--------------------------------------------------------------------------------
/gnaf-contrib/README.md:
--------------------------------------------------------------------------------
 1 | # gnaf-contrib
 2 | 
 3 | ## Introduction
 4 | This project provides a [Scala](http://scala-lang.org/) [RESTful](https://en.wikipedia.org/wiki/Representational_state_transfer) JSON
 5 | web service providing access to the
 6 | gnafContrib database of user supplied geocodes.
 7 | 
 8 | This is a stand-alone webapp and does not run in a servlet container.
 9 | On startup the database schema is created if it doesn't already exist.
10 | 
11 | ## Configuration
12 | 
13 | Configuration is in [application.conf](src/main/resources/application.conf) and most settings can be overridden with environment variables.
14 | 
15 | ## Running and Usage
16 | 
17 | See `gnaf/src/main/script/run.sh`.
18 | 
19 | 
20 | ### Generate Slick bindings
21 | 
22 | The slick bindings can be written by hand, but its quicker to generate the bindings from a manually created database table: 
23 | 
24 | Create and connect to a new database with dburl `jdbc:h2:file:~/gnafContrib`, username `gnaf` and password `gnaf`.
25 | Create a table from which the bindings will be generated:
26 | 
27 | 	CREATE TABLE ADDRESS_SITE_GEOCODE (
28 | 	  id long IDENTITY,                      -- auto-inc primary key
29 | 	  contrib_status varchar(15) NOT NULL,   -- ‘SUBMITTED’|‘PUBLISHED’
30 | 	  address_site_geocode_pid varchar(15),  -- set to correct a gnaf geocode, null to add a new one
31 | 	  date_created date NOT NULL,
32 | 	  version int NOT NULL,                  -- optimistic locking row version
33 | 	  address_site_pid varchar(15) NOT NULL,
34 | 	  geocode_type_code varchar(4) NOT NULL,
35 | 	  longitude numeric(11,8) NOT NULL,
36 | 	  latitude numeric(10,8) NOT NULL
37 | 	);
38 | 
39 | Disconnect the SQL client from the database then, from the top level gnaf directory:
40 | 
41 |     sbt
42 |     > project gnafContrib
43 |     > console
44 |     slick.codegen.SourceCodeGenerator.main(
45 |         Array("slick.driver.H2Driver", "org.h2.Driver", "jdbc:h2:file:~/gnafContrib", "generated", "au.csiro.data61.gnaf.contrib.db", "gnaf", "gnaf")
46 |     )
47 | 
48 | This generates code in: `generated/au/csiro/data61/gnaf/contrib/db/Tables.scala`.
49 | The source file `src/main/scala/au/csiro/data61/gnaf/contrib/db/ContribTables.scala` is a very minor modification of this generated code.
50 | 
51 | 


--------------------------------------------------------------------------------
/gnaf-extractor/src/main/script/loadElasticsearch.js:
--------------------------------------------------------------------------------
 1 | var readline = require('readline');
 2 | 
 3 | var d61Num = a => [ a.prefix, a.number.toString(), a.suffix ].filter(a => a != null && a != "D61_NULL" && a != "-1").join("");
 4 | 
 5 | var d61NumLast = a => a.number == -1 ? "" : '-' + d61Num(a);
 6 | 
 7 | var d61StreetNum = a => a.numberFirst.number == -1 ? "" : d61Num(a.numberFirst) + d61NumLast(a.numberLast);
 8 | 
 9 | /** Each inner array gets indexed as a separate Lucene "value" in the "d61Address" field.
10 |  *  Although Lucene just concatenates all the values into the field there is a big position increment between the values
11 |  *  ("position_increment_gap": 100 set in gnafMapping.json) that stops phrase searches and shingles (n-grams) matching across values.
12 |  */
13 | var d61Address = a =>
14 |   [
15 |     [ a.addressSiteName, a.buildingName ],
16 |     [ a.flatTypeName, d61Num(a.flat) ], 
17 |     [ a.levelTypeName, d61Num(a.level) ],
18 |     [ d61StreetNum(a), a.street.name, a.street.typeCode, a.street.suffixName ],
19 |     [ a.localityName, a.stateAbbreviation, a.postcode ]
20 |   ].concat(
21 |     a.streetVariant.map( x => [ d61StreetNum(a), x.name, x.typeCode, x.suffixName ]), 
22 |     a.localityVariant.map( x => [ x.localityName, a.stateAbbreviation, a.postcode ])
23 |   ).map(x => x.filter(x => x != "" && x != null && x != "D61_NULL").join(" ")).filter(x => x != "");
24 | 
25 | var d61AddressNoAlias = a =>
26 |   [
27 |     a.addressSiteName, a.buildingName,
28 |     a.flatTypeName, d61Num(a.flat), 
29 |     a.levelTypeName, d61Num(a.level),
30 |     d61StreetNum(a), a.street.name, a.street.typeCode, a.street.suffixName,
31 |     a.localityName, a.stateAbbreviation, a.postcode
32 |   ].filter(x => x != "" && x != null && x != "D61_NULL").join(" ");
33 | 
34 | var rl = readline.createInterface({
35 |   input: process.stdin,
36 |   output: process.stdout,
37 |   terminal: false
38 | });
39 | 
40 | rl.on('line', function (l) {
41 |   var a = JSON.parse(l);
42 |   a["d61Address"] = d61Address(a);
43 |   a["d61AddressNoAlias"] = d61AddressNoAlias(a);
44 |   console.log(
45 |     JSON.stringify({ index: { _index: "gnaf", _type: "gnaf", _id: a.addressDetailPid } }) // one line of elasticsearch indexing metadata
46 |     + '\n' + JSON.stringify(a) // next line is document to index
47 |   );
48 | });


--------------------------------------------------------------------------------
/gnaf-ui/html/index.css:
--------------------------------------------------------------------------------
  1 | body {
  2 |   font-family: verdana, helvetica, arial, sans-serif;
  3 |   font-size: 14px;
  4 |   color: #404040;
  5 | }
  6 | 
  7 | h1, h2, h3, h4, .ui-tabs-nav .ui-state-active a {
  8 |   color: #98bf21;
  9 | }
 10 | 
 11 | /* .example { */
 12 | /*   padding-left: 10px; */
 13 | /*   font-size: small;	 */
 14 | /* } */
 15 | 
 16 | .label, label {
 17 |   color: #98bf21;
 18 |   font-weight: bold;
 19 |   margin-right: 5px;
 20 |   vertical-align: top;
 21 | }
 22 | 
 23 | .example {
 24 |   padding-left: 10px;
 25 |   font-size: small;	
 26 |   vertical-align: top;
 27 | }
 28 | 
 29 | .multi-line {
 30 |   display: inline-block;
 31 | }
 32 | 
 33 | #bulkLookup .header a {
 34 |   margin-left: 10px;
 35 | }
 36 | 
 37 | #bulkAddresses {
 38 |   height: 250px;
 39 |   width: 500px;
 40 | }
 41 | 
 42 | 
 43 | /* .error { */
 44 | /*   color: red; */
 45 | /*   margin-top: 15px; */
 46 | /*   font-family: monospace; */
 47 | /*   white-space: pre; */
 48 | /* } */
 49 | 
 50 | button {
 51 |   color: white;
 52 |   background-color: #98bf21;
 53 |   border: 1px outset #98bf21;
 54 |   margin-top: 10px;
 55 |   font-weight: bold;
 56 | }
 57 | 
 58 | table {
 59 |   border: 1px solid #98bf21;
 60 |   border-collapse: collapse;
 61 |   margin-top: 5px;
 62 | }
 63 | 
 64 | tr:nth-child(odd) {
 65 |   background: #EAF2D3;
 66 | }
 67 | 
 68 | th {
 69 |   text-align: center;
 70 |   padding: 3px 3px 3px 3px;
 71 |   background-color: #A7C942;
 72 |   color: white;
 73 | }
 74 | 
 75 | td {
 76 |   vertical-align: top;
 77 |   border: 1px solid #98bf21;
 78 |   padding: 2px 5px 2px 5px;
 79 | }
 80 | 
 81 | div.location {
 82 |   margin-bottom: 10px;
 83 | }
 84 | 
 85 | div.location a, #addressNearMe .refresh, .contribStatus .delete {
 86 |   margin-left: 5px;
 87 | }
 88 | 
 89 | #addressInput {
 90 |   width: 500px;
 91 | }
 92 | 
 93 | #addressLookupResult, #addressNearMeResult, #bulkResult, .addrType, .gnafGeocodes, .contribGeocodes {
 94 |   margin-top: 15px;
 95 | }
 96 | 
 97 | .contribStatus a {
 98 | 	
 99 | }
100 | .contribGeocodes .add input {
101 |   width: 150px;
102 | }
103 | 
104 | .contribGeocodes .geocodeTypeCode select {
105 |   width: 500px;
106 | }
107 | 
108 | .showGeoDetail {
109 |   margin-left: 10px;
110 | }
111 | 
112 | #streetFilter, #address {
113 |   width: 300px;
114 | }
115 | 
116 | 


--------------------------------------------------------------------------------
/gnaf-test/src/main/script/run.sh:
--------------------------------------------------------------------------------
 1 | #! /bin/bash
 2 | 
 3 | # Note: to count total searches performed:
 4 | # for i in address*.json; do jq '.|length' $i; done | awk '{ sum += $1 } END { print sum*6 }'
 5 | 
 6 | # Note: searchLucene.js performs 6 searches for each address:
 7 | # { fuz, noFuz } * { query, queryTypo, queryPostcodeBeforeState }
 8 | 
 9 | version=`sed 's/.*"\(.*\)"/\1/' ../version.sbt`
10 | scalaVersion=2.11
11 | 
12 | search="src/main/script/searchLucene.js" # "src/main/script/searchEs.js"
13 | url="http://localhost:9040/bulkSearch"   # "http://localhost:9200/gnaf/_msearch"
14 | skip="false"
15 | sampleSize=200
16 | addrDir=.
17 | statsDir=.
18 | 
19 | while getopts "u:n:sh" opt
20 | do
21 |   case $opt in
22 |     a) addrDir=$OPTARG ;;
23 |     o) statsDir=$OPTARG ;;
24 |     u) url=$OPTARG ;;
25 |     n) sampleSize=$OPTARG ;;
26 |     s) skip="true" ;;
27 |     h|"?") cat <<EOF
28 | usage: $0 -u url -s -h
29 |   -u url to set the search service endpoint, blank to skip search (default $url)
30 |   -n sampleSize passed to gnaf-test (default $sampleSize)
31 |   -s to skip generation of test address data (it must already exist in the current directory)
32 |   -a directory for address JSON files (default $addrDir)
33 |   -o directory for stats JSON files (default $statsDir)
34 |   -h for this help
35 | EOF
36 | 	exit 1 ;;
37 |   esac
38 | done
39 | 
40 | for opt in "" "--numberAdornments" "--flat" "--flat --noFlatType" "--level" "--streetAlias" "--localityAlias"
41 | do
42 |     echo "options: $opt ..."
43 |     name=${opt//--/}
44 |     name=${name// /-}
45 |     [[ -n "$name" ]] && name=-${name}
46 |     afile=$addrDir/address${name}.json
47 |     sfile=$statsDir/stats${name}.json
48 |     # sample size turns out smaller than requested because some locations have no addresses
49 |     # and others have no addresses with the requested characteristics (haven't figured out how to filter them out cheaply)
50 |     if [[ "$skip" != "true" ]]
51 |     then
52 |       time java -jar target/scala-${scalaVersion}/gnaf-test_${scalaVersion}-${version}-one-jar.jar --sampleSize $sampleSize $opt > $afile
53 |       wait # for previous node process
54 |       [[ -n "$url" ]] && node $search $url $afile > $sfile &
55 |     else
56 |       # re-run with same test data as before
57 |       node $search $url $afile > $sfile
58 |     fi
59 | done
60 | wait # for last node process
61 | 
62 | node src/main/script/summary.js $statsDir/stats*.json
63 | 


--------------------------------------------------------------------------------
/gnaf-indexer/src/main/scala/au/csiro/data61/gnaf/indexer/Indexer.scala:
--------------------------------------------------------------------------------
 1 | package au.csiro.data61.gnaf.indexer
 2 | 
 3 | import java.io.File
 4 | 
 5 | import scala.io.Source
 6 | 
 7 | import org.apache.lucene.document.{ Document, DoublePoint, Field }
 8 | 
 9 | import au.csiro.data61.gnaf.lucene.GnafLucene._
10 | import au.csiro.data61.gnaf.lucene.LuceneUtil.directory
11 | import au.csiro.data61.gnaf.util.Gnaf.Address
12 | import au.csiro.data61.gnaf.util.Gnaf.JsonProtocol.addressFormat
13 | import au.csiro.data61.gnaf.util.Util.getLogger
14 | import resource.managed
15 | import spray.json.pimpString
16 | 
17 | 
18 | object Indexer {
19 |   val log = getLogger(getClass)
20 |   
21 |   case class CliOption(indexDir: File)
22 |   val defaultCliOption = CliOption(new File("./indexDir"))
23 |     
24 |   def main(args: Array[String]) = {
25 |     val parser = new scopt.OptionParser[CliOption]("gnaf-indexer") {
26 |       head("gnaf-lucene-indexer", "0.x")
27 |       note("Load GNAF JSON into a Lucene index")
28 |       opt[File]('i', "indexDir") action { (x, c) =>
29 |         c.copy(indexDir = x)
30 |       } text (s"Lucene index directory, default ${defaultCliOption.indexDir}")
31 |       help("help") text ("prints this usage text")
32 |     }
33 |     parser.parse(args, defaultCliOption) foreach run
34 |     log.info("done")
35 |   }
36 | 
37 |   def addrToDoc(line: String) = {
38 |     val addr = line.parseJson.convertTo[Address]
39 |     val (d61Address, noneCount, d61AddressNoAlias) = addr.toD61Address
40 |     val doc = new Document
41 |     doc.add(new Field(F_JSON, line, storedNotIndexedFieldType))
42 |     for (l <- addr.location) doc.add(new DoublePoint(F_LOCATION, l.lat.toDouble, l.lon.toDouble))
43 |     for (a <- d61Address) doc.add(new Field(F_ADDRESS, a, addressFieldType))
44 |     for {
45 |       f <- addr.flat.toOptStr if addr.level.toOptStr.isEmpty
46 |       n <- addr.numberFirst.toOptStr
47 |     } doc.add(new Field(F_ADDRESS, f + BIGRAM_SEPARATOR + n, addressFieldType)) // explicitly add flat + street num bigram without extra unigrams 
48 |     for (i <- 0 until noneCount) doc.add(new Field(F_MISSING_DATA, MISSING_DATA_TOKEN, missingDataFieldType))
49 |     doc.add(new Field(F_ADDRESS_NOALIAS, d61AddressNoAlias, storedNotIndexedFieldType))
50 |     
51 |     doc
52 |   }
53 |   
54 |   def run(c: CliOption) = {
55 |     for {
56 |       indexer <- managed(mkIndexer(directory(c.indexDir)))
57 |       line <- Source.fromInputStream(System.in, "UTF-8").getLines
58 |     } {
59 |       indexer.addDocument(addrToDoc(line))
60 |     }
61 |   }
62 | }


--------------------------------------------------------------------------------
/gnaf-indexer/3rd-party-licenses.html:
--------------------------------------------------------------------------------
1 | <html><head><title>gnaf-indexer-licenses</title></head><body><h1>gnaf-indexer-licenses</h1><table border="0" cellspacing="0" cellpading="1">
2 |       <thead><tr><th>Category</th><th>License</th><th>Dependency</th><th>Notes</th></tr></thead>
3 |     <tbody><tr><td>Apache&nbsp;</td><td><a href="http://www.apache.org/licenses/LICENSE-2.0.txt">Apache 2</a>&nbsp;</td><td>io.spray # spray-json_2.11 # 1.3.2&nbsp;</td><td></td></tr><tr><td>Apache&nbsp;</td><td><a href="http://www.apache.org/licenses/LICENSE-2.0.html">Apache 2.0 License</a>&nbsp;</td><td>com.typesafe.scala-logging # scala-logging_2.11 # 3.1.0&nbsp;</td><td></td></tr><tr><td>Apache&nbsp;</td><td><a href="http://www.apache.org/licenses/LICENSE-2.0.txt">The Apache Software License, Version 2.0</a>&nbsp;</td><td>org.apache.lucene # lucene-analyzers-common # 6.2.1&nbsp;</td><td></td></tr><tr><td>Apache&nbsp;</td><td><a href="http://www.apache.org/licenses/LICENSE-2.0.txt">The Apache Software License, Version 2.0</a>&nbsp;</td><td>org.apache.lucene # lucene-core # 6.2.1&nbsp;</td><td></td></tr><tr><td>BSD&nbsp;</td><td><a href="https://github.com/data61/gnaf/blob/master/LICENSE.txt">BSD</a>&nbsp;</td><td>au.csiro.data61.gnaf # gnaf-lucene_2.11 # 0.8-SNAPSHOT&nbsp;</td><td></td></tr><tr><td>BSD&nbsp;</td><td><a href="https://github.com/data61/gnaf/blob/master/LICENSE.txt">BSD</a>&nbsp;</td><td>au.csiro.data61.gnaf # gnaf-util_2.11 # 0.8-SNAPSHOT&nbsp;</td><td></td></tr><tr><td>BSD&nbsp;</td><td><a href="http://www.scala-lang.org/license.html">BSD 3-Clause</a>&nbsp;</td><td>org.scala-lang # scala-library # 2.11.8&nbsp;</td><td></td></tr><tr><td>BSD&nbsp;</td><td><a href="http://www.scala-lang.org/license.html">BSD 3-Clause</a>&nbsp;</td><td>org.scala-lang # scala-reflect # 2.11.8&nbsp;</td><td></td></tr><tr><td>BSD&nbsp;</td><td><a href="http://www.opensource.org/licenses/bsd-license.php">BSD-Style</a>&nbsp;</td><td>com.jsuereth # scala-arm_2.11 # 2.0.0-M1&nbsp;</td><td></td></tr><tr><td>LGPL&nbsp;</td><td><a href="http://logback.qos.ch/license.html">EPL + GNU Lesser General Public License</a>&nbsp;</td><td>ch.qos.logback # logback-classic # 1.1.3&nbsp;</td><td></td></tr><tr><td>LGPL&nbsp;</td><td><a href="http://logback.qos.ch/license.html">EPL + GNU Lesser General Public License</a>&nbsp;</td><td>ch.qos.logback # logback-core # 1.1.3&nbsp;</td><td></td></tr><tr><td>MIT&nbsp;</td><td><a href="http://www.opensource.org/licenses/mit-license.php">MIT License</a>&nbsp;</td><td>com.github.scopt # scopt_2.11 # 3.3.0&nbsp;</td><td></td></tr><tr><td>MIT&nbsp;</td><td><a href="http://www.slf4j.org/license.html">MIT License</a>&nbsp;</td><td>org.slf4j # slf4j-api # 1.7.12&nbsp;</td><td></td></tr></tbody></table></body></html>


--------------------------------------------------------------------------------
/gnaf-extractor/README.md:
--------------------------------------------------------------------------------
 1 | # gnaf-extractor
 2 | 
 3 | ## Introduction
 4 | This project queries the gnaf database to produce JSON address data (for consumption by gnaf-indexer).
 5 | `src/main/script` contains obsolete scripts to load the output into Elasticsearch.
 6 | 
 7 | 
 8 | ## H2 Result Set Spooling
 9 | If an [H2](http://www.h2database.com/) result set contains more than
10 | [MAX_MEMORY_ROWS](http://www.h2database.com/html/grammar.html?highlight=max_memory_rows&search=MAX_MEMORY_ROWS#set_max_memory_rows),
11 | it is spooled to disk before the first row is provided to the client.
12 | The default is 40000 rows per GB of available RAM and setting a non-default value requires database admin rights (which we prefer to avoid using).
13 | Analysis in comments in `Extractor.scala` shows that it needs to handle result sets up to 95,004 rows, so allocating 3GB of heap (with `java -Xmx3G`) should avoid spooling.
14 | 
15 | ## Configuration
16 | 
17 | Configuration is in [application.conf](src/main/resources/application.conf) and most settings can be overridden with environment variables.
18 | The database URL can also be set with a command line option (overriding the above, use `--help` for details).
19 | 
20 | ## Running and Usage
21 | 
22 | See `gnaf/src/main/script/run.sh`.
23 | 
24 | ## To Do
25 | 
26 | Add Code/Name from the _AUT tables as synonyms (e.g. so ST will match STREET) to the phrase suggester.
27 | The current indexed term is the full name (which may contain spaces), so we need to add the abbreviation (which does not contain spaces).
28 | A difference in spaces alters the number of tokens and all the following term positions resulting in problems with phrase search.
29 | See https://www.elastic.co/guide/en/elasticsearch/guide/current/multi-word-synonyms.html, which suggest using "Simple Contraction".
30 | However we're using shingles/ngrams rather than phrase search, so do we have the same problem? Yes I think so.
31 | We should do contraction to the single term abreviation.
32 | Possible negative consequences? Synonyms create the risk of spurious matches. The tables contain some unused entries (e.g. the STREET_TYPE_AUT (AWLK, AIRWALK)) and many rarely used entries; using them all as synonyms increases the risk. e.g. ATM has small edit distance from ATMA, ATKA, ATEA (street names), so contracting "AUTOMATIC TELLER MACHINE" to "ATM" could result in these street names matching AUTOMATIC TELLER MACHINEs.
33 | Perhaps we need to be quite selective in the use of synonyms.
34 | 
35 | Other synonyms: "St" for "Saint", "Mt" for "Mount"?
36 | The "Example Queries" section shows that this should be handled already by the inclusion of street (locality) aliases.
37 | 
38 | At some cost in terms of speed, we could prioritize primary over secondary addresses and principle over alias addresses. But maybe the default higher weight given to shorter docs is already enough?
39 | 


--------------------------------------------------------------------------------
/gnaf-lucene/src/main/scala/au/csiro/data61/gnaf/lucene/LuceneUtil.scala:
--------------------------------------------------------------------------------
 1 | package au.csiro.data61.gnaf.lucene
 2 | 
 3 | import java.io.Closeable
 4 | import scala.util.Try
 5 | import org.apache.lucene.analysis.{ Analyzer, TokenStream }
 6 | import org.apache.lucene.analysis.tokenattributes.CharTermAttribute
 7 | import org.apache.lucene.document.Document
 8 | import org.apache.lucene.index.DirectoryReader
 9 | import org.apache.lucene.search.{ IndexSearcher, Query, ScoreDoc }
10 | import org.apache.lucene.store.Directory
11 | import au.csiro.data61.gnaf.util.Timer
12 | import au.csiro.data61.gnaf.util.Util.getLogger
13 | import java.io.File
14 | import org.apache.lucene.store.FSDirectory
15 | import org.apache.lucene.analysis.tokenattributes.CharTermAttribute
16 | 
17 | 
18 | /**
19 |  * Non GNAF specific code for Lucene indexing and searching.
20 |  * 
21 |  * simplified from: https://github.csiro.au/bac003/social-watch/blob/master/analytics/src/main/scala/org/t3as/socialWatch/analytics/LuceneUtil.scala
22 |  */
23 | object LuceneUtil {
24 |   val log = getLogger(getClass)
25 | 
26 |   def tokenIter(ts: TokenStream): Iterator[String] = {
27 |     ts.reset
28 |     Iterator.continually {
29 |       val more = ts.incrementToken
30 |       if (!more) {
31 |         ts.end
32 |         ts.close
33 |       }
34 |       more
35 |     }.takeWhile(identity).map(_ => ts.getAttribute(classOf[CharTermAttribute]).toString)
36 |   }
37 | 
38 |   def tokenIter(analyzer: Analyzer, fieldName: String, text: String): Iterator[String]
39 |     = tokenIter(analyzer.tokenStream(fieldName, text))
40 |     
41 |   def directory(indexDir: File) = FSDirectory.open(indexDir.toPath)
42 |   
43 |   class Searcher[Hit, Results](
44 |     directory: Directory,
45 |     toHit: (ScoreDoc, Document) => Hit, // convert score and map of fields to Hit
46 |     toResults: (Int, Float, Seq[Hit], Option[String]) => Results // convert totalHits, elapsedSecs, Seq[Hit], Option[error] to Results
47 |   ) extends Closeable {      
48 |     val log = getLogger(getClass)
49 | 
50 |     val searcher = open
51 |     protected def open = new IndexSearcher(DirectoryReader.open(directory))
52 |     
53 |     log.debug(s"Searcher: numDocs = ${searcher.getIndexReader.numDocs}")
54 |         
55 |     def search(q: Query, numHits: Int = 20) = {
56 |       val timer = Timer()
57 |       
58 |       val result = for {
59 |         topDocs <- Try {
60 |           searcher.search(q, numHits)
61 |         }
62 |         hits <- Try {
63 |           topDocs.scoreDocs map { scoreDoc => toHit(scoreDoc, searcher.doc(scoreDoc.doc)) }
64 |         }
65 |       } yield toResults(topDocs.totalHits, timer.elapsedSecs.toFloat, hits, None)
66 |       
67 |       result.recover { case e => toResults(0, timer.elapsedSecs.toFloat, List(), Some(e.getMessage)) }.get
68 |     }
69 |     
70 |     def close = searcher.getIndexReader.close
71 |   }
72 | 
73 | }


--------------------------------------------------------------------------------
/gnaf-db/3rd-party-licenses.html:
--------------------------------------------------------------------------------
1 | <html><head><title>gnaf-db-licenses</title></head><body><h1>gnaf-db-licenses</h1><table border="0" cellspacing="0" cellpading="1">
2 |       <thead><tr><th>Category</th><th>License</th><th>Dependency</th><th>Notes</th></tr></thead>
3 |     <tbody><tr><td>Apache&nbsp;</td><td><a href="http://www.apache.org/licenses/LICENSE-2.0.txt">Apache 2</a>&nbsp;</td><td>io.spray # spray-json_2.11 # 1.3.2&nbsp;</td><td></td></tr><tr><td>Apache&nbsp;</td><td><a href="http://www.apache.org/licenses/LICENSE-2.0.html">Apache 2.0 License</a>&nbsp;</td><td>com.typesafe.scala-logging # scala-logging_2.11 # 3.1.0&nbsp;</td><td></td></tr><tr><td>Apache&nbsp;</td><td><a href="http://www.apache.org/licenses/LICENSE-2.0">Apache License, Version 2.0</a>&nbsp;</td><td>com.typesafe # config # 1.2.1&nbsp;</td><td></td></tr><tr><td>Apache&nbsp;</td><td><a href="http://www.apache.org/licenses/LICENSE-2.0.txt">The Apache Software License, Version 2.0</a>&nbsp;</td><td>com.zaxxer # HikariCP-java6 # 2.3.7&nbsp;</td><td></td></tr><tr><td>Apache&nbsp;</td><td><a href="http://www.apache.org/licenses/LICENSE-2.0.txt">The Apache Software License, Version 2.0</a>&nbsp;</td><td>org.javassist # javassist # 3.19.0-GA&nbsp;</td><td></td></tr><tr><td>BSD&nbsp;</td><td><a href="https://github.com/data61/gnaf/blob/master/LICENSE.txt">BSD</a>&nbsp;</td><td>au.csiro.data61.gnaf # gnaf-util_2.11 # 0.8-SNAPSHOT&nbsp;</td><td></td></tr><tr><td>BSD&nbsp;</td><td><a href="http://www.scala-lang.org/license.html">BSD 3-Clause</a>&nbsp;</td><td>org.scala-lang # scala-library # 2.11.8&nbsp;</td><td></td></tr><tr><td>BSD&nbsp;</td><td><a href="http://www.scala-lang.org/license.html">BSD 3-Clause</a>&nbsp;</td><td>org.scala-lang # scala-reflect # 2.11.8&nbsp;</td><td></td></tr><tr><td>BSD&nbsp;</td><td><a href="http://github.com/slick/slick/blob/master/LICENSE.txt">Two-clause BSD-style license</a>&nbsp;</td><td>com.typesafe.slick # slick-codegen_2.11 # 3.1.1&nbsp;</td><td></td></tr><tr><td>BSD&nbsp;</td><td><a href="http://github.com/slick/slick/blob/master/LICENSE.txt">Two-clause BSD-style license</a>&nbsp;</td><td>com.typesafe.slick # slick-hikaricp_2.11 # 3.1.1&nbsp;</td><td></td></tr><tr><td>BSD&nbsp;</td><td><a href="http://github.com/slick/slick/blob/master/LICENSE.txt">Two-clause BSD-style license</a>&nbsp;</td><td>com.typesafe.slick # slick_2.11 # 3.1.1&nbsp;</td><td></td></tr><tr><td>CC0&nbsp;</td><td><a href="http://creativecommons.org/publicdomain/zero/1.0/">CC0</a>&nbsp;</td><td>org.reactivestreams # reactive-streams # 1.0.0&nbsp;</td><td></td></tr><tr><td>LGPL&nbsp;</td><td><a href="http://logback.qos.ch/license.html">EPL + GNU Lesser General Public License</a>&nbsp;</td><td>ch.qos.logback # logback-classic # 1.1.3&nbsp;</td><td></td></tr><tr><td>LGPL&nbsp;</td><td><a href="http://logback.qos.ch/license.html">EPL + GNU Lesser General Public License</a>&nbsp;</td><td>ch.qos.logback # logback-core # 1.1.3&nbsp;</td><td></td></tr><tr><td>MIT&nbsp;</td><td><a href="http://www.slf4j.org/license.html">MIT License</a>&nbsp;</td><td>org.slf4j # slf4j-api # 1.7.12&nbsp;</td><td></td></tr><tr><td>Mozilla&nbsp;</td><td><a href="http://h2database.com/html/license.html">MPL 2.0 or EPL 1.0</a>&nbsp;</td><td>com.h2database # h2 # 1.4.193&nbsp;</td><td></td></tr></tbody></table></body></html>


--------------------------------------------------------------------------------
/gnaf-lucene/3rd-party-licenses.html:
--------------------------------------------------------------------------------
1 | <html><head><title>gnaf-lucene-licenses</title></head><body><h1>gnaf-lucene-licenses</h1><table border="0" cellspacing="0" cellpading="1">
2 |       <thead><tr><th>Category</th><th>License</th><th>Dependency</th><th>Notes</th></tr></thead>
3 |     <tbody><tr><td>Apache&nbsp;</td><td><a href="http://www.apache.org/licenses/LICENSE-2.0.txt">Apache 2</a>&nbsp;</td><td>io.spray # spray-json_2.11 # 1.3.2&nbsp;</td><td></td></tr><tr><td>Apache&nbsp;</td><td><a href="http://www.apache.org/licenses/LICENSE-2.0.html">Apache 2.0 License</a>&nbsp;</td><td>com.typesafe.scala-logging # scala-logging_2.11 # 3.1.0&nbsp;</td><td></td></tr><tr><td>Apache&nbsp;</td><td><a href="http://www.apache.org/licenses/LICENSE-2.0.txt">The Apache Software License, Version 2.0</a>&nbsp;</td><td>org.apache.lucene # lucene-analyzers-common # 6.2.1&nbsp;</td><td></td></tr><tr><td>Apache&nbsp;</td><td><a href="http://www.apache.org/licenses/LICENSE-2.0.txt">The Apache Software License, Version 2.0</a>&nbsp;</td><td>org.apache.lucene # lucene-core # 6.2.1&nbsp;</td><td></td></tr><tr><td>Apache&nbsp;</td><td><a href="http://www.apache.org/licenses/LICENSE-2.0">the Apache License, ASL Version 2.0</a>&nbsp;</td><td>org.scalactic # scalactic_2.11 # 3.0.0&nbsp;</td><td></td></tr><tr><td>Apache&nbsp;</td><td><a href="http://www.apache.org/licenses/LICENSE-2.0">the Apache License, ASL Version 2.0</a>&nbsp;</td><td>org.scalatest # scalatest_2.11 # 3.0.0&nbsp;</td><td></td></tr><tr><td>BSD&nbsp;</td><td><a href="https://github.com/data61/gnaf/blob/master/LICENSE.txt">BSD</a>&nbsp;</td><td>au.csiro.data61.gnaf # gnaf-util_2.11 # 0.8-SNAPSHOT&nbsp;</td><td></td></tr><tr><td>BSD&nbsp;</td><td><a href="http://www.scala-lang.org/license.html">BSD 3-Clause</a>&nbsp;</td><td>org.scala-lang # scala-library # 2.11.8&nbsp;</td><td></td></tr><tr><td>BSD&nbsp;</td><td><a href="http://www.scala-lang.org/license.html">BSD 3-Clause</a>&nbsp;</td><td>org.scala-lang # scala-reflect # 2.11.8&nbsp;</td><td></td></tr><tr><td>BSD&nbsp;</td><td><a href="http://opensource.org/licenses/BSD-3-Clause">BSD 3-clause</a>&nbsp;</td><td>org.scala-lang.modules # scala-parser-combinators_2.11 # 1.0.4&nbsp;</td><td></td></tr><tr><td>BSD&nbsp;</td><td><a href="http://opensource.org/licenses/BSD-3-Clause">BSD 3-clause</a>&nbsp;</td><td>org.scala-lang.modules # scala-xml_2.11 # 1.0.5&nbsp;</td><td></td></tr><tr><td>BSD&nbsp;</td><td><a href="http://www.opensource.org/licenses/bsd-license.php">BSD-Style</a>&nbsp;</td><td>com.jsuereth # scala-arm_2.11 # 2.0.0-M1&nbsp;</td><td></td></tr><tr><td>LGPL&nbsp;</td><td><a href="http://logback.qos.ch/license.html">EPL + GNU Lesser General Public License</a>&nbsp;</td><td>ch.qos.logback # logback-classic # 1.1.3&nbsp;</td><td></td></tr><tr><td>LGPL&nbsp;</td><td><a href="http://logback.qos.ch/license.html">EPL + GNU Lesser General Public License</a>&nbsp;</td><td>ch.qos.logback # logback-core # 1.1.3&nbsp;</td><td></td></tr><tr><td>MIT&nbsp;</td><td><a href="http://www.opensource.org/licenses/mit-license.php">MIT License</a>&nbsp;</td><td>com.github.scopt # scopt_2.11 # 3.3.0&nbsp;</td><td></td></tr><tr><td>MIT&nbsp;</td><td><a href="http://www.slf4j.org/license.html">MIT License</a>&nbsp;</td><td>org.slf4j # slf4j-api # 1.7.12&nbsp;</td><td></td></tr></tbody></table></body></html>


--------------------------------------------------------------------------------
/gnaf-util/src/main/scala/au/csiro/data61/gnaf/util/Gnaf.scala:
--------------------------------------------------------------------------------
 1 | package au.csiro.data61.gnaf.util
 2 | 
 3 | import spray.json.DefaultJsonProtocol
 4 | 
 5 | object Gnaf {
 6 |   
 7 |   def join(s: Seq[Option[String]], delim: String): Option[String] = {
 8 |     val r = s.flatten.filter(_.nonEmpty).mkString(delim)
 9 |     if (r.nonEmpty) Some(r) else None
10 |   }
11 |   def d61Num(n: Option[Int]) = n.map(_.toString)
12 |   
13 |   case class PreNumSuf(prefix: Option[String], number: Option[Int], suffix: Option[String]) {
14 |     def toOptStr = join(Seq(prefix, d61Num(number), suffix), "")
15 |   }
16 |   
17 |   case class Street(name: String, typeCode: Option[String], typeName: Option[String], suffixCode: Option[String], suffixName: Option[String])
18 |   case class LocalityVariant(localityName: String)
19 |   case class Location(lat: BigDecimal, lon: BigDecimal)
20 |   case class Address(addressDetailPid: String, addressSiteName: Option[String], buildingName: Option[String],
21 |                      flatTypeCode: Option[String], flatTypeName: Option[String], flat: PreNumSuf,
22 |                      levelTypeCode: Option[String], levelTypeName: Option[String], level: PreNumSuf,
23 |                      numberFirst: PreNumSuf, numberLast: PreNumSuf,
24 |                      street: Option[Street], localityName: String, stateAbbreviation: String, stateName: String, postcode: Option[String],
25 |                      aliasPrincipal: Option[Char], primarySecondary: Option[Char],
26 |                      location: Option[Location], streetVariant: Seq[Street], localityVariant: Seq[LocalityVariant]) {
27 |         
28 |     def toD61Address = {
29 |       val streetNum = numberFirst.toOptStr.map(f => f + numberLast.toOptStr.map("-" + _).getOrElse(""))
30 |       val seqNoAlias = Seq(
31 |         Seq( addressSiteName, buildingName ), // each inner Seq optionally produces a String in the final Seq
32 |         Seq( flatTypeName, flat.toOptStr ), 
33 |         Seq( levelTypeName, level.toOptStr ),
34 |         Seq( streetNum, street.map(_.name), street.flatMap(_.typeCode), street.flatMap(_.suffixName) ),
35 |         Seq( Some(localityName), Some(stateAbbreviation), postcode )
36 |       )
37 |       val seqWithAlias = seqNoAlias ++
38 |         streetVariant.map(v => Seq( streetNum, Some(v.name), v.typeCode, v.suffixName )) ++
39 |         localityVariant.map(v => Seq( Some(v.localityName), Some(stateAbbreviation), postcode ))
40 |       val d61Address = seqWithAlias.map(inner => join(inner, " ")).flatten
41 |       val seqNoAlias2 = seqNoAlias.map(inner => join(inner, " "))
42 |       val noneCount = (streetNum +: seqNoAlias2).count(_.isEmpty) // count each empty streetNum and inner seq: site/building, flat, level 
43 |       val d61AddressNoAlias = join(seqNoAlias.map(inner => join(inner, " ")), " ").getOrElse("")
44 |       (d61Address, noneCount, d61AddressNoAlias)
45 |     }
46 |   }
47 | 
48 |   object JsonProtocol extends DefaultJsonProtocol {
49 |     implicit val preNumSufFormat = jsonFormat3(PreNumSuf)
50 |     implicit val streetFormat = jsonFormat5(Street)
51 |     implicit val locVarFormat = jsonFormat1(LocalityVariant)
52 |     implicit val locationFormat = jsonFormat2(Location)
53 |     implicit val addressFormat = jsonFormat21(Address)
54 |   }
55 |   
56 | }


--------------------------------------------------------------------------------
/gnaf-test/src/main/script/Maps.js:
--------------------------------------------------------------------------------
  1 | 'use strict';
  2 | 
  3 | var identity = x => x;
  4 | 
  5 | function mapToObj(m, f = identity) {
  6 |   var a = {};
  7 |   for (var [k, v] of m) a[k.toString()] = f(v);
  8 |   return a;    
  9 | }
 10 | 
 11 | var ctorMap = () => new Map();
 12 | var ctorArr = () => [];
 13 | 
 14 | 
 15 | 
 16 | /** Map where values are a container e.g. an array or another map */
 17 | class MapCont {
 18 |   
 19 |   /**
 20 |    * @param ctor constructor for a new container value in the map
 21 |    */
 22 |   constructor(ctor) {
 23 |     this.ctor = ctor;
 24 |     this.m = ctorMap();
 25 |   }
 26 |   
 27 |   /**
 28 |    * Get container for k, constructing and adding it if it doesn't exist.
 29 |    * @param k 
 30 |    */
 31 |   get(k) {
 32 |     var a = this.m.get(k);
 33 |     if (!a) {
 34 |       a = this.ctor();
 35 |       this.m.set(k, a);
 36 |     }
 37 |     return a;
 38 |   }
 39 |   
 40 |   /** convert to object (e.g. for JSON serialization) */
 41 |   object(f = identity) { return mapToObj(this.m, f); }
 42 | }
 43 | 
 44 | 
 45 | 
 46 | /** Map of maps: k1 -> k2 -> v */
 47 | class MapMap extends MapCont {
 48 |   
 49 |   constructor(ctor = ctorMap) {
 50 |     super(ctor);
 51 |   }
 52 |   
 53 |   get2(k1, k2) {
 54 |     return this.get(k1).get(k2);
 55 |   }
 56 |   
 57 |   set2(k1, k2, v) {
 58 |     this.get(k1).set(k2, v);
 59 |   }
 60 |   
 61 |   object(f = identity) { return super.object(v => mapToObj(v, f)); }
 62 | }
 63 | 
 64 | 
 65 | 
 66 | /** Map of histograms, where the histograms are maps: k2 -> count */
 67 | class MapHist extends MapMap {
 68 |   inc(k1, k2) {
 69 |     var m2 = this.get(k1);
 70 |     var n = m2.get(k2);
 71 |     m2.set(k2, n ? n + 1 : 1);
 72 |   }
 73 | }
 74 | 
 75 | 
 76 | 
 77 | /** Map of arrays */
 78 | class MapArr extends MapCont {
 79 |   
 80 |   constructor() {
 81 |     super(ctorArr);
 82 |   }
 83 |   
 84 |   append(k1, v) {
 85 |     this.get(k1).push(v);
 86 |   }
 87 | }
 88 | 
 89 | var ctorMapArr = () => new MapArr();
 90 | 
 91 | 
 92 | 
 93 | class MapMapCont extends MapMap {
 94 |   
 95 |   constructor(ctor) { // must provide object() 
 96 |     super(ctor);
 97 |   }
 98 |   
 99 |   object(f = identity) { return mapToObj(this.m, v => v.object(f)); }
100 | }
101 | 
102 | 
103 | 
104 | module.exports = {
105 |   identity: identity,
106 |   mapToObj: mapToObj,
107 |   ctorMapArr: ctorMapArr,
108 |   MapCont: MapCont,
109 |   MapMap: MapMap,
110 |   MapHist: MapHist,
111 |   MapArr: MapArr,
112 |   MapMapCont: MapMapCont
113 | }
114 | 
115 | // examples
116 | //var m = new MapMap();
117 | //m.set2("sally", 1, "fred")
118 | //m.set2("sally", 1, "sally")
119 | //m.set2("sally", 2, "fred")
120 | //console.log('MapMap.object:', m.object());
121 | //
122 | //var h = new MapHist();
123 | //h.inc("fred", 1);
124 | //h.inc("sally", 2);
125 | //h.inc("sally", 2);
126 | //console.log('MapHist.object:', h.object());
127 | //
128 | //var a = new MapArr();
129 | //a.append("sally", "fred");
130 | //a.append("sally", "sue");
131 | //console.log('MapArr.object:', a.object());
132 | //
133 | //var c = new MapMapCont(ctorMapArr);
134 | //c.get("sally").append("george", "fred");
135 | //c.get("sally").append("george", "sue");
136 | //console.log('MapMapCont.object:', c.object());
137 | 
138 | 
139 | 


--------------------------------------------------------------------------------
/gnaf-test/README.md:
--------------------------------------------------------------------------------
 1 | # gnaf-test
 2 | 
 3 | ## Introduction
 4 | 
 5 | This sub-project provides end to end evaluation by comparing address lookups with known correct results from the database.
 6 | This approach is motivated by the fact that search tuning must be evaluated across a wide range of test data.
 7 | 
 8 | ## Project Structure
 9 | 
10 | 1. The [Scala](http://scala-lang.org/) command line program `gnaf-test` extracts from the database random selections of addresses with user selected characteristics,
11 | such as using street or locality aliases, street number prefixes, suffixes or ranges, unit or level numbers,
12 | out of order elements (postcode before state) or intentional errors (to test fuzzy matching).
13 | It outputs JSON containing the search input as an address string and the correct result as a G-NAF address without aliases or errors.
14 | The `addressDetailPid` is not useful as the correct result because G-NAF contains addresses that are not unique (at least over the fields used here).
15 | 2. A [node.js](https://nodejs.org/en/) program [src/main/script/searchLucene.js](src/main/script/searchLucene.js) takes the above JSON, performs bulk lookups using the `gnaf-search` web service,
16 | computes the histogram of how often the correct result is the top hit (index 0),
17 | next hit (index 1) etc. or not in the top N hits (index -1).
18 | Where its not the top hit the problematic input address is output for further investigation.
19 | The histogram and problematic input addresses are output as JSON.
20 | 3. A [node.js](https://nodejs.org/en/) program [src/main/script/summary.js](src/main/script/summary.js) aggregates the above output into a single histogram. 
21 | 4. A [bash](https://www.gnu.org/software/bash/) script [src/main/script/run.sh](src/main/script/run.sh) runs all of the above.
22 | 
23 | ## Configuration
24 | 
25 | Configuration is in [application.conf](src/main/resources/application.conf) and most settings can be overridden with environment variables.
26 | 
27 | ## Dependencies
28 | 
29 | - install [node.js](https://nodejs.org/en/) and `npm`.
30 |   The Ubuntu packaged versions are too old but up-to-date packages are available [here](https://github.com/nodesource/distributions).
31 | - run `npm install` to install node package dependencies
32 | 
33 | ## Results
34 | 
35 | Overall results:
36 | 
37 | 	node src/main/script/summary.js stats*.json
38 | 	{"samples":4780,"histogram":[["0",4764],["1",7],["8",1],["-1",8]]}
39 | 
40 | A potential error reported for test addresses using street and locality aliases is (using [jq](https://stedolan.github.io/jq/) to filter and format):
41 | 
42 | 	jq .errors stats-localityAlias-streetAlias.json
43 | 	"nofuz": {
44 | 	  "-1": [
45 | 	    "MAIDENWELL-BUNYA MOUNTAINS ROAD PIMPIMBUDGEE QLD 4615"
46 | 	  ]
47 | 	}
48 | 
49 | A non-fuzzy search gets the same score for all the top 10 hits and they are all correct matches, just not the one we were looking for.
50 | Unfortunately G-NAF contains many duplicates with inconsistent usage of the main name and aliases.
51 | Most reported potential errors are similarly not actual errors.
52 |   
53 | Baseline for following comparisons:
54 | 
55 | 	node src/main/script/summary.js -nofuz stats*.json
56 | 	{"samples":956,"histogram":[["0",955],["-1",1]]}
57 | 
58 | Inputting a field out or order (postcode before state) looses bigram matches but only introduced one additional potential error:
59 | 	
60 | 	node src/main/script/summary.js -nofuzPostcodeBeforeState stats*.json
61 | 	{"samples":956,"histogram":[["0",954],["-1",2]]}
62 | 
63 | Adding a single character error and fuzzy matching also only introduced one additional potential error over the baseline:
64 | 
65 | 	node src/main/script/summary.js -fuzTypo stats*.json
66 | 	{"samples":956,"histogram":[["0",954],["1",1],["-1",1]]}
67 | 	
68 | 


--------------------------------------------------------------------------------
/gnaf-extractor/3rd-party-licenses.html:
--------------------------------------------------------------------------------
1 | <html><head><title>gnaf-extractor-licenses</title></head><body><h1>gnaf-extractor-licenses</h1><table border="0" cellspacing="0" cellpading="1">
2 |       <thead><tr><th>Category</th><th>License</th><th>Dependency</th><th>Notes</th></tr></thead>
3 |     <tbody><tr><td>Apache&nbsp;</td><td><a href="http://www.apache.org/licenses/LICENSE-2.0.txt">Apache 2</a>&nbsp;</td><td>io.spray # spray-json_2.11 # 1.3.2&nbsp;</td><td></td></tr><tr><td>Apache&nbsp;</td><td><a href="http://www.apache.org/licenses/LICENSE-2.0.html">Apache 2.0 License</a>&nbsp;</td><td>com.typesafe.scala-logging # scala-logging_2.11 # 3.1.0&nbsp;</td><td></td></tr><tr><td>Apache&nbsp;</td><td><a href="http://www.apache.org/licenses/LICENSE-2.0">Apache License, Version 2.0</a>&nbsp;</td><td>com.typesafe # config # 1.2.1&nbsp;</td><td></td></tr><tr><td>Apache&nbsp;</td><td><a href="http://www.apache.org/licenses/LICENSE-2.0.txt">The Apache Software License, Version 2.0</a>&nbsp;</td><td>com.zaxxer # HikariCP-java6 # 2.3.7&nbsp;</td><td></td></tr><tr><td>Apache&nbsp;</td><td><a href="http://www.apache.org/licenses/LICENSE-2.0.txt">The Apache Software License, Version 2.0</a>&nbsp;</td><td>org.javassist # javassist # 3.19.0-GA&nbsp;</td><td></td></tr><tr><td>BSD&nbsp;</td><td><a href="https://github.com/data61/gnaf/blob/master/LICENSE.txt">BSD</a>&nbsp;</td><td>au.csiro.data61.gnaf # gnaf-db_2.11 # 0.8-SNAPSHOT&nbsp;</td><td></td></tr><tr><td>BSD&nbsp;</td><td><a href="https://github.com/data61/gnaf/blob/master/LICENSE.txt">BSD</a>&nbsp;</td><td>au.csiro.data61.gnaf # gnaf-util_2.11 # 0.8-SNAPSHOT&nbsp;</td><td></td></tr><tr><td>BSD&nbsp;</td><td><a href="http://www.scala-lang.org/license.html">BSD 3-Clause</a>&nbsp;</td><td>org.scala-lang # scala-library # 2.11.8&nbsp;</td><td></td></tr><tr><td>BSD&nbsp;</td><td><a href="http://www.scala-lang.org/license.html">BSD 3-Clause</a>&nbsp;</td><td>org.scala-lang # scala-reflect # 2.11.8&nbsp;</td><td></td></tr><tr><td>BSD&nbsp;</td><td><a href="http://www.opensource.org/licenses/bsd-license.php">BSD-Style</a>&nbsp;</td><td>com.jsuereth # scala-arm_2.11 # 2.0.0-M1&nbsp;</td><td></td></tr><tr><td>BSD&nbsp;</td><td><a href="http://github.com/slick/slick/blob/master/LICENSE.txt">Two-clause BSD-style license</a>&nbsp;</td><td>com.typesafe.slick # slick-codegen_2.11 # 3.1.1&nbsp;</td><td></td></tr><tr><td>BSD&nbsp;</td><td><a href="http://github.com/slick/slick/blob/master/LICENSE.txt">Two-clause BSD-style license</a>&nbsp;</td><td>com.typesafe.slick # slick-hikaricp_2.11 # 3.1.1&nbsp;</td><td></td></tr><tr><td>BSD&nbsp;</td><td><a href="http://github.com/slick/slick/blob/master/LICENSE.txt">Two-clause BSD-style license</a>&nbsp;</td><td>com.typesafe.slick # slick_2.11 # 3.1.1&nbsp;</td><td></td></tr><tr><td>CC0&nbsp;</td><td><a href="http://creativecommons.org/publicdomain/zero/1.0/">CC0</a>&nbsp;</td><td>org.reactivestreams # reactive-streams # 1.0.0&nbsp;</td><td></td></tr><tr><td>LGPL&nbsp;</td><td><a href="http://logback.qos.ch/license.html">EPL + GNU Lesser General Public License</a>&nbsp;</td><td>ch.qos.logback # logback-classic # 1.1.3&nbsp;</td><td></td></tr><tr><td>LGPL&nbsp;</td><td><a href="http://logback.qos.ch/license.html">EPL + GNU Lesser General Public License</a>&nbsp;</td><td>ch.qos.logback # logback-core # 1.1.3&nbsp;</td><td></td></tr><tr><td>MIT&nbsp;</td><td><a href="http://www.opensource.org/licenses/mit-license.php">MIT License</a>&nbsp;</td><td>com.github.scopt # scopt_2.11 # 3.3.0&nbsp;</td><td></td></tr><tr><td>MIT&nbsp;</td><td><a href="http://www.slf4j.org/license.html">MIT License</a>&nbsp;</td><td>org.slf4j # slf4j-api # 1.7.12&nbsp;</td><td></td></tr><tr><td>Mozilla&nbsp;</td><td><a href="http://h2database.com/html/license.html">MPL 2.0 or EPL 1.0</a>&nbsp;</td><td>com.h2database # h2 # 1.4.193&nbsp;</td><td></td></tr></tbody></table></body></html>


--------------------------------------------------------------------------------
/gnaf-lucene/README.md:
--------------------------------------------------------------------------------
 1 | # gnaf-lucene
 2 | 
 3 | ## Introduction
 4 | 
 5 | This project produces a library of common code for indexing and searching G-NAF with [Lucene](https://lucene.apache.org/)
 6 | and is used by `gnaf-indexer` and `gnaf-search`.
 7 | 
 8 | ## Search Techniques
 9 | 
10 | ### Indexing
11 | 
12 | The following G-NAF data is formatted into an array of strings (one array element per bullet point):
13 | 
14 | - site name, building name (commas not included)
15 | - unit/flat,
16 | - level,
17 | - street (number ranges are formatted with a minus separator and no space e.g. "2-4 Reed Street South"),
18 | - locality, state abbreviation, postcode;
19 | 
20 | plus:
21 | 
22 | - one array element for each street alias; and
23 | - one array element for each locality alias: locality alias, state abbreviation, postcode
24 | 
25 | These strings are indexed into the same Lucene field using `WhitespaceTokenizer`, `LowerCaseFilter` and `ShingleFilter` producing unigram and bigram tokens.
26 | Bigrams provide a reward for terms appearing in the above order.
27 | A PositionIncrementGap is used to prevent bigrams going across string boundaries so that only ordering within each string is rewarded, not between them.
28 | 
29 | A case where this indexing scheme doesn't work well is a user query for "2 17 SMITH STREET". We understand the 2 represents a unit/flat number, because if it was a level number it would need some text to indicate that. The flat number and street number appear in separate array elements so "2 17" will not produce a bigram match. The "2" will only score as a unigram match to any "2" e.g. possibly a level or street number. In the case that an address has a flat number and a street number but no level, a flat number/street number bigram is added to the index specifically to handle queries of this form.
30 | 
31 | A search for a street address with no flat specified should score a match to the street address with no flat higher than one with a spurious match to a flat. More generally it is desirable to add a slight boost (less than the score increment for a correct match) to results with missing data for: site/building, flat, level, and street number. This is facilitated by adding a MISSING_DATA_TOKEN to the field F_MISSING_DATA for each missing data element from this list.
32 | 
33 | ### Searching
34 | 
35 | Query tokenization and filtering is as discussed above (under Indexing).
36 | Bigram term matches are boosted by a factor of 3 to reward correct ordering.
37 | MISSING_DATA_TOKEN is added to the query boosted by 0.05 to slightly boost results for each missing data element.
38 | 
39 | ### Scoring
40 | 
41 | Analysis of results using `gnaf-test` has shown that Lucene's default scoring based on language models doesn't work well with address data.
42 | 
43 | `AddressSimilarity` is used to override the default scoring:
44 | 
45 | - length norm is disabled so that multiple aliases are not penalized
46 | - term frequency is disabled so that a matching street and locality name isn't unduly rewarded
47 | - document frequency is disabled so that common street names are not penalized
48 | 
49 | `MissingDataSimilarity` overrides the scoring for the field F_MISSING_DATA:
50 | 
51 | - length norm is disabled so that multiple tokens are not penalized
52 | - term frequency is enabled so that multiple tokens score more
53 | - document frequency is disabled (it's a constant as we only have one unique token)
54 | 
55 | #### Suggested preprocessing for client applications
56 | 
57 | People often use "2 / 12 BLAH STREET" for "UNIT 2 12 BLAH STREET" (which corresponds the indexed format).
58 | Bigrams will provide a high score for "2 12 BLAH" but not for "2 / 12 BLAH", so "/" in the input should be replaced with a space.
59 | Similarly any commas in the input should also be replaced with a space.
60 | 
61 | The only useful non-alphanumeric characters are '-' as a number range separator and some non-alphanumeric characters that may appear
62 | in names such as "-" and "'".
63 | 


--------------------------------------------------------------------------------
/LICENSE.txt:
--------------------------------------------------------------------------------
 1 | CSIRO Open Source Software License Agreement (variation of the BSD / MIT License)
 2 | Copyright (c) 2016, Commonwealth Scientific and Industrial Research Organisation (CSIRO) ABN 41 687 119 230.
 3 | All rights reserved. CSIRO is willing to grant you a license to this G_NAF Search on the following terms, except where otherwise indicated for third party material.
 4 | Redistribution and use of this software in source and binary forms, with or without modification, are permitted provided that the following conditions are met:
 5 | * Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer.
 6 | * Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution.
 7 | * Neither the name of CSIRO nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission of CSIRO.
 8 | EXCEPT AS EXPRESSLY STATED IN THIS AGREEMENT AND TO THE FULL EXTENT PERMITTED BY APPLICABLE LAW, THE SOFTWARE IS PROVIDED "AS-IS". CSIRO MAKES NO REPRESENTATIONS, WARRANTIES OR CONDITIONS OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO ANY REPRESENTATIONS, WARRANTIES OR CONDITIONS REGARDING THE CONTENTS OR ACCURACY OF THE SOFTWARE, OR OF TITLE, MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NON-INFRINGEMENT, THE ABSENCE OF LATENT OR OTHER DEFECTS, OR THE PRESENCE OR ABSENCE OF ERRORS, WHETHER OR NOT DISCOVERABLE.
 9 | TO THE FULL EXTENT PERMITTED BY APPLICABLE LAW, IN NO EVENT SHALL CSIRO BE LIABLE ON ANY LEGAL THEORY (INCLUDING, WITHOUT LIMITATION, IN AN ACTION FOR BREACH OF CONTRACT, NEGLIGENCE OR OTHERWISE) FOR ANY CLAIM, LOSS, DAMAGES OR OTHER LIABILITY HOWSOEVER INCURRED.  WITHOUT LIMITING THE SCOPE OF THE PREVIOUS SENTENCE THE EXCLUSION OF LIABILITY SHALL INCLUDE: LOSS OF PRODUCTION OR OPERATION TIME, LOSS, DAMAGE OR CORRUPTION OF DATA OR RECORDS; OR LOSS OF ANTICIPATED SAVINGS, OPPORTUNITY, REVENUE, PROFIT OR GOODWILL, OR OTHER ECONOMIC LOSS; OR ANY SPECIAL, INCIDENTAL, INDIRECT, CONSEQUENTIAL, PUNITIVE OR EXEMPLARY DAMAGES, ARISING OUT OF OR IN CONNECTION WITH THIS AGREEMENT, ACCESS OF THE SOFTWARE OR ANY OTHER DEALINGS WITH THE SOFTWARE, EVEN IF CSIRO HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH CLAIM, LOSS, DAMAGES OR OTHER LIABILITY.
10 | APPLICABLE LEGISLATION SUCH AS THE AUSTRALIAN CONSUMER LAW MAY APPLY REPRESENTATIONS, WARRANTIES, OR CONDITIONS, OR IMPOSES OBLIGATIONS OR LIABILITY ON CSIRO THAT CANNOT BE EXCLUDED, RESTRICTED OR MODIFIED TO THE FULL EXTENT SET OUT IN THE EXPRESS TERMS OF THIS CLAUSE ABOVE "CONSUMER GUARANTEES".  TO THE EXTENT THAT SUCH CONSUMER GUARANTEES CONTINUE TO APPLY, THEN TO THE FULL EXTENT PERMITTED BY THE APPLICABLE LEGISLATION, THE LIABILITY OF CSIRO UNDER THE RELEVANT CONSUMER GUARANTEE IS LIMITED (WHERE PERMITTED AT CSIRO'S OPTION) TO ONE OF FOLLOWING REMEDIES OR SUBSTANTIALLY EQUIVALENT REMEDIES:
11 | (a)               THE REPLACEMENT OF THE SOFTWARE, THE SUPPLY OF EQUIVALENT SOFTWARE, OR SUPPLYING RELEVANT SERVICES AGAIN;
12 | (b)               THE REPAIR OF THE SOFTWARE;
13 | (c)               THE PAYMENT OF THE COST OF REPLACING THE SOFTWARE, OF ACQUIRING EQUIVALENT SOFTWARE, HAVING THE RELEVANT SERVICES SUPPLIED AGAIN, OR HAVING THE SOFTWARE REPAIRED.
14 | IN THIS CLAUSE, CSIRO INCLUDES ANY THIRD PARTY AUTHOR OR OWNER OF ANY PART OF THE SOFTWARE OR MATERIAL DISTRIBUTED WITH IT.  CSIRO MAY ENFORCE ANY RIGHTS ON BEHALF OF THE RELEVANT THIRD PARTY.
15 | Third Party Components
16 | The following third party components are distributed with the Software.  You agree to comply with the license terms for these components as part of accessing the Software.  Other third party software may also be identified in separate files distributed with the Software.
17 | ___________________________________________________________________
18 | 
19 | Please refer to the file: 3rd-party-licenses.html
20 | ___________________________________________________________________
21 | 
22 | 
23 | 


--------------------------------------------------------------------------------
/gnaf-test/3rd-party-licenses.html:
--------------------------------------------------------------------------------
1 | <html><head><title>gnaf-test-licenses</title></head><body><h1>gnaf-test-licenses</h1><table border="0" cellspacing="0" cellpading="1">
2 |       <thead><tr><th>Category</th><th>License</th><th>Dependency</th><th>Notes</th></tr></thead>
3 |     <tbody><tr><td>Apache&nbsp;</td><td><a href="http://www.apache.org/licenses/LICENSE-2.0.txt">Apache 2</a>&nbsp;</td><td>io.spray # spray-json_2.11 # 1.3.2&nbsp;</td><td></td></tr><tr><td>Apache&nbsp;</td><td><a href="http://www.apache.org/licenses/LICENSE-2.0.html">Apache 2.0 License</a>&nbsp;</td><td>com.typesafe.scala-logging # scala-logging_2.11 # 3.1.0&nbsp;</td><td></td></tr><tr><td>Apache&nbsp;</td><td><a href="http://www.apache.org/licenses/LICENSE-2.0">Apache License, Version 2.0</a>&nbsp;</td><td>com.typesafe # config # 1.2.1&nbsp;</td><td></td></tr><tr><td>Apache&nbsp;</td><td><a href="http://www.apache.org/licenses/LICENSE-2.0.txt">The Apache Software License, Version 2.0</a>&nbsp;</td><td>com.zaxxer # HikariCP-java6 # 2.3.7&nbsp;</td><td></td></tr><tr><td>Apache&nbsp;</td><td><a href="http://www.apache.org/licenses/LICENSE-2.0.txt">The Apache Software License, Version 2.0</a>&nbsp;</td><td>org.javassist # javassist # 3.19.0-GA&nbsp;</td><td></td></tr><tr><td>Apache&nbsp;</td><td><a href="http://www.apache.org/licenses/LICENSE-2.0">the Apache License, ASL Version 2.0</a>&nbsp;</td><td>org.scalactic # scalactic_2.11 # 3.0.0&nbsp;</td><td></td></tr><tr><td>Apache&nbsp;</td><td><a href="http://www.apache.org/licenses/LICENSE-2.0">the Apache License, ASL Version 2.0</a>&nbsp;</td><td>org.scalatest # scalatest_2.11 # 3.0.0&nbsp;</td><td></td></tr><tr><td>BSD&nbsp;</td><td><a href="https://github.com/data61/gnaf/blob/master/LICENSE.txt">BSD</a>&nbsp;</td><td>au.csiro.data61.gnaf # gnaf-db_2.11 # 0.8-SNAPSHOT&nbsp;</td><td></td></tr><tr><td>BSD&nbsp;</td><td><a href="https://github.com/data61/gnaf/blob/master/LICENSE.txt">BSD</a>&nbsp;</td><td>au.csiro.data61.gnaf # gnaf-util_2.11 # 0.8-SNAPSHOT&nbsp;</td><td></td></tr><tr><td>BSD&nbsp;</td><td><a href="http://www.scala-lang.org/license.html">BSD 3-Clause</a>&nbsp;</td><td>org.scala-lang # scala-library # 2.11.8&nbsp;</td><td></td></tr><tr><td>BSD&nbsp;</td><td><a href="http://www.scala-lang.org/license.html">BSD 3-Clause</a>&nbsp;</td><td>org.scala-lang # scala-reflect # 2.11.8&nbsp;</td><td></td></tr><tr><td>BSD&nbsp;</td><td><a href="http://opensource.org/licenses/BSD-3-Clause">BSD 3-clause</a>&nbsp;</td><td>org.scala-lang.modules # scala-parser-combinators_2.11 # 1.0.4&nbsp;</td><td></td></tr><tr><td>BSD&nbsp;</td><td><a href="http://opensource.org/licenses/BSD-3-Clause">BSD 3-clause</a>&nbsp;</td><td>org.scala-lang.modules # scala-xml_2.11 # 1.0.5&nbsp;</td><td></td></tr><tr><td>BSD&nbsp;</td><td><a href="http://www.opensource.org/licenses/bsd-license.php">BSD-Style</a>&nbsp;</td><td>com.jsuereth # scala-arm_2.11 # 2.0.0-M1&nbsp;</td><td></td></tr><tr><td>BSD&nbsp;</td><td><a href="http://github.com/slick/slick/blob/master/LICENSE.txt">Two-clause BSD-style license</a>&nbsp;</td><td>com.typesafe.slick # slick-codegen_2.11 # 3.1.1&nbsp;</td><td></td></tr><tr><td>BSD&nbsp;</td><td><a href="http://github.com/slick/slick/blob/master/LICENSE.txt">Two-clause BSD-style license</a>&nbsp;</td><td>com.typesafe.slick # slick-hikaricp_2.11 # 3.1.1&nbsp;</td><td></td></tr><tr><td>BSD&nbsp;</td><td><a href="http://github.com/slick/slick/blob/master/LICENSE.txt">Two-clause BSD-style license</a>&nbsp;</td><td>com.typesafe.slick # slick_2.11 # 3.1.1&nbsp;</td><td></td></tr><tr><td>CC0&nbsp;</td><td><a href="http://creativecommons.org/publicdomain/zero/1.0/">CC0</a>&nbsp;</td><td>org.reactivestreams # reactive-streams # 1.0.0&nbsp;</td><td></td></tr><tr><td>LGPL&nbsp;</td><td><a href="http://logback.qos.ch/license.html">EPL + GNU Lesser General Public License</a>&nbsp;</td><td>ch.qos.logback # logback-classic # 1.1.3&nbsp;</td><td></td></tr><tr><td>LGPL&nbsp;</td><td><a href="http://logback.qos.ch/license.html">EPL + GNU Lesser General Public License</a>&nbsp;</td><td>ch.qos.logback # logback-core # 1.1.3&nbsp;</td><td></td></tr><tr><td>MIT&nbsp;</td><td><a href="http://www.opensource.org/licenses/mit-license.php">MIT License</a>&nbsp;</td><td>com.github.scopt # scopt_2.11 # 3.3.0&nbsp;</td><td></td></tr><tr><td>MIT&nbsp;</td><td><a href="http://www.slf4j.org/license.html">MIT License</a>&nbsp;</td><td>org.slf4j # slf4j-api # 1.7.12&nbsp;</td><td></td></tr><tr><td>Mozilla&nbsp;</td><td><a href="http://h2database.com/html/license.html">MPL 2.0 or EPL 1.0</a>&nbsp;</td><td>com.h2database # h2 # 1.4.193&nbsp;</td><td></td></tr></tbody></table></body></html>


--------------------------------------------------------------------------------
/gnaf-test/src/main/script/searchLucene.js:
--------------------------------------------------------------------------------
  1 | var request = require('request');
  2 | var fs = require('fs');
  3 | var maps = require('./Maps.js');
  4 | 
  5 | Array.prototype.flatMap = function(f) {
  6 |   return this.map(f).flatten();
  7 | }
  8 | Array.prototype.flatten = function() {
  9 |   return Array.prototype.concat.apply([], this);
 10 | }
 11 | 
 12 | /**
 13 |  * Usage: node src/main/node/searchLucene.js input.json
 14 |  * Input: one address per line. Performs bulk lookup using gnaf-lucene-service.
 15 |  * TODO: add proper command line option handling, add options to set numHits and bulk
 16 |  */
 17 | var url = process.argv[2]; // 'http://localhost:9040/bulkSearch'
 18 | var path = process.argv[3]; // 0 -> node; 1 -> src/main/script/search.js; 2 -> url; 3 -> input.json
 19 | var numHits = 10;
 20 | 
 21 | var addr = JSON.parse(fs.readFileSync(path, "utf8"));
 22 | // console.log('addr', addr);
 23 | 
 24 | var bulk = Math.floor(50/3);
 25 | var batches = [];
 26 | for (i = 0; i < addr.length; i += bulk) batches.push(addr.slice(i, Math.min(i + bulk, addr.length)));
 27 | // console.log('bulk', bulk, 'batch sizes', batches.map(b => b.length));
 28 | 
 29 | /** 
 30 |  * return array[i] = index j where hits[i].hits[j].d61AddressNoAlias contains qBatch[i].tc.address
 31 |  * exception: if j > 0 and hits[i].hits[j].score == hits[i].hits[0].score (i.e. hit is first equal score) then return array[i] = 0
 32 |  */
 33 | var findHitIndices = (qBatch, hits) => qBatch.map((q, i) => {
 34 |   var h = hits[i];
 35 |   var j = h.hits.findIndex(h => h.d61AddressNoAlias.indexOf(q.tc.address) != -1)
 36 |   return j > 0 && aboutEqual(h.hits[j].score, h.hits[0].score) ? 0 : j; 
 37 |   // h.hits[j].score == h.hits[0].score instead of aboutEqual appears to work just as well here
 38 | });
 39 | 
 40 | var aboutEqual = (a, b) => Math.abs(a - b) < Math.max(a, b) * 1.e-6;
 41 | 
 42 | 
 43 | /**
 44 |  * each input test case contains 3 different queries
 45 |  * @param tc a test case
 46 |  */
 47 | var queries = tc => [ 
 48 |   {tc: tc, qstr: tc.query, desc: ''},
 49 |   {tc: tc, qstr: tc.queryPostcodeBeforeState, desc: 'PostcodeBeforeState'}, 
 50 |   {tc: tc, qstr: tc.queryTypo, desc:'Typo'}
 51 | ];
 52 | 
 53 | var bulkQueryParam = (addresses, maxEdits) => ({addresses: addresses, numHits: numHits, fuzzy: { minLength: 5, maxEdits: maxEdits, prefixLength: 2} });
 54 | 
 55 | var done = (histMap, errMap) => console.log(JSON.stringify({ histogram: histMap.object(), errors: errMap.object() }));
 56 | 
 57 | /**
 58 |  * Process a batch and on completion recursively do the next.
 59 |  * @param iter provides next batch
 60 |  * @param histMap test description -> histogram
 61 |  *       where histogram is (index of correct hit (0 in best case) -> occurrence count for this index)
 62 |  * @param errMap test description -> index of correct hit -> array of addresses with this index
 63 |  */
 64 | function doBatch(iter, histMap, errMap) {
 65 |   var x = iter.next();
 66 |   if (x.done) done(histMap, errMap);
 67 |   else {    
 68 |     var batch = x.value;
 69 |     
 70 |     // array of batch.length * 3: { tc: tc, qstr: query address string, desc: description }
 71 |     var qBatch = batch.flatMap(queries);
 72 |     // console.log('batch.length', batch.length, 'qBatch.length', qBatch.length); // , 'qBatch', qBatch);
 73 |     var qAddr = qBatch.map(x => x.qstr); // array of query addresses from qBatch
 74 |     
 75 |     function responseHandler(qp, hits) {
 76 |       var idxs = findHitIndices(qBatch, hits);
 77 |       // console.log('idxs', idxs);
 78 |       // histogram(histMap, idxs);
 79 |       // console.log('histMap', histMap);
 80 |       idxs.forEach((v, i) => {
 81 |         var q = qBatch[i];
 82 |         var desc = (qp.fuzzy.maxEdits == 0 ? 'nofuz' : 'fuz') + q.desc;
 83 |         histMap.inc(desc, v);
 84 |         if (v != 0) errMap.get(desc).append(v, q.qstr);
 85 |       });
 86 |       if (qp.fuzzy.maxEdits == 0) {
 87 |         // on completing response for maxEdits == 0, do request with maxEdits == 2
 88 |         doRequest(bulkQueryParam(qAddr, 2));
 89 |       } else {
 90 |         // on completing response for maxEdits == 2 recurse to do next batch 
 91 |         doBatch(iter, histMap, errMap);
 92 |       }
 93 |     };
 94 |     
 95 |     function doRequest(qp) {
 96 |       request.post( { url: url, json: true, body: qp }, (error, response, hits) => {
 97 |         if (error) console.log('error', error);
 98 |         else responseHandler(qp, hits);
 99 |       });
100 |     };
101 |     
102 |     // do request with qp.fuzzyMaxEdits == 0
103 |     doRequest(bulkQueryParam(qAddr, 0));
104 |   };
105 | }
106 | 
107 | 
108 | doBatch(batches[Symbol.iterator](), new maps.MapHist(), new maps.MapMapCont(maps.ctorMapArr));
109 | 
110 | 
111 | 


--------------------------------------------------------------------------------
/gnaf-contrib/src/main/scala/au/csiro/data61/gnaf/contrib/db/ContribTables.scala:
--------------------------------------------------------------------------------
 1 | package au.csiro.data61.gnaf.contrib.db
 2 | // AUTO-GENERATED Slick data model
 3 | 
 4 | /** Slick data model trait for extension, choice of backend or usage in the cake pattern. (Make sure to initialize this late.) */
 5 | trait ContribTables {
 6 |   val profile: slick.driver.JdbcProfile
 7 |   import profile.api._
 8 |   import slick.model.ForeignKeyAction
 9 |   // NOTE: GetResult mappers for plain SQL are only generated for tables where Slick knows how to map the types of all columns.
10 |   import slick.jdbc.{GetResult => GR}
11 | 
12 |   /** DDL for all tables. Call .create to execute. */
13 |   lazy val schema: profile.SchemaDescription = AddressSiteGeocode.schema
14 |   @deprecated("Use .schema instead of .ddl", "3.0")
15 |   def ddl = schema
16 | 
17 |   /** Entity class storing rows of table AddressSiteGeocode
18 |    *  @param id Database column ID SqlType(BIGINT), AutoInc, PrimaryKey
19 |    *  @param contribStatus Database column CONTRIB_STATUS SqlType(VARCHAR), Length(15,true)
20 |    *  @param addressSiteGeocodePid Database column ADDRESS_SITE_GEOCODE_PID SqlType(VARCHAR), Length(15,true)
21 |    *  @param dateCreated Database column DATE_CREATED SqlType(DATE)
22 |    *  @param version Database column VERSION SqlType(INTEGER)
23 |    *  @param addressSitePid Database column ADDRESS_SITE_PID SqlType(VARCHAR), Length(15,true)
24 |    *  @param geocodeTypeCode Database column GEOCODE_TYPE_CODE SqlType(VARCHAR), Length(4,true)
25 |    *  @param longitude Database column LONGITUDE SqlType(DECIMAL)
26 |    *  @param latitude Database column LATITUDE SqlType(DECIMAL) */
27 |   case class AddressSiteGeocodeRow(id: Option[Long], contribStatus: String, addressSiteGeocodePid: Option[String], dateCreated: java.sql.Date, version: Int, addressSitePid: String, geocodeTypeCode: String, longitude: scala.math.BigDecimal, latitude: scala.math.BigDecimal)
28 |   /** GetResult implicit for fetching AddressSiteGeocodeRow objects using plain SQL queries */
29 |   implicit def GetResultAddressSiteGeocodeRow(implicit e0: GR[Long], e1: GR[String], e2: GR[Option[String]], e3: GR[java.sql.Date], e4: GR[Int], e5: GR[scala.math.BigDecimal]): GR[AddressSiteGeocodeRow] = GR{
30 |     prs => import prs._
31 |     AddressSiteGeocodeRow.tupled((<<[Option[Long]], <<[String], <<?[String], <<[java.sql.Date], <<[Int], <<[String], <<[String], <<[scala.math.BigDecimal], <<[scala.math.BigDecimal]))
32 |   }
33 |   /** Table description of table ADDRESS_SITE_GEOCODE. Objects of this class serve as prototypes for rows in queries. */
34 |   class AddressSiteGeocode(_tableTag: Tag) extends Table[AddressSiteGeocodeRow](_tableTag, "ADDRESS_SITE_GEOCODE") {
35 |     def * = (id.?, contribStatus, addressSiteGeocodePid, dateCreated, version, addressSitePid, geocodeTypeCode, longitude, latitude) <> (AddressSiteGeocodeRow.tupled, AddressSiteGeocodeRow.unapply)
36 |     /** Maps whole row to an option. Useful for outer joins. */
37 |     def ? = (Rep.Some(id), Rep.Some(contribStatus), addressSiteGeocodePid, Rep.Some(dateCreated), Rep.Some(version), Rep.Some(addressSitePid), Rep.Some(geocodeTypeCode), Rep.Some(longitude), Rep.Some(latitude)).shaped.<>({r=>import r._; _1.map(_=> AddressSiteGeocodeRow.tupled((_1, _2.get, _3, _4.get, _5.get, _6.get, _7.get, _8.get, _9.get)))}, (_:Any) =>  throw new Exception("Inserting into ? projection not supported."))
38 | 
39 |     /** Database column ID SqlType(BIGINT), AutoInc, PrimaryKey */
40 |     val id: Rep[Long] = column[Long]("ID", O.AutoInc, O.PrimaryKey)
41 |     /** Database column CONTRIB_STATUS SqlType(VARCHAR), Length(15,true) */
42 |     val contribStatus: Rep[String] = column[String]("CONTRIB_STATUS", O.Length(15,varying=true))
43 |     /** Database column ADDRESS_SITE_GEOCODE_PID SqlType(VARCHAR), Length(15,true) */
44 |     val addressSiteGeocodePid: Rep[Option[String]] = column[Option[String]]("ADDRESS_SITE_GEOCODE_PID", O.Length(15,varying=true))
45 |     /** Database column DATE_CREATED SqlType(DATE) */
46 |     val dateCreated: Rep[java.sql.Date] = column[java.sql.Date]("DATE_CREATED")
47 |     /** Database column VERSION SqlType(INTEGER) */
48 |     val version: Rep[Int] = column[Int]("VERSION")
49 |     /** Database column ADDRESS_SITE_PID SqlType(VARCHAR), Length(15,true) */
50 |     val addressSitePid: Rep[String] = column[String]("ADDRESS_SITE_PID", O.Length(15,varying=true))
51 |     /** Database column GEOCODE_TYPE_CODE SqlType(VARCHAR), Length(4,true) */
52 |     val geocodeTypeCode: Rep[String] = column[String]("GEOCODE_TYPE_CODE", O.Length(4,varying=true))
53 |     /** Database column LONGITUDE SqlType(DECIMAL) */
54 |     val longitude: Rep[scala.math.BigDecimal] = column[scala.math.BigDecimal]("LONGITUDE")
55 |     /** Database column LATITUDE SqlType(DECIMAL) */
56 |     val latitude: Rep[scala.math.BigDecimal] = column[scala.math.BigDecimal]("LATITUDE")
57 |   }
58 |   /** Collection-like TableQuery object for table AddressSiteGeocode */
59 |   lazy val AddressSiteGeocode = new TableQuery(tag => new AddressSiteGeocode(tag))
60 | }
61 | 


--------------------------------------------------------------------------------
/gnaf-lucene/src/test/scala/au/csiro/data61/gnaf/lucene/GnafLuceneTest.scala:
--------------------------------------------------------------------------------
  1 | package au.csiro.data61.gnaf.lucene
  2 | 
  3 | import org.apache.lucene.document.{ Document, DoublePoint, Field }
  4 | import org.apache.lucene.search.ScoreDoc
  5 | import org.apache.lucene.store.{ Directory, RAMDirectory }
  6 | import org.scalatest.{ Finders, FlatSpec, Matchers }
  7 | 
  8 | import GnafLucene._
  9 | import LuceneUtil.Searcher
 10 | import au.csiro.data61.gnaf.util.Util.getLogger
 11 | import resource.managed
 12 | 
 13 | /**
 14 |  * More a test bed for:<ul>
 15 |  * <li>examining unexpected results
 16 |  * <li>experimenting with search techniques
 17 |  * </ul>
 18 |  * than a conventional unit test.
 19 |  */
 20 | class GnafLuceneTest extends FlatSpec with Matchers {
 21 |   val log = getLogger(getClass)
 22 |   
 23 |   val s = "some test string"
 24 | 
 25 |   "countOccurences" should "count" in {
 26 |     for {
 27 |       (x, n) <- Seq((" ", 2), ("in", 1), (",", 0))
 28 |     } countOccurrences(s, x) should be(n)
 29 |     
 30 |     countOccurrences("", "some") should be(0)
 31 |   }
 32 |   
 33 |   it should "throw AssertionError on empty find string" in {
 34 |     a [AssertionError] should be thrownBy {
 35 |       countOccurrences(s, "")
 36 |     } 
 37 |   }
 38 |   
 39 |   case class Hit(id: Int, score: Float, d61Address: List[String], d61AddressNoAlias: String)
 40 |   def toHit(scoreDoc: ScoreDoc, doc: Document) = {
 41 |     Hit(scoreDoc.doc, scoreDoc.score, doc.getValues(F_ADDRESS).toList, doc.get(F_ADDRESS_NOALIAS))
 42 |   }
 43 |   
 44 |   case class Result(totalHits: Int, elapsedSecs: Float, hits: Seq[Hit], error: Option[String])
 45 |   def toResult(totalHits: Int, elapsedSecs: Float, hits: Seq[Hit], error: Option[String])
 46 |     = Result(totalHits, elapsedSecs, hits, error)
 47 |   
 48 |   def mkSearcher(dir: Directory) = {
 49 |     val s = new Searcher(dir, toHit, toResult)
 50 |     s.searcher.setSimilarity(GnafSimilarity)
 51 |     s
 52 |   }
 53 |   
 54 |   def mkDoc(addr: (Seq[String], Option[String], Int, String, Double, Double)) = {
 55 |     val d = new Document
 56 |     for (a <- addr._1) { 
 57 |       log.debug(s"mkDoc: add: $a")
 58 |       d.add(new Field(F_ADDRESS, a, addressFieldType))
 59 |     }
 60 |     for (a <- addr._2) { 
 61 |       log.debug(s"mkDoc: add: $a")
 62 |       d.add(new Field(F_ADDRESS, a, addressFieldType))
 63 |     }
 64 |     for (i <- 0 until addr._3) d.add(new Field(F_MISSING_DATA, MISSING_DATA_TOKEN, missingDataFieldType))
 65 |     d.add(new Field(F_ADDRESS_NOALIAS, addr._4, storedNotIndexedFieldType))
 66 |     d.add(new DoublePoint(F_LOCATION, addr._5, addr._6))
 67 |     d
 68 |   }
 69 |   
 70 |   "searcher" should "find" in {
 71 | 	  for (dir <- managed(new RAMDirectory)) {
 72 | 		  for (indexer <- managed(mkIndexer(dir))) {
 73 | 			  Seq( //                                                  v noneCount = number of fields with missing data: streetNo, build/site, flat, level
 74 | 				  (Seq("3204 INVERNESS ROAD", "DUMGREE QLD 4715"), None, 3, "3204 INVERNESS ROAD DUMGREE QLD 4715", 0.5d, 10.5d),
 75 | 				  (Seq("INVERNESS ROAD", "DUMGREE QLD 4715"),      None, 4, "INVERNESS ROAD DUMGREE QLD 4715", 0.7d, 11.5d),
 76 | 				  (Seq("FLAT 1", "2400 INVERNESS ROAD", "DUMGREE QLD 4715"), Some("1" + BIGRAM_SEPARATOR + "2400"), 2, "FLAT 1 2400 INVERNESS ROAD DUMGREE QLD 4715", 0d, 10d)
 77 | 			  ).foreach(a => indexer.addDocument(mkDoc(a)))
 78 | 		  } // indexer.close
 79 | 
 80 | 		  for (searcher <- managed(mkSearcher(dir))) {
 81 | 			  // addr: String, numHits: Int, minFuzzyLength: Int, fuzzyMaxEdits: Int, fuzzyPrefixLength: Int
 82 | 		    {
 83 |   			  val q = QueryParam("INVERNESS ROAD DUMGREE QLD 4715", 3, None, None).toQuery
 84 |   			  val r = searcher.search(q, 3)
 85 |   			  log.debug(r.toString)
 86 |   			  for (h <- r.hits) {
 87 |   				  log.debug(h.toString)
 88 |   				  log.debug(searcher.searcher.explain(q, h.id).toString)
 89 |   			  }
 90 |   	  	  r.hits(0).d61AddressNoAlias should be("INVERNESS ROAD DUMGREE QLD 4715")
 91 |   	  	  // Lucene docId is 0, 1, 2 in order that docs are indexed
 92 |   	  	  r.hits.map(_.id) should be(Seq(1, 0, 2)) // in order of decreasing noneCount: 4, 3, 2
 93 | 		    }
 94 | 	  	  
 95 | 		    {
 96 |   	  	  val q = QueryParam("FLAT 1", 0, None, None).toQuery
 97 |   	  	  val r = searcher.search(q, 1)
 98 |   			  log.debug(r.toString)
 99 |   	  	  val h = r.hits(0)
100 |   	  	  log.debug(h.toString)
101 |   	  	  log.debug(searcher.searcher.explain(q, h.id).toString)
102 |   	  	  
103 |   	  	  val q2 = QueryParam("1 2400", 0, None, None).toQuery
104 |   	  	  val r2 = searcher.search(q2, 1)
105 |   			  log.debug(r2.toString)
106 |   	  	  val h2 = r2.hits(0)
107 |   	  	  log.debug(h2.toString)
108 |   	  	  log.debug(searcher.searcher.explain(q2, h2.id).toString)
109 | 		    }
110 | 		    
111 | 		    {
112 | 	  	  val q = DoublePoint.newRangeQuery(F_LOCATION, Array[Double](-0.25, 9.75), Array[Double](0.75, 10.75))
113 | 	  	  val r = searcher.search(q, 3)
114 | 			  log.debug(r.toString)
115 | 			  r.hits.map(_.id).toSet should be(Set(0, 2)) // doc 1 not in box
116 | 		    }
117 | 		  } // searcher.close
118 | 	  } // dir.close
119 |   }
120 | }


--------------------------------------------------------------------------------
/gnaf-extractor/src/main/script/gnafMapping.json:
--------------------------------------------------------------------------------
 1 | {
 2 | 
 3 | "settings": { "index": {
 4 |   "number_of_shards" : "4",
 5 |   "number_of_replicas" : "0",
 6 |   "refresh_interval": "60s",
 7 |   "query.default_field": "d61Address",
 8 |   "analysis": {
 9 |     "analyzer": {
10 |       "d61Whitespace": {
11 |         "tokenizer": "whitespace",
12 |         "filter": [ "lowercase" ]
13 |       },
14 |       "d61Shingle": {
15 |         "tokenizer": "whitespace",
16 |         "filter": [ "lowercase", "filter_shingle" ]
17 |       }
18 |     },
19 |     "filter": {
20 |       "filter_shingle": {
21 |         "type": "shingle",
22 |         "max_shingle_size": 2,
23 |         "min_shingle_size": 2,
24 |         "output_unigrams": "true"
25 |       }
26 |     }
27 |   }
28 | } },
29 | 
30 | "mappings": { "gnaf": { 
31 |   "_all": { "enabled": false },
32 |   "properties": {
33 |     // comments not normally allowed in JSON, but this doesn't appear to break Elasticsearch    
34 |     "d61AddressNoAlias": { "type": "string", "index": "no" },
35 |     "addressDetailPid": { "type": "string", "index": "not_analyzed" },
36 |     "addressSiteName": { "type": "string", "analyzer": "d61Whitespace", "null_value": "D61_NULL" },
37 |     "buildingName": { "type": "string", "analyzer": "d61Whitespace", "null_value": "D61_NULL" },
38 |     
39 |     "flatTypeCode": { "type": "string", "index": "not_analyzed", "null_value": "D61_NULL" },
40 |     "flatTypeName": { "type": "string", "analyzer": "d61Whitespace", "null_value": "D61_NULL" },
41 |     "flat": { "type": "object", "properties": {
42 |       "prefix": { "type": "string", "index": "not_analyzed", "null_value": "D61_NULL" },
43 |       "number": { "type": "integer", "null_value": -1 },
44 |       "suffix": { "type": "string", "index": "not_analyzed", "null_value": "D61_NULL" }
45 |     } },
46 |     
47 |     "levelTypeCode": { "type": "string", "index": "not_analyzed", "null_value": "D61_NULL" },
48 |     "levelTypeName": { "type": "string", "analyzer": "d61Whitespace", "null_value": "D61_NULL" }, // UPPER GROUND FLOOR
49 |     "level": { "type": "object", "properties": {
50 |       "prefix": { "type": "string", "index": "not_analyzed", "null_value": "D61_NULL" },
51 |       "number": { "type": "integer", "null_value": -1 },
52 |       "suffix": { "type": "string", "index": "not_analyzed", "null_value": "D61_NULL" }
53 |     } },
54 |     
55 |     "numberFirst": { "type": "object", "properties": {
56 |       "prefix": { "type": "string", "index": "not_analyzed", "null_value": "D61_NULL" },
57 |       "number": { "type": "integer", "null_value": -1 },
58 |       "suffix": { "type": "string", "index": "not_analyzed", "null_value": "D61_NULL" }
59 |     } },
60 |     
61 |     "numberLast": { "type": "object", "properties": {
62 |       "prefix": { "type": "string", "index": "not_analyzed" },
63 |       "number": { "type": "integer", "null_value": -1 },
64 |       "suffix": { "type": "string", "index": "not_analyzed" }
65 |     } },
66 |     
67 |     "street": { "type": "object", "properties": {
68 |       "name": { "type": "string", "analyzer": "d61Whitespace", "null_value": "D61_NULL" },
69 |       "typeCode": { "type": "string", "analyzer": "d61Whitespace", "null_value": "D61_NULL" }, // reversed from other lookup tables!
70 |       "typeName": { "type": "string", "index": "not_analyzed", "null_value": "D61_NULL" },  // code is long, name is short abbreviation
71 |       "suffixCode": {"type": "string", "index": "not_analyzed", "null_value": "D61_NULL" },
72 |       "suffixName": { "type": "string", "analyzer": "d61Whitespace", "null_value": "D61_NULL" }
73 |     } },
74 |     
75 |     "localityName": { "type": "string", "analyzer": "d61Whitespace", "null_value": "D61_NULL" },
76 |     "stateAbbreviation": { "type": "string", "index": "not_analyzed", "null_value": "D61_NULL" },
77 |     "stateName": { "type": "string", "analyzer": "d61Whitespace", "null_value": "D61_NULL" },
78 |     "postcode": { "type": "string", "index": "not_analyzed", "null_value": "D61_NULL" }, // string in GNAF, not a bad choice as not used as a number
79 |     
80 |     "aliasPrincipal": { "type": "string", "index": "not_analyzed", "null_value": "0" },   // TODO: in H2 & Scala this is Option[Char] so "0" used rather than "D61_NULL"
81 |     "primarySecondary": { "type": "string", "index": "not_analyzed", "null_value": "0" }, // however if we convert the null here we could use the latter
82 |     
83 |     "location": { "type": "geo_point" },
84 |     
85 |     "streetVariant": { "type": "nested", "properties": {
86 |       "name": { "type": "string", "analyzer": "d61Whitespace", "null_value": "D61_NULL" },
87 |       "typeCode": { "type": "string", "index": "not_analyzed", "null_value": "D61_NULL" },
88 |       "typeName": { "type": "string", "analyzer": "d61Whitespace", "null_value": "D61_NULL" },
89 |       "suffixCode": {"type": "string", "index": "not_analyzed", "null_value": "D61_NULL" },
90 |       "suffixName": { "type": "string", "analyzer": "d61Whitespace", "null_value": "D61_NULL" }
91 |     } },
92 |     
93 |     "localityVariant": { "type": "nested", "properties": {
94 |       "localityName": { "type": "string", "analyzer": "d61Whitespace", "null_value": "D61_NULL" }
95 |     } }
96 |   } }
97 | }
98 | 
99 | }


--------------------------------------------------------------------------------
/src/main/script/run.sh:
--------------------------------------------------------------------------------
  1 | #! /bin/bash
  2 | # script to run the whole thing
  3 | # executable documentation
  4 | # you might not want to run all this each time
  5 | 
  6 | set -ex
  7 | version=`sed 's/.*"\(.*\)"/\1/' version.sbt`
  8 | scalaVersion=2.11
  9 | 
 10 | # optional recovery from gnaf-extractor connection timeout after successful population of database
 11 | if [[ "$1" != "skip" ]]; then
 12 | 
 13 | # === Delete/Create database ===
 14 | 
 15 | if [[ -f ~/gnaf.mv.db ]]; then
 16 |   rm -f ~/gnaf-old.mv.db
 17 |   mv ~/gnaf{,-old}.mv.db
 18 | fi
 19 | # rm -rf gnaf-db/data/unzipped
 20 | 
 21 | # create SQL load script
 22 | ( cd gnaf-db; src/main/script/createGnafDb.sh;)
 23 | 
 24 | if [[ $? -eq 5 ]]; then echo "no new data found, cancelling build"; exit 0; fi
 25 | 
 26 | # build scala projects
 27 | # 1. above gnaf-db/src/main/script/createGnafDb.sh creates gnaf-db/target/generated/version.json
 28 | # 2. this version.json file is included in the gnaf-search jar by gnaf-search/build.sbt (so we need to build after running the above script)
 29 | # 3. h2 (used below) is downloaded by the build if necessary, so we need build before running h2
 30 | sbt one-jar
 31 | 
 32 | # run h2 with postgres protocol, remembering its PID
 33 | # get h2 version
 34 | h2ver=$( sed --quiet --regexp-extended '/com.h2database/s/.*"h2"[^"]*"([^"]*)".*/\1/p' gnaf-db/build.sbt )
 35 | echo $h2ver
 36 | 
 37 | java -Xmx3G -jar ~/.ivy2/cache/com.h2database/h2/jars/h2-${h2ver}.jar -web -pg &
 38 | H2_PID=$!
 39 | sleep 10
 40 | 
 41 | # set psql gnaf password to gnaf
 42 | [[ -r ~/.pgpass ]] && grep -q gnaf ~/.pgpass || {
 43 |   echo "localhost:5435:~/gnaf:gnaf:gnaf" >> ~/.pgpass
 44 |   chmod 600 ~/.pgpass
 45 | }
 46 | 
 47 | # run load script using Postgres client, takes about 90 minutes with a SSD
 48 | # see gnaf-db/README.md for an alternative method using the h2 client
 49 | psql --host=localhost --port=5435 --username=gnaf --dbname=~/gnaf < gnaf-db/data/createGnafDb.sql
 50 | 
 51 | # attempt to avoid gnaf-extractor failing below with: java.sql.SQLTimeoutException: Timeout after 10000ms of waiting for a connection
 52 | sleep 10
 53 | 
 54 | # stop h2
 55 | kill $H2_PID
 56 | wait
 57 | 
 58 | fi
 59 | 
 60 | # === Extract JSON address data and load into Lucene ===
 61 | 
 62 | # takes about 23 min
 63 | time java -Xmx3G -jar gnaf-extractor/target/scala-${scalaVersion}/gnaf-extractor_${scalaVersion}-${version}-one-jar.jar | gzip > addresses.gz
 64 | 
 65 | # takes about 13 min
 66 | time zcat addresses.gz | java -jar gnaf-indexer/target/scala-${scalaVersion}/gnaf-indexer_${scalaVersion}-${version}-one-jar.jar
 67 | 
 68 | #
 69 | #
 70 | ## === demo gnaf-search and gnaf-test ===
 71 | #
 72 | #java -jar gnaf-search/target/scala-${scalaVersion}/gnaf-search_${scalaVersion}-${version}-one-jar.jar &
 73 | #SEARCH_PID=$!
 74 | #sleep 15 # we could wait for it to log a message
 75 | #
 76 | #echo "gnaf-search: swagger.json ..."
 77 | #curl http://localhost:9040/api-docs/swagger.json
 78 | #curl -X POST --header 'Content-Type: application/json' --header 'Accept: application/json' -d '{
 79 | #  "addr": "137-~45 CHEVALLUM SCHOOL ROAD PALMWOODS QLD 4555",
 80 | #  "numHits": 3,
 81 | #  "fuzzy": {
 82 | #    "maxEdits": 2,
 83 | #    "minLength": 5,
 84 | #    "prefixLength": 2
 85 | #  }
 86 | #}' 'http://localhost:9040/search'
 87 | #echo
 88 | #
 89 | ## takes about 12 min
 90 | ## gnaf-search must be running
 91 | ## gnaf-db-service must not be running (both use the gnaf database in embedded mode, to run at the same time they would need
 92 | ## to use different databases or not use embedded mode).
 93 | #echo "gnaf-test ..."
 94 | #cd gnaf-test
 95 | #npm install
 96 | #time src/main/script/run.sh
 97 | #cd ..
 98 | #
 99 | ## === demo gnaf-db-service ===
100 | #
101 | #java -jar gnaf-db-service/target/scala-${scalaVersion}/gnaf-db-service_${scalaVersion}-${version}-one-jar.jar &
102 | #DB_PID=$!
103 | #sleep 15
104 | #
105 | #echo "gnaf-db-service: swagger.json ..."
106 | #curl http://localhost:9000/api-docs/swagger.json
107 | #echo "get geocode types and descriptions ..."
108 | #curl 'http://localhost:9000/gnaf/geocodeType'
109 | #echo "get type of address e.g. RURAL, often missing, for an addressDetailPid ..."
110 | #curl 'http://localhost:9000/gnaf/addressType/GANSW716635201'
111 | #echo "get all geocodes for an addressDetailPid, almost always 1, sometimes 2, never more ..."
112 | #curl 'http://localhost:9000/gnaf/addressGeocode/GASA_414912543'
113 | #echo
114 | #
115 | #
116 | ## === demo gnaf-contrib ===
117 | #
118 | #java -jar gnaf-contrib/target/scala-${scalaVersion}/gnaf-contrib_${scalaVersion}-${version}-one-jar.jar &
119 | #CONTRIB_PID=$!
120 | #sleep 15
121 | #
122 | #echo "gnaf-contrib: swagger.json ..."
123 | #curl http://localhost:9010/api-docs/swagger.json
124 | #echo "add contributed geocode for an addressSite ..."
125 | #curl -XPOST 'http://localhost:9010/contrib/' -H 'Content-Type:application/json' -d '{
126 | #  "contribStatus":"Submitted","addressSitePid":"712279621","geocodeTypeCode":"EM",
127 | #  "longitude":149.1213974,"latitude":-35.280994199999995,"dateCreated":0,"version":0
128 | #}'
129 | #echo "list contributed geocodes for an addressSite ..."
130 | #curl 'http://localhost:9010/contrib/712279621'
131 | ## there are also delete and update methods
132 | #
133 | ## === Stop JSON web services ===
134 | #
135 | #kill $SEARCH_PID
136 | #kill $DB_PID
137 | #kill $CONTRIB_PID
138 | #wait
139 | 


--------------------------------------------------------------------------------
/gnaf-db/src/main/script/createGnafDb.sh:
--------------------------------------------------------------------------------
  1 | #! /bin/bash
  2 | # script to download and unpack GNAF and write a SQL script to load it.
  3 | set -ex
  4 | 
  5 | baseDir=$PWD
  6 | scriptDir=$baseDir/src/main/script
  7 | dataDir=$baseDir/data
  8 | mkdir -p $dataDir
  9 | 
 10 | # JSON URL from near top-right of: http://www.data.gov.au/dataset/geocoded-national-address-file-g-naf
 11 | jsonUrl=http://www.data.gov.au/api/3/action/package_show?id=19432f89-dc3a-4ef3-b943-5326ef1dbecc
 12 | # get data URL for current version from JSON
 13 | curl -sL $jsonUrl > meta.json
 14 | dataUrl=$( jq -r '.result.resources[] | select(.format == "ZIP") | .url' meta.json )
 15 | last_modified=$( jq -r '.result.resources[] | select(.format == "ZIP") | .last_modified' meta.json )
 16 | 
 17 | # download ZIP data file unless already done
 18 | zip=$dataDir/${dataUrl##*/}
 19 | [[ -f "$zip" ]] || ( cd $dataDir; wget "$dataUrl" )
 20 | 
 21 | unzipped=$dataDir/unzipped
 22 | # get dir path where the zip file's */Extras/ will be extracted (contains release month so releases don't clobber each other)
 23 | # get path from zip, discard leading info up to time and following spaces, keep the rest apart from the trailing /
 24 | # maybe a bit too brittle?
 25 | gnafExtras="$unzipped/$( unzip -l "$zip" '*/Extras/' | sed -rn '/Extras/s~^.*[0-9][0-9]:[0-9][0-9] *(.*)/$~\1~p' )"
 26 | # unzip unless $gnafExtras already exists 
 27 | [[ -d "$gnafExtras" ]] || ( mkdir -p $unzipped; cd $unzipped; unzip $zip )
 28 | # get dir path parent of Standard/
 29 | gnafData="$unzipped/$( unzip -l "$zip" '*/Standard/' | sed -rn '/Standard/s~^.*[0-9][0-9]:[0-9][0-9] *(.*)/Standard/$~\1~p' )"
 30 | 
 31 | mkdir -p target/generated
 32 | cat > target/generated/version.json <<EoF
 33 | {
 34 |   "git-commit": "$( git rev-parse HEAD )",
 35 |   "sbt-version": "$( sed --regexp-extended 's/.*:=\s*"([^"]+)"/\1/' ../version.sbt )",
 36 |   "gnaf-version": "$last_modified"
 37 | }
 38 | EoF
 39 | 
 40 | # echo ${gnafData##*/G-NAF } -- old method for determining version
 41 | 
 42 | # Load GNAF into a relational database following https://www.psma.com.au/sites/default/files/g-naf_-_getting_started_guide.pdf
 43 | 
 44 | {
 45 | 
 46 | # issue message during SQL script execution
 47 | progress() {
 48 |   echo
 49 |   echo "SELECT '$1' AS Progress, CURRENT_TIME() AS Time;"
 50 |   echo
 51 | }
 52 | 
 53 | 
 54 | progress "modified: $gnafExtras/GNAF_TableCreation_Scripts/create_tables_ansi.sql"
 55 | 
 56 | sed -e 's/DROP TABLE/DROP TABLE IF EXISTS/' -e 's/numeric([0-9])/integer/' "$gnafExtras/GNAF_TableCreation_Scripts/create_tables_ansi.sql"
 57 | 
 58 | progress "load Authority Code ..."
 59 | while read tbl
 60 | do
 61 |   echo "INSERT INTO ${tbl} SELECT * FROM CSVREAD('$gnafData/Authority Code/Authority_Code_${tbl}_psv.psv', null, 'fieldSeparator=|');"
 62 | done <<-'EoF'
 63 | ADDRESS_ALIAS_TYPE_AUT
 64 | ADDRESS_TYPE_AUT
 65 | FLAT_TYPE_AUT
 66 | GEOCODE_RELIABILITY_AUT
 67 | GEOCODE_TYPE_AUT
 68 | GEOCODED_LEVEL_TYPE_AUT
 69 | LEVEL_TYPE_AUT
 70 | LOCALITY_ALIAS_TYPE_AUT
 71 | LOCALITY_CLASS_AUT
 72 | MB_MATCH_CODE_AUT
 73 | PS_JOIN_TYPE_AUT
 74 | STREET_CLASS_AUT
 75 | STREET_TYPE_AUT
 76 | STREET_LOCALITY_ALIAS_TYPE_AUT
 77 | STREET_SUFFIX_AUT
 78 | EoF
 79 | # table names pasted from g-naf_-_getting_started_guide.pdf referenced above
 80 | 
 81 | progress "load Standard ..."
 82 | while read tbl
 83 | do
 84 |   progress "load ${tbl} ..."
 85 |   # A-Z mess matches 2 and 3 char state abreviations (note * would try to load {state}_STREET_LOCALITY_psv.psv into LOCALITY)
 86 |   ls -1 "${gnafData}"/Standard/{[A-Z][A-Z],[A-Z][A-Z][A-Z]}_${tbl}_psv.psv | while read f
 87 |   do
 88 |     echo "INSERT INTO ${tbl} SELECT * FROM CSVREAD('$f', null, 'fieldSeparator=|');"
 89 |   done
 90 | done <<-'EoF'
 91 | ADDRESS_ALIAS
 92 | ADDRESS_DEFAULT_GEOCODE
 93 | ADDRESS_DETAIL
 94 | ADDRESS_MESH_BLOCK_2011
 95 | ADDRESS_SITE_GEOCODE
 96 | ADDRESS_SITE
 97 | LOCALITY
 98 | LOCALITY_ALIAS
 99 | LOCALITY_NEIGHBOUR
100 | LOCALITY_POINT
101 | MB_2011
102 | PRIMARY_SECONDARY
103 | STATE
104 | STREET_LOCALITY
105 | STREET_LOCALITY_ALIAS
106 | STREET_LOCALITY_POINT
107 | EoF
108 | # table names pasted from g-naf_-_getting_started_guide.pdf referenced above
109 | 
110 | progress "add constraints ..."
111 | sed --regexp-extended --file=$scriptDir/constraint.sed "$gnafExtras/GNAF_TableCreation_Scripts/add_fk_constraints.sql"
112 | 
113 | progress "add an index on STREET_NAME (this is not part of the getting_started_guide)..."
114 | echo "create index STREET_LOCALITY_NAME_IDX on STREET_LOCALITY (STREET_NAME);"
115 | 
116 | # progress "add view (suggested in getting_started_guide) ..."
117 | # commented out as not useful/too slow
118 | # cat "$gnafExtras/GNAF_View_Scripts/address_view.sql"
119 | # echo ";"
120 | 
121 | progress "Create READONLY user ..."
122 | cat <<-'EoF'
123 | CREATE USER READONLY PASSWORD 'READONLY';
124 | GRANT SELECT ON SCHEMA PUBLIC TO READONLY;
125 | EoF
126 | 
127 | } | sed 's/REM/--/' > $dataDir/createGnafDb.sql
128 | 
129 | cat <<-'EoF'
130 | 
131 | Start H2 database engine with: java -jar h2*.jar -web -pg
132 | Create an empty database by connecting to a new dburl e.g. jdbc:h2:file:~/gnaf (specify 'gnaf' as the username and password).
133 | In the SQL input area enter: RUNSCRIPT FROM 'data/createGnafDb.sql'
134 | or paste in the content of this file (to get progress feedback lacking with RUNSCRIPT).
135 | After an hour (with SSD) you should have a GNAF database.
136 | EoF
137 | 
138 | 
139 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | # gnaf
 2 | 
 3 | ## Introduction
 4 | This project:
 5 | 
 6 | - loads the [G-NAF data set](http://www.data.gov.au/dataset/geocoded-national-address-file-g-naf) into a relational database and search engine;
 7 | - provides JSON web services to access the database and search engine; and
 8 | - provides a demonstration web user interface using the web services.
 9 | 
10 | Users of `gnaf-search` should note the [suggested preprocessing](gnaf-lucene/README.md#suggested-preprocessing-for-client-applications) for
11 | query strings.
12 | 
13 | ## Project Structure
14 | These sub-directories contain sub-projects:
15 | 
16 | 1. gnaf-util: common code
17 | 2. gnaf-db: scripts to load the [G-NAF data set](http://www.data.gov.au/dataset/geocoded-national-address-file-g-naf) into a relational database
18 | and [Slick](http://slick.typesafe.com/) "Functional Relational Mapping" bindings for the database.
19 | The README.md discusses the H2 database and G-NAF data.
20 | 3. gnaf-extractor: queries the database to produce JSON address data
21 | 4. gnaf-lucene: common code for indexing and searching G-NAF with [Lucene](https://lucene.apache.org/).
22 | The README.md discusses the search techniques used.
23 | 5. gnaf-indexer: loads JSON address data into a [Lucene](https://lucene.apache.org/) index
24 | 6. gnaf-search: JSON web service to search the [Lucene](https://lucene.apache.org/) index
25 | 7. gnaf-test: queries the database to produce test address data with many variations, scripts to perform bulk lookups of the test data and evaluate results
26 | 8. gnaf-db-service: JSON web service providing access to the G-NAF database
27 | 9. gnaf-contrib: a JSON web service providing access to the gnafContrib database of user supplied geocodes
28 | 10. gnaf-ui: static files providing a demonstration web user interface using gnaf-search, gnad-db-service and gnaf-contrib.
29 | 
30 | Nature of Sub-projects:
31 | 
32 | - 1, 2 & 4 produce a jar file of library code used by other sub-projects
33 | - 3, 5 & 7 produce command line programs packaged as a [onejar](https://github.com/sbt/sbt-onejar).
34 | This is a jar file containing all dependencies and run simply with: `java -jar {filename.jar}`
35 | - 6, 8 & 9 produce JSON web services also packaged as a [onejar](https://github.com/sbt/sbt-onejar).
36 | These are run as above (not in a servlet container). They produce [Swagger](http://swagger.io/) API documentation at `/api-docs/swagger.json`.
37 | 
38 | The top level directory provides:
39 | - the [sbt](http://www.scala-sbt.org/) build for the [Scala](http://scala-lang.org/) code in projects 1-9 (no build is required for 10); and
40 | - [src/main/script/run.sh](src/main/script/run.sh) to run everything, but first:
41 |   - take a look as its intended as executable documentation and you may not wish to run it all each time
42 |   - install tools
43 | 
44 | ## Install Tools
45 | 
46 | To run the Scala code install:
47 | - a JRE e.g. from openjdk-8 (version 8 or higher is required by some dependencies);
48 | - the build tool [sbt](http://www.scala-sbt.org/).
49 | 
50 | To develop [Scala](http://scala-lang.org/) code install:
51 | - the above items (you may prefer to install the full JDK instead of just the JRE but I think the JRE is sufficient);
52 | - the [Scala IDE](http://scala-ide.org/download/current.html).
53 | 
54 | ### Dependencies
55 | 
56 | - scripts assume a *nix environment
57 | - [gnaf-db/src/main/script/createGnafDb.sh](gnaf-db/src/main/script/createGnafDb.sh) requires [jq](https://stedolan.github.io/jq/)
58 | - [src/main/script/run.sh](src/main/script/run.sh) requires:
59 |   - `jq` (because it runs `createGnafDb.sh`)
60 |   - the Postgres client `psql` to load the database (see [gnaf-db](gnaf-db) for an alternative method using the h2 client);
61 |   - `node` and `npm` to run [gnaf-test](gnaf-test) (see its README).
62 | - the `/version` endpoint provided by `gnaf-search` reports the software and data version, but relies on a file created by `createGnafDb.sh`
63 |   being available when gnaf-search is built. `run.sh` does things in the right order for this to work.
64 | 
65 | ## Running and Usage
66 | 
67 | See [src/main/script/run.sh](src/main/script/run.sh).
68 | 
69 | ## Build
70 | 
71 | Automatic builds are available at: https://t3as-jenkins.it.csiro.au/ (only within the CSIRO network).
72 | 
73 | The command:
74 | 
75 |     sbt clean test one-jar dumpLicenseReport
76 | 
77 | from the project's top level directory cleans out previous build products, runs unit tests,
78 | builds one-jar files and creates license reports on dependencies.
79 | 
80 | ## Develop With Eclipse
81 | 
82 | The command:
83 | 
84 |     sbt update-classifiers eclipse
85 | 
86 | uses the [sbteclipse](https://github.com/typesafehub/sbteclipse/wiki/Using-sbteclipse) plugin to create the .project and .classpath files required by Eclipse (with source attachments for dependencies).
87 | 
88 | ## Software License
89 | 
90 | This software is released under the CSIRO BSD license - see `Licence.txt`.
91 | Each of the sub-projects lists its dependencies and their licenses in `3rd-party-licenses.html`.
92 | 
93 | ## Data License
94 | 
95 | Incorporates or developed using G-NAF ©PSMA Australia Limited licensed by the Commonwealth of Australia under the
96 | [Open Geo-coded National Address File (G-NAF) End User Licence Agreement](http://data.gov.au/dataset/19432f89-dc3a-4ef3-b943-5326ef1dbecc/resource/09f74802-08b1-4214-a6ea-3591b2753d30/download/20160226---EULA---Open-G-NAF.pdf).
97 | 
98 | 


--------------------------------------------------------------------------------
/gnaf-test/src/main/script/searchEs.js:
--------------------------------------------------------------------------------
  1 | var request = require('request');
  2 | var fs = require('fs');
  3 | var maps = require('./Maps.js');
  4 | 
  5 | Array.prototype.flatMap = function(f) {
  6 |   return this.map(f).flatten();
  7 | }
  8 | Array.prototype.flatten = function() {
  9 |   return Array.prototype.concat.apply([], this);
 10 | }
 11 | 
 12 | /**
 13 |  * Usage: node src/main/node/searchEs.js input.json
 14 |  * Input: one address per line. Performs bulk lookup using Elasticsearch index created by gnaf-indexer.
 15 |  * TODO: add proper command line option handling, add options to set numHits and bulk
 16 |  */
 17 | var url = process.argv[2]; // 'http://localhost:9200/gnaf/_msearch'
 18 | var path = process.argv[3]; // 0 -> node; 1 -> src/main/script/search.js; 2 -> url; 3 -> input.json
 19 | var numHits = 10;
 20 | 
 21 | var addr = JSON.parse(fs.readFileSync(path, "utf8"));
 22 | // console.log('addr', addr);
 23 | 
 24 | var bulk = 10;
 25 | var batches = [];
 26 | for (i = 0; i < addr.length; i += bulk) batches.push(addr.slice(i, Math.min(i + bulk, addr.length)));
 27 | // console.log('batches', batches);
 28 | 
 29 | /** return array[i] = index j where esHits.responses[i].hits.hits[j].fields.d61AddressNoAlias[0] contains qBatch[i].tc.address */
 30 | var findHitIndices = (qBatch, esHits) => qBatch.map( (q, i) =>
 31 |   esHits.responses[i].hits.hits.findIndex(h => h.fields.d61AddressNoAlias[0].indexOf(q.tc.address) != -1)
 32 | );
 33 | 
 34 | /**
 35 |  * @return non-fuzzy elasticsearch query
 36 |  * 
 37 |  * @param qstr a query address string
 38 |  * 
 39 |  * If we don't specify "fields" we get _source.d61AddressNoAlias as a String
 40 |  * however if we do specify "fields" _source is omitted and we get fields.d61AddressNoAlias as an array of Strings (with just 1 element).
 41 |  */
 42 | var esNoFuz = qstr =>
 43 | ({
 44 |   query:{ match:{ d61Address: qstr }},
 45 |   fields:[ "d61AddressNoAlias" ],
 46 |   size:numHits
 47 | });
 48 | 
 49 | /**
 50 |  * @return fuzzy elasticsearch query
 51 |  * 
 52 |  * @param qstr a query address string
 53 |  */
 54 | var esFuz = qstr =>
 55 | ({
 56 |   query:{ match:{ d61Address:{ query: qstr, fuzziness: 2, prefix_length: 2 }}},
 57 |   // rescore:{ query:{ rescore_query:{ match:{ d61Address:{ query: qstr }}}, query_weight: 0 }}, why did I think this was a good idea???
 58 |   fields: [ "d61AddressNoAlias" ],
 59 |   size: numHits
 60 | });
 61 | 
 62 | /** @return array elements for a non-fuzzy and a fuzzy search */
 63 | var mkEs = (tc, qstr, desc) => 
 64 | [
 65 |   { tc: tc, qstr: qstr, qes: esNoFuz(qstr), desc: 'nofuz' + desc },
 66 |   { tc: tc, qstr: qstr, qes: esFuz(qstr),   desc: 'fuz' + desc }
 67 | ];
 68 | 
 69 | /**
 70 |  * 6 combinations of queries: 3 different queries with and without fuzzy search
 71 |  * @return array of { tc: tc, qstr: query address string, qes: elasticsearch query, desc: description }
 72 |  * @param tc a test case
 73 |  */
 74 | var queries = tc => [
 75 |   mkEs(tc, tc.query, ''), 
 76 |   mkEs(tc, tc.queryPostcodeBeforeState, 'PostcodeBeforeState'), 
 77 |   mkEs(tc, tc.queryTypo, 'Typo')
 78 | ].flatten();
 79 | 
 80 | 
 81 | // comparitor to sort by score then shortest d61AddressNoAlias first
 82 | var scoreThenLength = (a, b) =>
 83 |   b._score != a._score ? b._score - a._score
 84 |                        : a.fields.d61AddressNoAlias[0].length - b.fields.d61AddressNoAlias[0].length;
 85 | 
 86 | // sort each esHits.responses[i].hits.hits according to comparitor cmp
 87 | var sortHits = (esHits, cmp) => {
 88 |   esHits.responses.forEach(r => r.hits.hits.sort(cmp));
 89 |   return esHits;
 90 | };
 91 | 
 92 | var done = (histMap, errMap) => console.log(JSON.stringify({ histogram: histMap.object(), errors: errMap.object() }));
 93 | 
 94 | /**
 95 |  * Process a batch and on completion recursively do the next.
 96 |  * @param iter provides next batch
 97 |  * @param histMap test description -> histogram
 98 |  *       where histogram is (index of correct hit (0 in best case) -> occurrence count for this index)
 99 |  * @param errMap test description -> index of correct hit -> array of addresses with this index
100 |  */
101 | function doBatch(iter, histMap, errMap) {
102 |   var x = iter.next();
103 |   if (x.done) done(histMap, errMap);
104 |   else {    
105 |     var batch = x.value;
106 |     
107 |     // array of batch.length * 6:
108 |     //   { tc: tc, qstr: query address string, qes: elasticsearch query, desc: description }
109 |     var qBatch = batch.flatMap(queries);
110 |     // console.log('qBatch', qBatch);
111 |     
112 |     var esBulk = qBatch.flatMap(q => [ '{}', JSON.stringify(q.qes) ]).join('\n') + '\n';
113 |     // console.log('esBulk', esBulk);
114 |   
115 |     request.post( { url: url, body: esBulk }, (error, response, body) => {
116 |       if (error) console.log('error', error)
117 |       else {
118 |         // console.log('statusCode', response.statusCode, 'body', body);
119 |         var esHits = sortHits(JSON.parse(body), scoreThenLength);
120 |         // console.log('esHits', JSON.stringify(esHits));
121 |         var idxs = findHitIndices(qBatch, esHits);
122 |         // console.log('idxs', idxs);
123 |         // histogram(histMap, idxs);
124 |         // console.log('histMap', histMap);
125 |         idxs.forEach((v, i) => {
126 |           var q = qBatch[i];
127 |           histMap.inc(q.desc, v);
128 |           if (v != 0) errMap.get(q.desc).append(v, q.qstr);
129 |         });
130 |         doBatch(iter, histMap, errMap);
131 |       }
132 |     });
133 |   };
134 | }
135 | 
136 | 
137 | doBatch(batches[Symbol.iterator](), new maps.MapHist(), new maps.MapMapCont(maps.ctorMapArr));
138 | 
139 | 
140 | 


--------------------------------------------------------------------------------
/template.yaml:
--------------------------------------------------------------------------------
  1 | AWSTemplateFormatVersion: '2010-09-09'
  2 | Description: 'This stack hosts the gnaf-search service in a docker container'
  3 | Parameters:
  4 |   sslcertificate:
  5 |     Type: 'String'
  6 |     Description: 'The ARN for the SSL certificate to use on the load balancer to handle https traffic. See Amazon Certificate Manager to find this'
  7 | Resources:
  8 |   gnafelbsg:
  9 |     Type: 'AWS::EC2::SecurityGroup'
 10 |     Properties:
 11 |       GroupDescription: 'Allows Load Balancer Ingress on 80 from public'
 12 |       SecurityGroupIngress:
 13 |         -
 14 |           IpProtocol: 'tcp'
 15 |           FromPort: '80'
 16 |           ToPort: '80'
 17 |           CidrIp: '0.0.0.0/0'
 18 |         -
 19 |           IpProtocol: 'tcp'
 20 |           FromPort: '443'
 21 |           ToPort: '443'
 22 |           CidrIp: '0.0.0.0/0'
 23 |   gnafec2sg:
 24 |     Type: 'AWS::EC2::SecurityGroup'
 25 |     Properties:
 26 |       GroupDescription: 'Allows access to Instances on port 80 from Load Balancer'
 27 |       SecurityGroupIngress:
 28 |         -
 29 |           IpProtocol: 'tcp'
 30 |           FromPort: '80'
 31 |           ToPort: '80'
 32 |           SourceSecurityGroupId: !GetAtt gnafelbsg.GroupId
 33 |   gnafelb: # No LoadbalancerName specified to allow for potential replacements if Cloud Formation changes are needed.
 34 |     Type: 'AWS::ElasticLoadBalancing::LoadBalancer'
 35 |     Properties:
 36 |       AvailabilityZones:
 37 |         Fn::GetAZs: 'ap-southeast-2'
 38 |       CrossZone: True
 39 |       AccessLoggingPolicy:
 40 |         Enabled: True
 41 |         S3BucketName: 'gnaf-logs'
 42 |       HealthCheck:
 43 |           HealthyThreshold: '5'
 44 |           Interval: '10'
 45 |           Target: 'HTTP:80/v2/api-docs/swagger.json'
 46 |           Timeout: '9'
 47 |           UnhealthyThreshold: '5'
 48 |       Listeners:
 49 |         -
 50 |           InstancePort: '80'
 51 |           InstanceProtocol: 'HTTP'
 52 |           LoadBalancerPort: '80'
 53 |           Protocol: 'HTTP'
 54 |         -
 55 |           InstancePort: '80'
 56 |           InstanceProtocol: 'HTTP'
 57 |           LoadBalancerPort: '443'
 58 |           Protocol: 'HTTPS'
 59 |           SSLCertificateId: !Ref sslcertificate
 60 |       SecurityGroups:
 61 |         - !GetAtt gnafelbsg.GroupId
 62 |   gnaflc:
 63 |     Type: 'AWS::AutoScaling::LaunchConfiguration'
 64 |     Properties:
 65 |       ImageId: 'ami-09332079312dc6085'
 66 |       InstanceType: 't2.medium'
 67 |       SecurityGroups:
 68 |         - !GetAtt gnafec2sg.GroupId
 69 |       KeyName: 'natmap-peter'
 70 |       IamInstanceProfile: !GetAtt gnafecraccessinstanceprofile.Arn #'arn:aws:iam::933940466036:instance-profile/gnafECRPullAccess'
 71 |       UserData: !Base64 |
 72 |         #cloud-config
 73 |         apt_upgrade: True
 74 |         apt_reboot_if_required: True
 75 |         packages:
 76 |           - nginx
 77 |         write_files:
 78 |           - path: /etc/nginx/sites-available/default
 79 |             content: |
 80 |               server {
 81 |                 listen 80 default_server;
 82 |                 location / {
 83 |                   rewrite ^/$ https://github.com/data61/gnaf/ redirect;
 84 |                   }
 85 |                 location /v2/ {
 86 |                   proxy_pass http://localhost:8080/;
 87 |                   add_header Access-Control-Allow-Headers 'Content-Type';
 88 |                   add_header Access-Control-Allow-Origin '*';
 89 |                   }
 90 |                 }
 91 |         runcmd:
 92 |           - "curl -fsSL get.docker.com | bash"
 93 |           - "curl --silent --show-error --retry 5 https://bootstrap.pypa.io/get-pip.py | python3"
 94 |           - "pip install awscli"
 95 |           - "$(aws ecr get-login --no-include-email --region ap-southeast-2)"
 96 |           - "nginx -s reload"
 97 |           - "docker run -p 8080:9040 --restart=always 933940466036.dkr.ecr.ap-southeast-2.amazonaws.com/gnaf:latest"
 98 |   gnafasg:
 99 |     UpdatePolicy:
100 |         AutoScalingRollingUpdate:
101 |           MinInstancesInService: '1'
102 |           MaxBatchSize: '1'
103 |           PauseTime: 'PT2M30S'
104 |     Type: "AWS::AutoScaling::AutoScalingGroup"
105 |     Properties:
106 |       AvailabilityZones:
107 |         Fn::GetAZs: 'ap-southeast-2'
108 |       Cooldown: '300'
109 |       DesiredCapacity: '2'
110 |       HealthCheckGracePeriod: '300'
111 |       HealthCheckType: 'ELB'
112 |       LaunchConfigurationName: !Ref gnaflc
113 |       LoadBalancerNames:
114 |         - !Ref gnafelb
115 |       MaxSize: 2
116 |       MinSize: 1
117 |       Tags:
118 |         - Key: 'Name'
119 |           Value: 'gnaf-search-instance'
120 |           PropagateAtLaunch: True
121 |   gnafecraccessinstanceprofile:
122 |     Type: "AWS::IAM::InstanceProfile"
123 |     Properties:
124 |       Roles:
125 |         - !Ref gnafecraccessrole
126 |       InstanceProfileName:
127 |         Fn::Join:
128 |           - ''
129 |           - - !Ref 'AWS::StackName'
130 |             - '-GNAF-ECR-Access-instanceprofile'
131 |   gnafecraccessrole:
132 |     Type: "AWS::IAM::Role"
133 |     Properties:
134 |       AssumeRolePolicyDocument:
135 |         Version: "2012-10-17"
136 |         Statement:
137 |           -
138 |             Effect: "Allow"
139 |             Principal:
140 |               Service: "ec2.amazonaws.com"
141 |             Action: "sts:AssumeRole"
142 |       Policies:
143 |         -
144 |           PolicyDocument: {
145 |             "Version": "2012-10-17",
146 |             "Statement": [
147 |               {
148 |                 "Sid": "ecraccessforgnafec2instances",
149 |                 "Effect": "Allow",
150 |                 "Action": [
151 |                   "ecr:BatchCheckLayerAvailability",
152 |                   "ecr:BatchGetImage",
153 |                   "ecr:GetAuthorizationToken",
154 |                   "ecr:GetDownloadUrlForLayer"
155 |                 ],
156 |                 "Resource": "*"
157 |               }
158 |             ]
159 |           }
160 |           PolicyName:
161 |             Fn::Join:
162 |             - ''
163 |             - - !Ref 'AWS::StackName'
164 |               - '-GNAF-ECR-access-policy'
165 |       ManagedPolicyArns:
166 |         - "arn:aws:iam::aws:policy/AmazonSSMManagedInstanceCore"
167 | 


--------------------------------------------------------------------------------
/gnaf-db-service/src/main/scala/au/csiro/data61/gnaf/db/service/DbService.scala:
--------------------------------------------------------------------------------
  1 | package au.csiro.data61.gnaf.db.service
  2 | 
  3 | import scala.concurrent.{ ExecutionContextExecutor, Future }
  4 | import scala.math.BigDecimal
  5 | 
  6 | import com.github.swagger.akka.{ HasActorSystem, SwaggerHttpService }
  7 | import com.github.swagger.akka.model.Info
  8 | import com.typesafe.config.{ Config, ConfigFactory }
  9 | 
 10 | import akka.actor.ActorSystem
 11 | import akka.event.{ Logging, LoggingAdapter }
 12 | import akka.http.scaladsl.Http
 13 | import akka.http.scaladsl.marshallers.sprayjson.SprayJsonSupport.sprayJsonMarshaller
 14 | import akka.http.scaladsl.server.Directives.{ Segment, _enhanceRouteWithConcatenation, _segmentStringToPathMatcher, complete, get, logRequestResult, path, pathPrefix }
 15 | import akka.stream.{ ActorMaterializer, Materializer }
 16 | import au.csiro.data61.gnaf.db.GnafTables
 17 | import au.csiro.data61.gnaf.util.Util
 18 | import ch.megard.akka.http.cors.CorsDirectives.cors
 19 | import io.swagger.annotations.{ Api, ApiOperation }
 20 | import io.swagger.models.Swagger
 21 | import javax.ws.rs.{ Path, PathParam }
 22 | import spray.json.DefaultJsonProtocol
 23 | 
 24 | // for latitude: BigDecimal swagger type is number, but for Option[BigDecimal] swagger type is complex internal representation of scala.math.BigDecimal, so we avoid using Option here
 25 | case class Geocode(geocodeTypeCode: Option[String], geocodeTypeDescription: Option[String], reliabilityCode: Option[Int], isDefault: Boolean, latitude: BigDecimal, longitude: BigDecimal)
 26 | 
 27 | case class AddressType(addressSitePid: String, addressType: Option[String])
 28 | case class AddressTypeOpt(addressType: Option[AddressType])
 29 | 
 30 | case class GeocodeType(code: String, description: String)
 31 | case class GeocodeTypes(types: Seq[GeocodeType])
 32 | 
 33 | trait Protocols extends DefaultJsonProtocol {
 34 |   implicit val geocodeFormat = jsonFormat6(Geocode.apply)
 35 |   
 36 |   implicit val addressTypeFormat = jsonFormat2(AddressType.apply)
 37 |   implicit val addressTypeOptFormat = jsonFormat1(AddressTypeOpt.apply)
 38 |   
 39 |   implicit val geocodeTypeFormat = jsonFormat2(GeocodeType.apply)
 40 |   implicit val geocodeTypesFormat = jsonFormat1(GeocodeTypes.apply)
 41 | }
 42 | 
 43 | @Api(value = "gnaf", produces = "application/json")
 44 | @Path("gnaf")
 45 | class DbService(logger: LoggingAdapter, config: Config)(implicit system: ActorSystem, executor: ExecutionContextExecutor, materializer: Materializer) extends Protocols {
 46 | 
 47 |   object MyGnafTables extends {
 48 |     val profile = Util.getObject[slick.driver.JdbcProfile](config.getString("gnafDb.slickDriver")) // e.g. slick.driver.{H2Driver,PostgresDriver}
 49 |   } with GnafTables
 50 |   val gnafTables = MyGnafTables
 51 |   import gnafTables._
 52 |   import gnafTables.profile.api._
 53 |   
 54 |   implicit val db = Database.forConfig("gnafDb", config)
 55 |     
 56 |   // map code -> description
 57 |   lazy val geocodeTypesFuture: Future[Map[String, String]] = db.run(GeocodeTypeAut.result).map(_.map(t => t.code -> t.description.getOrElse(t.code)).toMap)
 58 |   
 59 |   @Path("geocodeType")
 60 |   @ApiOperation(value = "List geocode types", nickname = "geocodeType", httpMethod = "GET", response = classOf[GeocodeType], responseContainer = "List")
 61 |   def geocodeType = complete {
 62 |     geocodeTypesFuture.map { x =>
 63 |       GeocodeTypes(x.toSeq.map(GeocodeType.tupled))
 64 |     }
 65 |   }
 66 |   
 67 |   // left join because some addressDetailPid have no AddressSiteGeocode
 68 |   val qGeocodes = {
 69 |     def q(addressDetailPid: Rep[String]) = for {
 70 |       (ad, sg) <- AddressDetail joinLeft AddressSiteGeocode on (_.addressSitePid === _.addressSitePid) if ad.addressDetailPid === addressDetailPid
 71 |       dg <- AddressDefaultGeocode if dg.addressDetailPid === addressDetailPid
 72 |     } yield (dg, sg)
 73 |     Compiled(q _)
 74 |   }
 75 |   
 76 |   @Path("addressGeocode/{addressDetailPid}")
 77 |   @ApiOperation(value = "List geocodes for an addressSitePid", nickname = "addressGeocode", httpMethod = "GET", response = classOf[Geocode], responseContainer = "List")
 78 |   def addressGeocode(@PathParam("addressDetailPid") addressDetailPid: String) = {
 79 |     val f = for {
 80 |       typ <- geocodeTypesFuture
 81 |       seq <- db.run(qGeocodes(addressDetailPid).result)
 82 |     } yield seq.map { case (dg, sg) =>
 83 |       // should either have 1 (dg, None) or 1 or more (dg, Some(addressSiteGeocode)), the latitude & longitude values should not be None
 84 |       sg.map { x => Geocode(x.geocodeTypeCode, x.geocodeTypeCode.map(typ), Some(x.reliabilityCode), Some(dg.geocodeTypeCode) == x.geocodeTypeCode && dg.latitude == x.latitude && dg.longitude == x.longitude, x.latitude.getOrElse(0), x.longitude.getOrElse(0)) }
 85 |         .getOrElse(Geocode(Some(dg.geocodeTypeCode), Some(typ(dg.geocodeTypeCode)), None, true, dg.latitude.getOrElse(0), dg.longitude.getOrElse(0))) // handle the (dg, None) no AddressSiteGeocode case
 86 |     }.sortBy(!_.isDefault)
 87 |     
 88 |     complete { f }
 89 |   }
 90 |     
 91 |   lazy val addressTypesFuture = db.run(AddressTypeAut.result).map(_.map(t => t.code -> t.description.getOrElse(t.code)).toMap)
 92 |     
 93 |   val qAddressSite = {
 94 |     def q(addressDetailPid: Rep[String]) = for {
 95 |       ad <- AddressDetail if ad.addressDetailPid === addressDetailPid
 96 |       as <- AddressSite if as.addressSitePid === ad.addressSitePid
 97 |     } yield as
 98 |     Compiled(q _)
 99 |   }
100 |   
101 |   @Path("addressType/{addressDetailPid}")
102 |   @ApiOperation(value = "AddressType for an addressSitePid", nickname = "addressType", httpMethod = "GET", response = classOf[AddressTypeOpt])
103 |   def addressType(@PathParam("addressDetailPid") addressDetailPid: String) = {
104 |     val f = for {
105 |       typ <- addressTypesFuture
106 |       asOpt <- db.run(qAddressSite(addressDetailPid).result.headOption)
107 |     } yield AddressTypeOpt(asOpt.map(as => AddressType(as.addressSitePid, as.addressType.map(typ))))
108 |     
109 |     complete { f }
110 |   }
111 | 
112 |   val routes = pathPrefix("gnaf") {
113 |       pathPrefix("geocodeType") {
114 |         get { geocodeType }
115 |       } ~
116 |       pathPrefix("addressGeocode") {
117 |         (get & path(Segment)) { addressGeocode }
118 |       } ~
119 |       pathPrefix("addressType") {
120 |         (get & path(Segment)) { addressType }
121 |       }
122 |   }
123 | }
124 | 
125 | object DbService {
126 |   implicit val sys = ActorSystem()
127 |   implicit val exec = sys.dispatcher
128 |   implicit val mat = ActorMaterializer()
129 |   
130 |   val logger = Logging(sys, getClass)
131 |   val config = ConfigFactory.load
132 |   val interface = config.getString("http.interface")
133 |   val port = config.getInt("http.port")
134 |   
135 |   val service = new DbService(logger, config)
136 |   
137 |   // /api-docs/swagger.json
138 |   val swagger = new SwaggerHttpService with HasActorSystem {
139 |     import scala.reflect.runtime.{ universe => ru }
140 | 
141 |     override implicit val actorSystem = sys
142 |     override implicit val materializer = mat
143 |     override val apiTypes = Seq(ru.typeOf[DbService])
144 |     override def swaggerConfig = new Swagger().basePath(prependSlashIfNecessary(basePath)) // don't specify protocol://host
145 |   }
146 | 
147 |   def main(args: Array[String]): Unit = {
148 |     val routes = cors() {
149 |       logRequestResult("DbService") { service.routes } ~ 
150 |       logRequestResult("Swagger") { swagger.routes }
151 |     }
152 |     Http().bindAndHandle(routes, interface, port)
153 |   }
154 | }
155 | 


--------------------------------------------------------------------------------
/gnaf-lucene/src/main/scala/au/csiro/data61/gnaf/lucene/GnafLucene.scala:
--------------------------------------------------------------------------------
  1 | package au.csiro.data61.gnaf.lucene
  2 | 
  3 | import org.apache.lucene.analysis.Analyzer
  4 | import org.apache.lucene.analysis.Analyzer.TokenStreamComponents
  5 | import org.apache.lucene.analysis.LowerCaseFilter
  6 | import org.apache.lucene.analysis.core.WhitespaceTokenizer
  7 | import org.apache.lucene.analysis.shingle.ShingleFilter
  8 | import org.apache.lucene.document.{ DoublePoint, FieldType }
  9 | import org.apache.lucene.index.{ FieldInvertState, IndexOptions, IndexWriter, IndexWriterConfig, Term }
 10 | import org.apache.lucene.search.{ BooleanClause, BooleanQuery, BoostQuery, FuzzyQuery, Query, TermQuery }
 11 | import org.apache.lucene.search.similarities.ClassicSimilarity
 12 | import org.apache.lucene.store.Directory
 13 | 
 14 | import LuceneUtil.tokenIter
 15 | import au.csiro.data61.gnaf.util.Util.getLogger
 16 | import org.apache.lucene.search.MatchAllDocsQuery
 17 | import org.apache.lucene.search.similarities.PerFieldSimilarityWrapper
 18 | 
 19 | /**
 20 |  * GNAF specific field names, analyzers and scoring for Lucene.
 21 |  */
 22 | object GnafLucene {
 23 |   val log = getLogger(getClass)
 24 |   
 25 |   /** GNAF Lucene field names */
 26 |   val F_JSON = "json"
 27 |   val F_LOCATION = "location"
 28 |   val F_ADDRESS = "address"
 29 |   val F_ADDRESS_NOALIAS = "addressNoAlias"
 30 |   val F_MISSING_DATA = "noData"
 31 |   
 32 |   val MISSING_DATA_TOKEN = "N" // store this token in F_MISSING_DATA once for each missing: site/building, flat, level, streetNum
 33 |   
 34 |   val BIGRAM_SEPARATOR = "~"
 35 |     
 36 |   /** count occurrences of x in s, x must be non-empty */
 37 |   def countOccurrences(s: String, x: String) = {
 38 |     assert(x.nonEmpty)
 39 |     var n = 0
 40 |     var i = 0
 41 |     while (i < s.length - x.length) {
 42 |       i = s.indexOf(x, i)
 43 |       if (i == -1) i = s.length
 44 |       else {
 45 |         n += 1
 46 |         i += x.length
 47 |       }
 48 |     }
 49 |     n
 50 |   }
 51 |   
 52 |   /** get n-gram size n */
 53 |   def shingleSize(s: String) = countOccurrences(s, BIGRAM_SEPARATOR) + 1
 54 |   
 55 |   /**
 56 |    * gnaf-test shows tf-idf doesn't work well with addresses
 57 |    * For F_DADDRESS disable tf, idf and length norm,
 58 |    * but for F_MISSING_DATA keep tf to favour multiple MISSING_DATA_TOKENs.
 59 |    */
 60 |   class MissingDataSimilarity extends ClassicSimilarity {
 61 |     // default tf - boost repeated MISSING_DATA_TOKEN tokens
 62 |     override def lengthNorm(state: FieldInvertState) = state.getBoost // no length norm, don't penalize multiple MISSING_DATA_TOKENs or multiple aliases
 63 |     override def idf(docFreq: Long, docCount: Long): Float = 1.0f // don't penalize MISSING_DATA_TOKEN or SMITH STREET for being common
 64 |   }
 65 |   class AddressSimilarity extends MissingDataSimilarity {
 66 |     override def tf(freq: Float): Float = 1.0f // don't boost street and locality name being the same
 67 |   }
 68 |   val classicSimilarity = new ClassicSimilarity
 69 |   object GnafSimilarity extends PerFieldSimilarityWrapper(classicSimilarity) {
 70 |     val md = new MissingDataSimilarity
 71 |     val addr = new AddressSimilarity
 72 |     override def get(name: String) = if (name == F_ADDRESS) addr else if (name == F_MISSING_DATA) md else classicSimilarity
 73 |   }
 74 | 
 75 |   val storedNotIndexedFieldType = {
 76 |     val t = new FieldType
 77 |     // based on StringField
 78 |     t.setOmitNorms(true);
 79 |     t.setStored(true);
 80 |     t.setTokenized(false);
 81 |     t.setIndexOptions(IndexOptions.NONE); // StringField has DOCS
 82 |     t.freeze();
 83 |     t
 84 |   }
 85 |   
 86 |   val addressFieldType = {
 87 |     val t = new FieldType
 88 |     // based on TextField
 89 |     t.setOmitNorms(true);
 90 |     t.setStored(true);
 91 |     t.setTokenized(true);
 92 |     t.setIndexOptions(IndexOptions.DOCS); // not using term freq, TextField has DOCS_AND_FREQS_AND_POSITIONS
 93 |     t.freeze();
 94 |     t
 95 |   }
 96 |   
 97 |   val flatStreetNumFieldType = {
 98 |     val t = new FieldType
 99 |     t.setOmitNorms(true);
100 |     t.setStored(false);
101 |     t.setTokenized(false);
102 |     t.setIndexOptions(IndexOptions.DOCS);
103 |     t.freeze();
104 |     t
105 |   }
106 |   
107 |   val missingDataFieldType = {
108 |     val t = new FieldType
109 |     t.setOmitNorms(true);
110 |     t.setStored(false);
111 |     t.setTokenized(false);
112 |     t.setIndexOptions(IndexOptions.DOCS_AND_FREQS); // using term freq
113 |     t.freeze();
114 |     t
115 |   }
116 |   
117 |   val shingleWhiteLowerAnalyzer = new Analyzer {
118 |     
119 |     override protected def createComponents(fieldName: String) = {
120 |       val source = new WhitespaceTokenizer()
121 |       // ShingleFilter defaults are:
122 |       //   minShingleSize = 2 (error if set < 2), maxShingleSize = 2
123 |       //   outputUnigrams = true
124 |       val result = new ShingleFilter(new LowerCaseFilter(source), 2, 2)
125 |       result.setTokenSeparator(BIGRAM_SEPARATOR) // default is " ", changed so we can explicitly add a bigram by passing "a~b" through the tokenizer
126 |       new TokenStreamComponents(source, result)
127 |     }
128 |     
129 |     override def getPositionIncrementGap(fieldName: String): Int = 100 // stop shingles matching across boundaries
130 |   }
131 |   
132 |   def mkIndexer(dir: Directory) = new IndexWriter(
133 |     dir,
134 |     new IndexWriterConfig(shingleWhiteLowerAnalyzer)
135 |       .setOpenMode(IndexWriterConfig.OpenMode.CREATE)
136 |       .setSimilarity(GnafSimilarity)
137 |   )
138 |   
139 |   case class FuzzyParam(
140 |       /** max number of edits permitted for a match (0 for no fuzzy matching) */
141 |       maxEdits: Int, 
142 |       /** fuzzy matching only applied to terms of at least this length */
143 |       minLength: Int,
144 |       /** the initial length that must match exactly before fuzzy matching is applied to the remainder */
145 |       prefixLength: Int
146 |   )
147 |   
148 |   case class BoundingBox(minLat: Double, minLon: Double, maxLat: Double, maxLon: Double) {
149 |     def toQuery = DoublePoint.newRangeQuery(F_LOCATION, Array[Double](minLat, minLon), Array[Double](maxLat, maxLon))
150 |   }
151 |   
152 |   case class QueryParam(
153 |       /** address search terms - best results if ordered: site/building name, unit/flat, level, street, locality, state abbreviation, postcode */
154 |       addr: String,
155 |       /** number of search results to return */
156 |       numHits: Int,
157 |       /** optional fuzzy matching */
158 |       fuzzy: Option[FuzzyParam], 
159 |       /** optional filtering by a bounding box (addr may be blank) */
160 |       box: Option[BoundingBox]
161 |     ) {
162 |     def toQuery: Query = {
163 |       val q = tokenIter(shingleWhiteLowerAnalyzer, F_ADDRESS, addr).foldLeft {
164 |         val b = new BooleanQuery.Builder
165 |         // small score increment for missing: build/site, flat, level, streetNo (smaller than for an actual match)
166 |         b.add(new BooleanClause(new BoostQuery(new TermQuery(new Term(F_MISSING_DATA, MISSING_DATA_TOKEN)), 0.05f), BooleanClause.Occur.SHOULD))
167 |         box.foreach(x => b.add(new BooleanClause(x.toQuery, BooleanClause.Occur.FILTER)))
168 |         if (addr.trim.isEmpty)
169 |           // mobile use case: all addresses in box around me
170 |           b.add(new BooleanClause(new MatchAllDocsQuery, BooleanClause.Occur.SHOULD))
171 |         else
172 |           b.setMinimumNumberShouldMatch(2) // could be MISSING_DATA_TOKEN and 1 user term or 2 user terms
173 |         b
174 |       }{ (b, t) =>
175 |         val q = {
176 |           val term = new Term(F_ADDRESS, t)
177 |           val q = fuzzy
178 |             .filter(f => f.maxEdits > 0 && t.length >= f.minLength)
179 |             .map(f => new FuzzyQuery(term, f.maxEdits, f.prefixLength))
180 |             .getOrElse(new TermQuery(term))
181 |           val n = shingleSize(t)
182 |           if (n < 2) q else new BoostQuery(q, Math.pow(3.0, n-1).toFloat)
183 |         }
184 |         b.add(new BooleanClause(q, BooleanClause.Occur.SHOULD))
185 |       }.build
186 |       log.debug(s"mkQuery: bool query = ${q.toString(F_ADDRESS)}")
187 |       q
188 |     }
189 |   }
190 |   
191 | }


--------------------------------------------------------------------------------
/gnaf-contrib/src/main/scala/au/csiro/data61/gnaf/contrib/service/ContribService.scala:
--------------------------------------------------------------------------------
  1 | package au.csiro.data61.gnaf.contrib.service
  2 | 
  3 | import scala.concurrent.ExecutionContextExecutor
  4 | import scala.concurrent.duration.DurationInt
  5 | import scala.math.BigDecimal
  6 | 
  7 | import com.github.swagger.akka.{ HasActorSystem, SwaggerHttpService }
  8 | import com.github.swagger.akka.model.Info
  9 | import com.typesafe.config.{ Config, ConfigFactory }
 10 | 
 11 | import akka.actor.ActorSystem
 12 | import akka.event.{ Logging, LoggingAdapter }
 13 | import akka.http.scaladsl.Http
 14 | import akka.http.scaladsl.marshallers.sprayjson.SprayJsonSupport.{ sprayJsonMarshaller, sprayJsonUnmarshaller }
 15 | import akka.http.scaladsl.marshalling.ToResponseMarshallable
 16 | import akka.http.scaladsl.model.HttpMethods
 17 | import akka.http.scaladsl.model.StatusCodes.BadRequest
 18 | import akka.http.scaladsl.server.Directives._
 19 | import akka.stream.{ ActorMaterializer, Materializer }
 20 | import au.csiro.data61.gnaf.util.Util
 21 | import au.csiro.data61.gnaf.contrib.db.ContribTables
 22 | import ch.megard.akka.http.cors.CorsDirectives.cors
 23 | import ch.megard.akka.http.cors.CorsSettings.defaultSettings
 24 | import io.swagger.annotations.{ Api, ApiParam, ApiImplicitParams, ApiImplicitParam, ApiOperation }
 25 | import io.swagger.models.Swagger
 26 | import javax.ws.rs.{ Path, PathParam, DefaultValue }
 27 | import slick.dbio.DBIOAction
 28 | import slick.jdbc.ResultSetAction
 29 | import spray.json.DefaultJsonProtocol
 30 | 
 31 | case class ContribGeocode(id: Option[Long], contribStatus: String, addressSiteGeocodePid: Option[String], dateCreated: Long, version: Int, addressSitePid: String, geocodeTypeCode: String, longitude: BigDecimal, latitude: BigDecimal)
 32 | case class ContribGeocodeKey(id: Long, version: Int)
 33 | 
 34 | object JsonProtocol extends DefaultJsonProtocol {
 35 |   implicit val contribGeocodeFormat = jsonFormat9(ContribGeocode.apply)
 36 |   implicit val contribGeocodeKeyFormat = jsonFormat2(ContribGeocodeKey.apply)
 37 | }
 38 | import JsonProtocol._
 39 | import io.swagger.models.Swagger
 40 | 
 41 | @Api(value = "contrib", produces = "application/json")
 42 | @Path("contrib")
 43 | class ContribService(logger: LoggingAdapter, config: Config)(implicit system: ActorSystem, executor: ExecutionContextExecutor, materializer: Materializer) {
 44 |   object MyContribTables extends {
 45 |     val profile = Util.getObject[slick.driver.JdbcProfile](config.getString("gnafContribDb.slickDriver")) // e.g. slick.driver.{H2Driver,PostgresDriver}
 46 |   } with ContribTables
 47 |   import MyContribTables._
 48 |   import MyContribTables.profile.api._
 49 |   
 50 |   implicit val db = Database.forConfig("gnafContribDb", config)
 51 |     
 52 |   def createSchemaIfNotExists = {
 53 |     import scala.concurrent.Await
 54 |     import scala.concurrent.duration._
 55 |     import slick.jdbc.GetResult._
 56 |     import slick.jdbc.ResultSetAction
 57 |     
 58 |     val listTablesAction = ResultSetAction[(String, String, String, String)](_.conn.getMetaData.getTables("", "", null, null)).map(_.filter(_._4 == "TABLE").map(_._3))
 59 |     val createIfNotExistsAction = listTablesAction.flatMap { tbls => 
 60 |       if (tbls.isEmpty) schema.create.map(_ => "createSchemaIfNotExists: schema created")
 61 |       else DBIOAction.successful(s"createSchemaIfNotExists: pre-existing tables = $tbls") 
 62 |     }
 63 |     logger.info(Await.result(db.run(createIfNotExistsAction), 15.seconds))
 64 |   }
 65 |   
 66 |   val qList = {
 67 |     def q(addressSitePid: Rep[String]) = AddressSiteGeocode.filter(_.addressSitePid === addressSitePid)
 68 |     Compiled(q _)
 69 |   }
 70 |   
 71 |   def toContribGeocode(x: AddressSiteGeocodeRow) = ContribGeocode(x.id, x.contribStatus, x.addressSiteGeocodePid, x.dateCreated.getTime, x.version, x.addressSitePid, x.geocodeTypeCode, x.longitude, x.latitude)
 72 | 
 73 |   @Path("{addressSitePid}")
 74 |   @ApiOperation(value = "List contributed geocodes for an addressSitePid", nickname = "list", 
 75 |       httpMethod = "GET", response = classOf[ContribGeocode], responseContainer = "List")
 76 |   def listRoute(
 77 |       @PathParam("addressSitePid")
 78 |       addressSitePid: String
 79 |   ) = {
 80 |     val f = db.run(qList(addressSitePid).result).map(_.map(toContribGeocode))
 81 |     complete { f }
 82 |   }
 83 | 
 84 |   val contribGeocodeWithId = (AddressSiteGeocode returning AddressSiteGeocode.map(_.id) )
 85 |   def toAddressSiteGeocodeRow(x: ContribGeocode) = AddressSiteGeocodeRow(x.id, x.contribStatus, x.addressSiteGeocodePid, new java.sql.Date(x.dateCreated), x.version, x.addressSitePid, x.geocodeTypeCode, x.longitude, x.latitude)
 86 |   
 87 |   @ApiOperation(value = "Add a new contributed geocode for an addressSitePid", nickname = "create",
 88 |       notes="""id, version and dateCreated input ignored & output set by system (however input values for version and dateCreated are still required). 
 89 |         
 90 | Example input (included here as @ApiParam(defaultValue) and @DefaultValue aren't working so far):
 91 | {
 92 | 	"contribStatus":"Submitted",
 93 | 	"addressSitePid":"712279621",
 94 | 	"geocodeTypeCode":"EM",
 95 | 	"longitude":149.1213974,
 96 | 	"latitude":-35.280994199999995,
 97 | 	"dateCreated":0,
 98 | 	"version":0
 99 | }    
100 | """,
101 |       httpMethod = "POST", response = classOf[ContribGeocode])
102 |   def createContribRoute(
103 |     @ApiParam(value = "contribGeocode", required = true, defaultValue = "Fred")
104 |     @DefaultValue("harry")
105 |     c: ContribGeocode
106 |   ) = {
107 |     val c2 = (c.copy(dateCreated = System.currentTimeMillis, version = 1))
108 |     val f = db.run(contribGeocodeWithId += toAddressSiteGeocodeRow(c2)).map(id => c2.copy(id = Some(id)))
109 |     complete { f }
110 |   }
111 |   
112 |   def qGet = {
113 |     def q(id: Rep[Long], version: Rep[Int]) = AddressSiteGeocode.filter(x => x.id === id && x.version === version)
114 |     Compiled(q _)
115 |   }
116 |   
117 |   @ApiOperation(value = "Delete a contributed geocode for an addressSitePid", nickname = "delete", 
118 |       notes="optimistic lock version must match to succeed", httpMethod = "DELETE", response = classOf[ContribGeocodeKey])
119 |   def deleteContribRoute(
120 |     @ApiParam(value = "contribGeocodeKey", required = true) 
121 |     key: ContribGeocodeKey
122 |   ) = {
123 |     val f = db.run(qGet(key.id, key.version).delete)
124 |     complete {
125 |       f.map[ToResponseMarshallable] { cnt =>
126 |         if (cnt == 1) key
127 |         else BadRequest -> s"key = $key not found"
128 |       }
129 |     }
130 |   }
131 | 
132 |   @ApiOperation(value = "Update a contributed geocode for an addressSitePid", nickname = "update",
133 |       notes = """optimistic lock version must match to succeed.
134 |         
135 | dateCreated input ignored (but still required); version and dateCreated output set by system
136 | """, 
137 |       httpMethod = "PUT", response = classOf[ContribGeocode])
138 |   def updateContribRoute(
139 |     @ApiParam(value = "contribGeocode", required = true) 
140 |     c: ContribGeocode
141 |   ) = {
142 |     val c2 = c.copy(version = c.version + 1, dateCreated = System.currentTimeMillis)
143 |     val f = db.run(qGet(c.id.get, c.version).update(toAddressSiteGeocodeRow(c2)))
144 |     complete {
145 |       f.map[ToResponseMarshallable] { cnt =>
146 |         if (cnt == 1) c2
147 |         else s"id = ${c.id}, version = ${c.version} not found"
148 |       }
149 |     }
150 |   }
151 |   
152 |   val routes = pathPrefix("contrib") {
153 |     (post & entity(as[ContribGeocode])) { createContribRoute } ~
154 |     (get & path(Segment)) { listRoute } ~
155 |     (delete & entity(as[ContribGeocodeKey])) { deleteContribRoute } ~
156 |     (put & entity(as[ContribGeocode])) { updateContribRoute }
157 |   }
158 | }
159 | 
160 | object ContribService {
161 |   implicit val sys = ActorSystem()
162 |   implicit val exec = sys.dispatcher
163 |   implicit val mat = ActorMaterializer()
164 |   
165 |   val logger = Logging(sys, getClass)
166 |   val config = ConfigFactory.load
167 |   val interface = config.getString("http.interface")
168 |   val port = config.getInt("http.port")
169 | 
170 |   val service = new ContribService(logger, config)
171 |   
172 |   // /api-docs/swagger.json
173 |   val swagger = new SwaggerHttpService with HasActorSystem {
174 |     import scala.reflect.runtime.{ universe => ru }
175 | 
176 |     override implicit val actorSystem = sys
177 |     override implicit val materializer = mat
178 |     override val apiTypes = Seq(ru.typeOf[ContribService])
179 |     override def swaggerConfig = new Swagger().basePath(prependSlashIfNecessary(basePath)) // don't specify protocol://host
180 |   }
181 | 
182 |   def main(args: Array[String]): Unit = {
183 |     service.createSchemaIfNotExists
184 |     
185 |     val routes = cors(defaultSettings.copy(allowedMethods = HttpMethods.DELETE +: defaultSettings.allowedMethods)) {
186 |       logRequestResult("GnafContrib") { service.routes } ~
187 |       logRequestResult("Swagger") { swagger.routes }
188 |     }
189 |     Http().bindAndHandle(routes, interface, port)
190 |   }
191 | }
192 | 


--------------------------------------------------------------------------------
/gnaf-search/src/main/scala/au/csiro/data61/gnaf/search/Search.scala:
--------------------------------------------------------------------------------
  1 | package au.csiro.data61.gnaf.search
  2 | 
  3 | import java.io.File
  4 | 
  5 | import scala.collection.mutable.ListBuffer
  6 | import scala.concurrent.{ ExecutionContextExecutor, Future }
  7 | import scala.io.Source
  8 | import scala.reflect.runtime.universe
  9 | 
 10 | import org.apache.lucene.document.Document
 11 | import org.apache.lucene.search.{ ScoreDoc, Sort }
 12 | 
 13 | import com.github.swagger.akka.{ HasActorSystem, SwaggerHttpService }
 14 | import com.typesafe.config.ConfigFactory
 15 | 
 16 | import akka.actor.ActorSystem
 17 | import akka.http.scaladsl.Http
 18 | import akka.http.scaladsl.marshallers.sprayjson.SprayJsonSupport.{ sprayJsonMarshaller, sprayJsonUnmarshaller }
 19 | import akka.http.scaladsl.marshalling.ToResponseMarshallable.apply
 20 | import akka.http.scaladsl.server.Directives._
 21 | import akka.http.scaladsl.server.RouteResult.route2HandlerFlow
 22 | import akka.http.scaladsl.server.directives.LoggingMagnet.forRequestResponseFromMarker
 23 | import akka.stream.{ ActorMaterializer, Materializer }
 24 | import au.csiro.data61.gnaf.lucene.GnafLucene._
 25 | import au.csiro.data61.gnaf.lucene.LuceneUtil.{ Searcher, directory }
 26 | import au.csiro.data61.gnaf.search.Search.Result
 27 | import au.csiro.data61.gnaf.util.Util.getLogger
 28 | import ch.megard.akka.http.cors.CorsDirectives.cors
 29 | import io.swagger.annotations.{ Api, ApiOperation, ApiParam }
 30 | import io.swagger.models.Swagger
 31 | import javax.ws.rs.Path
 32 | import spray.json.{ DefaultJsonProtocol, pimpString }
 33 | 
 34 | object Search {
 35 |   val log = getLogger(getClass)
 36 | 
 37 |   case class CliOption(indexDir: File, bulk: Int, numHits: Int, fuzzyMinLength: Int, fuzzyMaxEdits: Int, fuzzyPrefixLength: Int, interface: String, port: Int)
 38 |   val defaultCliOption = {
 39 |     val c = ConfigFactory.load.getConfig("gnafSearch")
 40 |     def gs(n: String) = c.getString(n)
 41 |     def gi(n: String) = c.getInt(n)
 42 |     CliOption(new File(gs("indexDir")), gi("bulk"), gi("numHits"), gi("fuzzyMinLength"), gi("fuzzyMaxEdits"), gi("fuzzyPrefixLength"), gs("interface"), gi("port"))
 43 |   }
 44 | 
 45 |   def main(args: Array[String]): Unit = {
 46 |     val parser = new scopt.OptionParser[CliOption]("gnaf-indexer") {
 47 |       head("gnaf-lucene-service", "0.x")
 48 |       note("JSON web service for address searches")
 49 |       opt[File]('i', "indexDir") action { (x, c) =>
 50 |         c.copy(indexDir = x)
 51 |       } text (s"Lucene index directory, default ${defaultCliOption.indexDir}")
 52 |       opt[Int]('b', "bulk") action { (x, c) =>
 53 |         c.copy(bulk = x)
 54 |       } text (s"max addresses client may put in a bulk request, default ${defaultCliOption.bulk}")
 55 |       opt[Int]('h', "numHits") action { (x, c) =>
 56 |         c.copy(numHits = x)
 57 |       } text (s"max client may request for the number of search hits, default ${defaultCliOption.numHits}")
 58 |       opt[Int]('f', "minFuzzyLength") action { (x, c) =>
 59 |         c.copy(fuzzyMinLength = x)
 60 |       } text (s"min client may request for min query term length for fuzzy match, default ${defaultCliOption.fuzzyMinLength}")
 61 |       opt[Int]('e', "fuzzyMaxEdits") action { (x, c) =>
 62 |         c.copy(fuzzyMaxEdits = x)
 63 |       } text (s"max client may request for max edits for a fuzzy match, default ${defaultCliOption.fuzzyMaxEdits}")
 64 |       opt[Int]('p', "fuzzyPrefixLength") action { (x, c) =>
 65 |         c.copy(fuzzyPrefixLength = x)
 66 |       } text (s"min client may request for min initial chars that must match exactly for a fuzzy match, default ${defaultCliOption.fuzzyPrefixLength}")
 67 |       opt[String]('n', "interface") action { (x, c) =>
 68 |         c.copy(interface = x)
 69 |       } text (s"network interface (name or IP address) to attach to, default ${defaultCliOption.interface}")
 70 |       opt[Int]('r', "port") action { (x, c) =>
 71 |         c.copy(port = x)
 72 |       } text (s"IP port to listen on, default ${defaultCliOption.port}")
 73 |       help("help") text ("prints this usage text")
 74 |     }
 75 |     parser.parse(args, defaultCliOption) foreach run
 76 |   }
 77 |   
 78 |   case class Hit(score: Float, json: String, d61Address: List[String], d61AddressNoAlias: String)
 79 |   def toHit(scoreDoc: ScoreDoc, doc: Document) = {
 80 |     Hit(scoreDoc.score, doc.get(F_JSON), doc.getValues(F_ADDRESS).toList, doc.get(F_ADDRESS_NOALIAS))
 81 |   }
 82 |   
 83 |   case class Result(totalHits: Int, elapsedSecs: Float, hits: Seq[Hit], error: Option[String])
 84 |   def toResult(totalHits: Int, elapsedSecs: Float, hits: Seq[Hit], error: Option[String])
 85 |     = Result(totalHits, elapsedSecs, hits, error)
 86 |     
 87 |   def toSort(f: Option[String], asc: Boolean): Option[Sort] = None
 88 |   
 89 |   def validationBuf(c: CliOption, qp: QueryParam): ListBuffer[String] = {
 90 |     val b = new ListBuffer[String]()
 91 |     if (qp.numHits > c.numHits) b += s"numHits = ${qp.numHits} exceeds max of ${c.numHits}"
 92 |     qp.fuzzy.foreach { f =>
 93 |       if (f.minLength < c.fuzzyMinLength) b += s"fuzzy minLength = ${f.minLength} less than min of ${c.fuzzyMinLength}"
 94 |       if (f.maxEdits > c.fuzzyMaxEdits) b += s"fuzzy maxEdits = ${f.maxEdits} exceeds max of ${c.fuzzyMaxEdits}"
 95 |       if (f.prefixLength < c.fuzzyPrefixLength) b += s"fuzzy prefixLength = ${f.prefixLength} less than min of ${c.fuzzyPrefixLength}"
 96 |       if (f.prefixLength >= f.minLength) b += s"fuzzy prefixLength = ${f.prefixLength} not less than minLength = ${f.minLength}"
 97 |     }
 98 |     b
 99 |   }
100 |   
101 |   /** validation error message or empty for no error */
102 |   def validationError(b: ListBuffer[String]) = b.mkString("\n")
103 |   
104 |   case class BulkQueryParam(addresses: Seq[String], numHits: Int, fuzzy: Option[FuzzyParam], box: Option[BoundingBox])
105 | 
106 |   def validationBuf(c: CliOption, bqp: BulkQueryParam): ListBuffer[String] = {
107 |     val b = validationBuf(c, QueryParam("", bqp.numHits, bqp.fuzzy, bqp.box))
108 |     if (bqp.addresses.size > c.bulk) b += s"addresses.size = ${bqp.addresses.size} exceeds max of ${c.bulk}"
109 |     b
110 |   }
111 |   
112 |   case class Version(`git-commit`: String, `sbt-version`: String, `gnaf-version`: String)
113 |   
114 |   object JsonProtocol extends DefaultJsonProtocol {
115 |     implicit val hitFormat = jsonFormat4(Hit)
116 |     implicit val resultFormat = jsonFormat4(Result)
117 |     implicit val fuzzyParamFormat = jsonFormat3(FuzzyParam)
118 |     implicit val boundingBoxFormat = jsonFormat4(BoundingBox)
119 |     implicit val queryParamFormat = jsonFormat4(QueryParam)
120 |     implicit val bulkQueryParamFormat = jsonFormat4(BulkQueryParam)
121 |     implicit val versionFormat = jsonFormat3(Version)
122 |   }
123 |   import JsonProtocol._
124 | 
125 |   
126 |   def mkSearcher(c: CliOption) = {
127 |     val s = new Searcher(directory(c.indexDir), toHit, toResult)
128 |     s.searcher.setSimilarity(GnafSimilarity)
129 |     s
130 |   }
131 |   
132 |   def run(c: CliOption) = {
133 |     
134 |     val version = {
135 |       Option(getClass.getResourceAsStream("/version.json")).map { s =>
136 |         Source.fromInputStream(s).getLines.mkString("\n").parseJson.convertTo[Version]
137 |       }.getOrElse(Version("unknown git-commit", "unknown sbt-version", "unknown gnaf-version"))
138 |     }
139 |     
140 |     implicit val sys = ActorSystem()
141 |     implicit val exec = sys.dispatcher
142 |     implicit val mat = ActorMaterializer()
143 |     
144 |     val luceneService = new LuceneService(c, mkSearcher(c), version)
145 |     
146 |     // /api-docs/swagger.json
147 |     val swaggerService = new SwaggerHttpService() with HasActorSystem {
148 |       override implicit val actorSystem = sys
149 |       override implicit val materializer = mat
150 |       override val apiTypes = Seq(scala.reflect.runtime.universe.typeOf[LuceneService])
151 |       override def swaggerConfig = new Swagger().basePath(prependSlashIfNecessary(basePath)) // don't specify protocol://host
152 |     }
153 |     
154 |     val routes = cors() {
155 |       logRequestResult("LuceneService") { luceneService.routes } ~ 
156 |       logRequestResult("Swagger") { swaggerService.routes }
157 |     }
158 |     log.info("starting service ...")
159 |     Http().bindAndHandle(routes, c.interface, c.port)
160 |   }
161 | }
162 | 
163 | import Search._
164 | import Search.JsonProtocol._
165 | 
166 | @Api(value = "search", produces = "application/json")
167 | @Path("")
168 | class LuceneService(c: CliOption, searcher: Searcher[Hit, Result], version: Version)
169 | (implicit system: ActorSystem, executor: ExecutionContextExecutor, materializer: Materializer) {
170 |   
171 |   @Path("version")
172 |   @ApiOperation(value = "Version of software and data", nickname = "version", notes="""longer description""", httpMethod = "GET", response = classOf[Version])
173 |   def versionRoute =
174 |     complete { // Future { 
175 |       version
176 |     } //}
177 |   
178 |   @Path("search")
179 |   @ApiOperation(value = "Search for an address", nickname = "search", notes="""longer description""", httpMethod = "POST", response = classOf[Result])
180 |   def searchRoute(
181 |     @ApiParam(value = "queryParam", required = true) q: QueryParam
182 |   ) = {
183 |     val err = validationError(validationBuf(c, q))
184 |     validate(err.isEmpty, err) { complete { Future {
185 |       searcher.search(q.toQuery, q.numHits)
186 |     }}}
187 |   }
188 |   
189 |   @Path("bulkSearch")
190 |   @ApiOperation(value = "Search for many addresses", nickname = "bulkSearch", notes="""longer description""", httpMethod = "POST", response = classOf[Array[Result]])
191 |   def bulkSearchRoute(
192 |     @ApiParam(value = "bulkQueryParam", required = true) q: BulkQueryParam
193 |   ) = {
194 |     val err = validationError(validationBuf(c, q))
195 |     validate(err.isEmpty, err) { complete { Future {
196 |       def seqop(z: Seq[Result], addr: String) = z :+ searcher.search(QueryParam(addr, q.numHits, q.fuzzy, q.box).toQuery, q.numHits)
197 |       q.addresses.par.aggregate(Seq.empty[Result])(seqop, _ ++ _)
198 |     }}}
199 |   }
200 |   
201 |   val routes = { 
202 |     pathPrefix("version")    { get                                 { versionRoute    } } ~
203 |     pathPrefix("search")     { (post & entity(as[QueryParam]))     { searchRoute     } } ~
204 |     pathPrefix("bulkSearch") { (post & entity(as[BulkQueryParam])) { bulkSearchRoute } }
205 |   }
206 | 
207 | }
208 | 


--------------------------------------------------------------------------------
/gnaf-contrib/3rd-party-licenses.html:
--------------------------------------------------------------------------------
1 | <html><head><title>gnaf-contrib-licenses</title></head><body><h1>gnaf-contrib-licenses</h1><table border="0" cellspacing="0" cellpading="1">
2 |       <thead><tr><th>Category</th><th>License</th><th>Dependency</th><th>Notes</th></tr></thead>
3 |     <tbody><tr><td>Apache&nbsp;</td><td><a href="https://www.apache.org/licenses/LICENSE-2.0.txt">Apache 2</a>&nbsp;</td><td>ch.megard # akka-http-cors_2.11 # 0.1.2&nbsp;</td><td></td></tr><tr><td>Apache&nbsp;</td><td><a href="http://www.apache.org/licenses/LICENSE-2.0.txt">Apache 2</a>&nbsp;</td><td>io.spray # spray-json_2.11 # 1.3.2&nbsp;</td><td></td></tr><tr><td>Apache&nbsp;</td><td><a href="http://www.apache.org/licenses/LICENSE-2.0.txt">Apache 2</a>&nbsp;</td><td>joda-time # joda-time # 2.2&nbsp;</td><td></td></tr><tr><td>Apache&nbsp;</td><td><a href="http://www.apache.org/licenses/LICENSE-2.0.html">Apache 2.0 License</a>&nbsp;</td><td>com.typesafe.scala-logging # scala-logging_2.11 # 3.1.0&nbsp;</td><td></td></tr><tr><td>Apache&nbsp;</td><td><a href="LICENSE.txt">Apache License Version 2.0</a>&nbsp;</td><td>org.yaml # snakeyaml # 1.12&nbsp;</td><td></td></tr><tr><td>Apache&nbsp;</td><td><a href="http://www.apache.org/licenses/LICENSE-2.0">Apache License, Version 2.0</a>&nbsp;</td><td>com.typesafe # config # 1.3.0&nbsp;</td><td></td></tr><tr><td>Apache&nbsp;</td><td><a href="http://www.apache.org/licenses/LICENSE-2.0">Apache License, Version 2.0</a>&nbsp;</td><td>com.typesafe # ssl-config-akka_2.11 # 0.2.1&nbsp;</td><td></td></tr><tr><td>Apache&nbsp;</td><td><a href="http://www.apache.org/licenses/LICENSE-2.0">Apache License, Version 2.0</a>&nbsp;</td><td>com.typesafe # ssl-config-core_2.11 # 0.2.1&nbsp;</td><td></td></tr><tr><td>Apache&nbsp;</td><td><a href="http://www.apache.org/licenses/LICENSE-2.0">Apache License, Version 2.0</a>&nbsp;</td><td>com.typesafe.akka # akka-actor_2.11 # 2.4.6&nbsp;</td><td></td></tr><tr><td>Apache&nbsp;</td><td><a href="http://www.apache.org/licenses/LICENSE-2.0">Apache License, Version 2.0</a>&nbsp;</td><td>com.typesafe.akka # akka-http-core_2.11 # 2.4.6&nbsp;</td><td></td></tr><tr><td>Apache&nbsp;</td><td><a href="http://www.apache.org/licenses/LICENSE-2.0">Apache License, Version 2.0</a>&nbsp;</td><td>com.typesafe.akka # akka-http-experimental_2.11 # 2.4.6&nbsp;</td><td></td></tr><tr><td>Apache&nbsp;</td><td><a href="http://www.apache.org/licenses/LICENSE-2.0">Apache License, Version 2.0</a>&nbsp;</td><td>com.typesafe.akka # akka-http-spray-json-experimental_2.11 # 2.4.6&nbsp;</td><td></td></tr><tr><td>Apache&nbsp;</td><td><a href="http://www.apache.org/licenses/LICENSE-2.0">Apache License, Version 2.0</a>&nbsp;</td><td>com.typesafe.akka # akka-http-testkit_2.11 # 2.4.3&nbsp;</td><td></td></tr><tr><td>Apache&nbsp;</td><td><a href="http://www.apache.org/licenses/LICENSE-2.0">Apache License, Version 2.0</a>&nbsp;</td><td>com.typesafe.akka # akka-parsing_2.11 # 2.4.6&nbsp;</td><td></td></tr><tr><td>Apache&nbsp;</td><td><a href="http://www.apache.org/licenses/LICENSE-2.0">Apache License, Version 2.0</a>&nbsp;</td><td>com.typesafe.akka # akka-stream-testkit_2.11 # 2.4.3&nbsp;</td><td></td></tr><tr><td>Apache&nbsp;</td><td><a href="http://www.apache.org/licenses/LICENSE-2.0">Apache License, Version 2.0</a>&nbsp;</td><td>com.typesafe.akka # akka-stream_2.11 # 2.4.6&nbsp;</td><td></td></tr><tr><td>Apache&nbsp;</td><td><a href="http://www.apache.org/licenses/LICENSE-2.0">Apache License, Version 2.0</a>&nbsp;</td><td>com.typesafe.akka # akka-testkit_2.11 # 2.4.3&nbsp;</td><td></td></tr><tr><td>Apache&nbsp;</td><td><a href="http://www.apache.org/licenses/LICENSE-2.0.txt">The Apache Software License, Version 2.0</a>&nbsp;</td><td>com.fasterxml.jackson.core # jackson-annotations # 2.4.5&nbsp;</td><td></td></tr><tr><td>Apache&nbsp;</td><td><a href="http://www.apache.org/licenses/LICENSE-2.0.txt">The Apache Software License, Version 2.0</a>&nbsp;</td><td>com.fasterxml.jackson.core # jackson-core # 2.4.5&nbsp;</td><td></td></tr><tr><td>Apache&nbsp;</td><td><a href="http://www.apache.org/licenses/LICENSE-2.0.txt">The Apache Software License, Version 2.0</a>&nbsp;</td><td>com.fasterxml.jackson.core # jackson-databind # 2.4.5&nbsp;</td><td></td></tr><tr><td>Apache&nbsp;</td><td><a href="http://www.apache.org/licenses/LICENSE-2.0.txt">The Apache Software License, Version 2.0</a>&nbsp;</td><td>com.fasterxml.jackson.dataformat # jackson-dataformat-xml # 2.4.5&nbsp;</td><td></td></tr><tr><td>Apache&nbsp;</td><td><a href="http://www.apache.org/licenses/LICENSE-2.0.txt">The Apache Software License, Version 2.0</a>&nbsp;</td><td>com.fasterxml.jackson.dataformat # jackson-dataformat-yaml # 2.4.5&nbsp;</td><td></td></tr><tr><td>Apache&nbsp;</td><td><a href="http://www.apache.org/licenses/LICENSE-2.0.txt">The Apache Software License, Version 2.0</a>&nbsp;</td><td>com.fasterxml.jackson.datatype # jackson-datatype-joda # 2.4.5&nbsp;</td><td></td></tr><tr><td>Apache&nbsp;</td><td><a href="http://www.apache.org/licenses/LICENSE-2.0.txt">The Apache Software License, Version 2.0</a>&nbsp;</td><td>com.fasterxml.jackson.jaxrs # jackson-jaxrs-base # 2.4.5&nbsp;</td><td></td></tr><tr><td>Apache&nbsp;</td><td><a href="http://www.apache.org/licenses/LICENSE-2.0.txt">The Apache Software License, Version 2.0</a>&nbsp;</td><td>com.fasterxml.jackson.jaxrs # jackson-jaxrs-json-provider # 2.4.5&nbsp;</td><td></td></tr><tr><td>Apache&nbsp;</td><td><a href="http://www.apache.org/licenses/LICENSE-2.0.txt">The Apache Software License, Version 2.0</a>&nbsp;</td><td>com.fasterxml.jackson.module # jackson-module-jaxb-annotations # 2.4.5&nbsp;</td><td></td></tr><tr><td>Apache&nbsp;</td><td><a href="http://www.apache.org/licenses/LICENSE-2.0.txt">The Apache Software License, Version 2.0</a>&nbsp;</td><td>com.fasterxml.jackson.module # jackson-module-scala_2.11 # 2.4.2&nbsp;</td><td></td></tr><tr><td>Apache&nbsp;</td><td><a href="http://www.apache.org/licenses/LICENSE-2.0.txt">The Apache Software License, Version 2.0</a>&nbsp;</td><td>com.github.swagger-akka-http # swagger-akka-http_2.11 # 0.7.0&nbsp;</td><td></td></tr><tr><td>Apache&nbsp;</td><td><a href="http://www.apache.org/licenses/LICENSE-2.0.txt">The Apache Software License, Version 2.0</a>&nbsp;</td><td>com.google.guava # guava # 18.0&nbsp;</td><td></td></tr><tr><td>Apache&nbsp;</td><td><a href="http://www.apache.org/licenses/LICENSE-2.0.txt">The Apache Software License, Version 2.0</a>&nbsp;</td><td>com.zaxxer # HikariCP-java6 # 2.3.7&nbsp;</td><td></td></tr><tr><td>Apache&nbsp;</td><td><a href="http://www.apache.org/licenses/LICENSE-2.0.txt">The Apache Software License, Version 2.0</a>&nbsp;</td><td>io.swagger # swagger-annotations # 1.5.9&nbsp;</td><td></td></tr><tr><td>Apache&nbsp;</td><td><a href="http://www.apache.org/licenses/LICENSE-2.0.txt">The Apache Software License, Version 2.0</a>&nbsp;</td><td>io.swagger # swagger-core # 1.5.9&nbsp;</td><td></td></tr><tr><td>Apache&nbsp;</td><td><a href="http://www.apache.org/licenses/LICENSE-2.0.txt">The Apache Software License, Version 2.0</a>&nbsp;</td><td>io.swagger # swagger-jaxrs # 1.5.9&nbsp;</td><td></td></tr><tr><td>Apache&nbsp;</td><td><a href="http://www.apache.org/licenses/LICENSE-2.0.txt">The Apache Software License, Version 2.0</a>&nbsp;</td><td>io.swagger # swagger-models # 1.5.9&nbsp;</td><td></td></tr><tr><td>Apache&nbsp;</td><td><a href="http://www.apache.org/licenses/LICENSE-2.0.txt">The Apache Software License, Version 2.0</a>&nbsp;</td><td>io.swagger # swagger-scala-module_2.11 # 1.0.2&nbsp;</td><td></td></tr><tr><td>Apache&nbsp;</td><td><a href="http://www.apache.org/licenses/LICENSE-2.0.txt">The Apache Software License, Version 2.0</a>&nbsp;</td><td>javax.validation # validation-api # 1.1.0.Final&nbsp;</td><td></td></tr><tr><td>Apache&nbsp;</td><td><a href="http://www.apache.org/licenses/LICENSE-2.0.txt">The Apache Software License, Version 2.0</a>&nbsp;</td><td>org.apache.commons # commons-lang3 # 3.2.1&nbsp;</td><td></td></tr><tr><td>Apache&nbsp;</td><td><a href="http://www.apache.org/licenses/LICENSE-2.0.txt">The Apache Software License, Version 2.0</a>&nbsp;</td><td>org.javassist # javassist # 3.19.0-GA&nbsp;</td><td></td></tr><tr><td>Apache&nbsp;</td><td><a href="http://www.apache.org/licenses/LICENSE-2.0.txt">The Apache Software License, Version 2.0</a>&nbsp;</td><td>org.json4s # json4s-ast_2.11 # 3.2.11&nbsp;</td><td></td></tr><tr><td>Apache&nbsp;</td><td><a href="http://www.apache.org/licenses/LICENSE-2.0.txt">The Apache Software License, Version 2.0</a>&nbsp;</td><td>org.json4s # json4s-core_2.11 # 3.2.11&nbsp;</td><td></td></tr><tr><td>Apache&nbsp;</td><td><a href="http://www.apache.org/licenses/LICENSE-2.0.txt">The Apache Software License, Version 2.0</a>&nbsp;</td><td>org.json4s # json4s-jackson_2.11 # 3.2.11&nbsp;</td><td></td></tr><tr><td>Apache&nbsp;</td><td><a href="http://www.apache.org/licenses/LICENSE-2.0.txt">The Apache Software License, Version 2.0</a>&nbsp;</td><td>org.json4s # json4s-native_2.11 # 3.2.11&nbsp;</td><td></td></tr><tr><td>BSD&nbsp;</td><td><a href="https://github.com/data61/gnaf/blob/master/LICENSE.txt">BSD</a>&nbsp;</td><td>au.csiro.data61.gnaf # gnaf-util_2.11 # 0.8-SNAPSHOT&nbsp;</td><td></td></tr><tr><td>BSD&nbsp;</td><td><a href="http://www.scala-lang.org/license.html">BSD 3-Clause</a>&nbsp;</td><td>org.scala-lang # scala-compiler # 2.11.8&nbsp;</td><td></td></tr><tr><td>BSD&nbsp;</td><td><a href="http://www.scala-lang.org/license.html">BSD 3-Clause</a>&nbsp;</td><td>org.scala-lang # scala-library # 2.11.8&nbsp;</td><td></td></tr><tr><td>BSD&nbsp;</td><td><a href="http://www.scala-lang.org/license.html">BSD 3-Clause</a>&nbsp;</td><td>org.scala-lang # scala-reflect # 2.11.8&nbsp;</td><td></td></tr><tr><td>BSD&nbsp;</td><td><a href="http://www.scala-lang.org/license.html">BSD 3-Clause</a>&nbsp;</td><td>org.scala-lang # scalap # 2.11.8&nbsp;</td><td></td></tr><tr><td>BSD&nbsp;</td><td><a href="http://opensource.org/licenses/BSD-3-Clause">BSD 3-clause</a>&nbsp;</td><td>org.scala-lang.modules # scala-java8-compat_2.11 # 0.7.0&nbsp;</td><td></td></tr><tr><td>BSD&nbsp;</td><td><a href="http://opensource.org/licenses/BSD-3-Clause">BSD 3-clause</a>&nbsp;</td><td>org.scala-lang.modules # scala-parser-combinators_2.11 # 1.0.4&nbsp;</td><td></td></tr><tr><td>BSD&nbsp;</td><td><a href="http://opensource.org/licenses/BSD-3-Clause">BSD 3-clause</a>&nbsp;</td><td>org.scala-lang.modules # scala-xml_2.11 # 1.0.4&nbsp;</td><td></td></tr><tr><td>BSD&nbsp;</td><td><a href="http://www.opensource.org/licenses/bsd-license.php">BSD-Style</a>&nbsp;</td><td>com.thoughtworks.paranamer # paranamer # 2.6&nbsp;</td><td></td></tr><tr><td>BSD&nbsp;</td><td><a href="http://www.opensource.org/licenses/bsd-license.php">The BSD License</a>&nbsp;</td><td>org.codehaus.woodstox # stax2-api # 3.1.4&nbsp;</td><td></td></tr><tr><td>BSD&nbsp;</td><td><a href="http://www.opensource.org/licenses/bsd-license.html">The New BSD License</a>&nbsp;</td><td>org.reflections # reflections # 0.9.10&nbsp;</td><td></td></tr><tr><td>BSD&nbsp;</td><td><a href="http://github.com/slick/slick/blob/master/LICENSE.txt">Two-clause BSD-style license</a>&nbsp;</td><td>com.typesafe.slick # slick-codegen_2.11 # 3.1.1&nbsp;</td><td></td></tr><tr><td>BSD&nbsp;</td><td><a href="http://github.com/slick/slick/blob/master/LICENSE.txt">Two-clause BSD-style license</a>&nbsp;</td><td>com.typesafe.slick # slick-hikaricp_2.11 # 3.1.1&nbsp;</td><td></td></tr><tr><td>BSD&nbsp;</td><td><a href="http://github.com/slick/slick/blob/master/LICENSE.txt">Two-clause BSD-style license</a>&nbsp;</td><td>com.typesafe.slick # slick_2.11 # 3.1.1&nbsp;</td><td></td></tr><tr><td>CC0&nbsp;</td><td><a href="http://creativecommons.org/publicdomain/zero/1.0/">CC0</a>&nbsp;</td><td>org.reactivestreams # reactive-streams # 1.0.0&nbsp;</td><td></td></tr><tr><td>GPL with Classpath Extension&nbsp;</td><td><a href="https://glassfish.dev.java.net/nonav/public/CDDL+GPL.html">CDDL + GPLv2 with classpath exception</a>&nbsp;</td><td>javax.ws.rs # jsr311-api # 1.1.1&nbsp;</td><td></td></tr><tr><td>LGPL&nbsp;</td><td><a href="http://logback.qos.ch/license.html">EPL + GNU Lesser General Public License</a>&nbsp;</td><td>ch.qos.logback # logback-classic # 1.1.3&nbsp;</td><td></td></tr><tr><td>LGPL&nbsp;</td><td><a href="http://logback.qos.ch/license.html">EPL + GNU Lesser General Public License</a>&nbsp;</td><td>ch.qos.logback # logback-core # 1.1.3&nbsp;</td><td></td></tr><tr><td>LGPL&nbsp;</td><td><a href="http://www.gnu.org/licenses/lgpl.html">GNU Lesser General Public License</a>&nbsp;</td><td>com.google.code.findbugs # annotations # 2.0.1&nbsp;</td><td></td></tr><tr><td>LGPL&nbsp;</td><td><a href="http://www.gnu.org/licenses/lgpl.html">GNU Lesser General Public License</a>&nbsp;</td><td>com.google.code.findbugs # jsr305 # 2.0.1&nbsp;</td><td></td></tr><tr><td>MIT&nbsp;</td><td><a href="http://www.slf4j.org/license.html">MIT License</a>&nbsp;</td><td>org.slf4j # slf4j-api # 1.7.12&nbsp;</td><td></td></tr><tr><td>Mozilla&nbsp;</td><td><a href="http://h2database.com/html/license.html">MPL 2.0 or EPL 1.0</a>&nbsp;</td><td>com.h2database # h2 # 1.4.193&nbsp;</td><td></td></tr></tbody></table></body></html>


--------------------------------------------------------------------------------
/gnaf-db-service/3rd-party-licenses.html:
--------------------------------------------------------------------------------
1 | <html><head><title>gnaf-db-service-licenses</title></head><body><h1>gnaf-db-service-licenses</h1><table border="0" cellspacing="0" cellpading="1">
2 |       <thead><tr><th>Category</th><th>License</th><th>Dependency</th><th>Notes</th></tr></thead>
3 |     <tbody><tr><td>Apache&nbsp;</td><td><a href="https://www.apache.org/licenses/LICENSE-2.0.txt">Apache 2</a>&nbsp;</td><td>ch.megard # akka-http-cors_2.11 # 0.1.2&nbsp;</td><td></td></tr><tr><td>Apache&nbsp;</td><td><a href="http://www.apache.org/licenses/LICENSE-2.0.txt">Apache 2</a>&nbsp;</td><td>io.spray # spray-json_2.11 # 1.3.2&nbsp;</td><td></td></tr><tr><td>Apache&nbsp;</td><td><a href="http://www.apache.org/licenses/LICENSE-2.0.txt">Apache 2</a>&nbsp;</td><td>joda-time # joda-time # 2.2&nbsp;</td><td></td></tr><tr><td>Apache&nbsp;</td><td><a href="http://www.apache.org/licenses/LICENSE-2.0.html">Apache 2.0 License</a>&nbsp;</td><td>com.typesafe.scala-logging # scala-logging_2.11 # 3.1.0&nbsp;</td><td></td></tr><tr><td>Apache&nbsp;</td><td><a href="LICENSE.txt">Apache License Version 2.0</a>&nbsp;</td><td>org.yaml # snakeyaml # 1.12&nbsp;</td><td></td></tr><tr><td>Apache&nbsp;</td><td><a href="http://www.apache.org/licenses/LICENSE-2.0">Apache License, Version 2.0</a>&nbsp;</td><td>com.typesafe # config # 1.3.0&nbsp;</td><td></td></tr><tr><td>Apache&nbsp;</td><td><a href="http://www.apache.org/licenses/LICENSE-2.0">Apache License, Version 2.0</a>&nbsp;</td><td>com.typesafe # ssl-config-akka_2.11 # 0.2.1&nbsp;</td><td></td></tr><tr><td>Apache&nbsp;</td><td><a href="http://www.apache.org/licenses/LICENSE-2.0">Apache License, Version 2.0</a>&nbsp;</td><td>com.typesafe # ssl-config-core_2.11 # 0.2.1&nbsp;</td><td></td></tr><tr><td>Apache&nbsp;</td><td><a href="http://www.apache.org/licenses/LICENSE-2.0">Apache License, Version 2.0</a>&nbsp;</td><td>com.typesafe.akka # akka-actor_2.11 # 2.4.6&nbsp;</td><td></td></tr><tr><td>Apache&nbsp;</td><td><a href="http://www.apache.org/licenses/LICENSE-2.0">Apache License, Version 2.0</a>&nbsp;</td><td>com.typesafe.akka # akka-http-core_2.11 # 2.4.6&nbsp;</td><td></td></tr><tr><td>Apache&nbsp;</td><td><a href="http://www.apache.org/licenses/LICENSE-2.0">Apache License, Version 2.0</a>&nbsp;</td><td>com.typesafe.akka # akka-http-experimental_2.11 # 2.4.6&nbsp;</td><td></td></tr><tr><td>Apache&nbsp;</td><td><a href="http://www.apache.org/licenses/LICENSE-2.0">Apache License, Version 2.0</a>&nbsp;</td><td>com.typesafe.akka # akka-http-spray-json-experimental_2.11 # 2.4.6&nbsp;</td><td></td></tr><tr><td>Apache&nbsp;</td><td><a href="http://www.apache.org/licenses/LICENSE-2.0">Apache License, Version 2.0</a>&nbsp;</td><td>com.typesafe.akka # akka-http-testkit_2.11 # 2.4.3&nbsp;</td><td></td></tr><tr><td>Apache&nbsp;</td><td><a href="http://www.apache.org/licenses/LICENSE-2.0">Apache License, Version 2.0</a>&nbsp;</td><td>com.typesafe.akka # akka-parsing_2.11 # 2.4.6&nbsp;</td><td></td></tr><tr><td>Apache&nbsp;</td><td><a href="http://www.apache.org/licenses/LICENSE-2.0">Apache License, Version 2.0</a>&nbsp;</td><td>com.typesafe.akka # akka-stream-testkit_2.11 # 2.4.3&nbsp;</td><td></td></tr><tr><td>Apache&nbsp;</td><td><a href="http://www.apache.org/licenses/LICENSE-2.0">Apache License, Version 2.0</a>&nbsp;</td><td>com.typesafe.akka # akka-stream_2.11 # 2.4.6&nbsp;</td><td></td></tr><tr><td>Apache&nbsp;</td><td><a href="http://www.apache.org/licenses/LICENSE-2.0">Apache License, Version 2.0</a>&nbsp;</td><td>com.typesafe.akka # akka-testkit_2.11 # 2.4.3&nbsp;</td><td></td></tr><tr><td>Apache&nbsp;</td><td><a href="http://www.apache.org/licenses/LICENSE-2.0.txt">The Apache Software License, Version 2.0</a>&nbsp;</td><td>com.fasterxml.jackson.core # jackson-annotations # 2.4.5&nbsp;</td><td></td></tr><tr><td>Apache&nbsp;</td><td><a href="http://www.apache.org/licenses/LICENSE-2.0.txt">The Apache Software License, Version 2.0</a>&nbsp;</td><td>com.fasterxml.jackson.core # jackson-core # 2.4.5&nbsp;</td><td></td></tr><tr><td>Apache&nbsp;</td><td><a href="http://www.apache.org/licenses/LICENSE-2.0.txt">The Apache Software License, Version 2.0</a>&nbsp;</td><td>com.fasterxml.jackson.core # jackson-databind # 2.4.5&nbsp;</td><td></td></tr><tr><td>Apache&nbsp;</td><td><a href="http://www.apache.org/licenses/LICENSE-2.0.txt">The Apache Software License, Version 2.0</a>&nbsp;</td><td>com.fasterxml.jackson.dataformat # jackson-dataformat-xml # 2.4.5&nbsp;</td><td></td></tr><tr><td>Apache&nbsp;</td><td><a href="http://www.apache.org/licenses/LICENSE-2.0.txt">The Apache Software License, Version 2.0</a>&nbsp;</td><td>com.fasterxml.jackson.dataformat # jackson-dataformat-yaml # 2.4.5&nbsp;</td><td></td></tr><tr><td>Apache&nbsp;</td><td><a href="http://www.apache.org/licenses/LICENSE-2.0.txt">The Apache Software License, Version 2.0</a>&nbsp;</td><td>com.fasterxml.jackson.datatype # jackson-datatype-joda # 2.4.5&nbsp;</td><td></td></tr><tr><td>Apache&nbsp;</td><td><a href="http://www.apache.org/licenses/LICENSE-2.0.txt">The Apache Software License, Version 2.0</a>&nbsp;</td><td>com.fasterxml.jackson.jaxrs # jackson-jaxrs-base # 2.4.5&nbsp;</td><td></td></tr><tr><td>Apache&nbsp;</td><td><a href="http://www.apache.org/licenses/LICENSE-2.0.txt">The Apache Software License, Version 2.0</a>&nbsp;</td><td>com.fasterxml.jackson.jaxrs # jackson-jaxrs-json-provider # 2.4.5&nbsp;</td><td></td></tr><tr><td>Apache&nbsp;</td><td><a href="http://www.apache.org/licenses/LICENSE-2.0.txt">The Apache Software License, Version 2.0</a>&nbsp;</td><td>com.fasterxml.jackson.module # jackson-module-jaxb-annotations # 2.4.5&nbsp;</td><td></td></tr><tr><td>Apache&nbsp;</td><td><a href="http://www.apache.org/licenses/LICENSE-2.0.txt">The Apache Software License, Version 2.0</a>&nbsp;</td><td>com.fasterxml.jackson.module # jackson-module-scala_2.11 # 2.4.2&nbsp;</td><td></td></tr><tr><td>Apache&nbsp;</td><td><a href="http://www.apache.org/licenses/LICENSE-2.0.txt">The Apache Software License, Version 2.0</a>&nbsp;</td><td>com.github.swagger-akka-http # swagger-akka-http_2.11 # 0.7.0&nbsp;</td><td></td></tr><tr><td>Apache&nbsp;</td><td><a href="http://www.apache.org/licenses/LICENSE-2.0.txt">The Apache Software License, Version 2.0</a>&nbsp;</td><td>com.google.guava # guava # 18.0&nbsp;</td><td></td></tr><tr><td>Apache&nbsp;</td><td><a href="http://www.apache.org/licenses/LICENSE-2.0.txt">The Apache Software License, Version 2.0</a>&nbsp;</td><td>com.zaxxer # HikariCP-java6 # 2.3.7&nbsp;</td><td></td></tr><tr><td>Apache&nbsp;</td><td><a href="http://www.apache.org/licenses/LICENSE-2.0.txt">The Apache Software License, Version 2.0</a>&nbsp;</td><td>io.swagger # swagger-annotations # 1.5.9&nbsp;</td><td></td></tr><tr><td>Apache&nbsp;</td><td><a href="http://www.apache.org/licenses/LICENSE-2.0.txt">The Apache Software License, Version 2.0</a>&nbsp;</td><td>io.swagger # swagger-core # 1.5.9&nbsp;</td><td></td></tr><tr><td>Apache&nbsp;</td><td><a href="http://www.apache.org/licenses/LICENSE-2.0.txt">The Apache Software License, Version 2.0</a>&nbsp;</td><td>io.swagger # swagger-jaxrs # 1.5.9&nbsp;</td><td></td></tr><tr><td>Apache&nbsp;</td><td><a href="http://www.apache.org/licenses/LICENSE-2.0.txt">The Apache Software License, Version 2.0</a>&nbsp;</td><td>io.swagger # swagger-models # 1.5.9&nbsp;</td><td></td></tr><tr><td>Apache&nbsp;</td><td><a href="http://www.apache.org/licenses/LICENSE-2.0.txt">The Apache Software License, Version 2.0</a>&nbsp;</td><td>io.swagger # swagger-scala-module_2.11 # 1.0.2&nbsp;</td><td></td></tr><tr><td>Apache&nbsp;</td><td><a href="http://www.apache.org/licenses/LICENSE-2.0.txt">The Apache Software License, Version 2.0</a>&nbsp;</td><td>javax.validation # validation-api # 1.1.0.Final&nbsp;</td><td></td></tr><tr><td>Apache&nbsp;</td><td><a href="http://www.apache.org/licenses/LICENSE-2.0.txt">The Apache Software License, Version 2.0</a>&nbsp;</td><td>org.apache.commons # commons-lang3 # 3.2.1&nbsp;</td><td></td></tr><tr><td>Apache&nbsp;</td><td><a href="http://www.apache.org/licenses/LICENSE-2.0.txt">The Apache Software License, Version 2.0</a>&nbsp;</td><td>org.javassist # javassist # 3.19.0-GA&nbsp;</td><td></td></tr><tr><td>Apache&nbsp;</td><td><a href="http://www.apache.org/licenses/LICENSE-2.0.txt">The Apache Software License, Version 2.0</a>&nbsp;</td><td>org.json4s # json4s-ast_2.11 # 3.2.11&nbsp;</td><td></td></tr><tr><td>Apache&nbsp;</td><td><a href="http://www.apache.org/licenses/LICENSE-2.0.txt">The Apache Software License, Version 2.0</a>&nbsp;</td><td>org.json4s # json4s-core_2.11 # 3.2.11&nbsp;</td><td></td></tr><tr><td>Apache&nbsp;</td><td><a href="http://www.apache.org/licenses/LICENSE-2.0.txt">The Apache Software License, Version 2.0</a>&nbsp;</td><td>org.json4s # json4s-jackson_2.11 # 3.2.11&nbsp;</td><td></td></tr><tr><td>Apache&nbsp;</td><td><a href="http://www.apache.org/licenses/LICENSE-2.0.txt">The Apache Software License, Version 2.0</a>&nbsp;</td><td>org.json4s # json4s-native_2.11 # 3.2.11&nbsp;</td><td></td></tr><tr><td>BSD&nbsp;</td><td><a href="https://github.com/data61/gnaf/blob/master/LICENSE.txt">BSD</a>&nbsp;</td><td>au.csiro.data61.gnaf # gnaf-db_2.11 # 0.8-SNAPSHOT&nbsp;</td><td></td></tr><tr><td>BSD&nbsp;</td><td><a href="https://github.com/data61/gnaf/blob/master/LICENSE.txt">BSD</a>&nbsp;</td><td>au.csiro.data61.gnaf # gnaf-util_2.11 # 0.8-SNAPSHOT&nbsp;</td><td></td></tr><tr><td>BSD&nbsp;</td><td><a href="http://www.scala-lang.org/license.html">BSD 3-Clause</a>&nbsp;</td><td>org.scala-lang # scala-compiler # 2.11.8&nbsp;</td><td></td></tr><tr><td>BSD&nbsp;</td><td><a href="http://www.scala-lang.org/license.html">BSD 3-Clause</a>&nbsp;</td><td>org.scala-lang # scala-library # 2.11.8&nbsp;</td><td></td></tr><tr><td>BSD&nbsp;</td><td><a href="http://www.scala-lang.org/license.html">BSD 3-Clause</a>&nbsp;</td><td>org.scala-lang # scala-reflect # 2.11.8&nbsp;</td><td></td></tr><tr><td>BSD&nbsp;</td><td><a href="http://www.scala-lang.org/license.html">BSD 3-Clause</a>&nbsp;</td><td>org.scala-lang # scalap # 2.11.8&nbsp;</td><td></td></tr><tr><td>BSD&nbsp;</td><td><a href="http://opensource.org/licenses/BSD-3-Clause">BSD 3-clause</a>&nbsp;</td><td>org.scala-lang.modules # scala-java8-compat_2.11 # 0.7.0&nbsp;</td><td></td></tr><tr><td>BSD&nbsp;</td><td><a href="http://opensource.org/licenses/BSD-3-Clause">BSD 3-clause</a>&nbsp;</td><td>org.scala-lang.modules # scala-parser-combinators_2.11 # 1.0.4&nbsp;</td><td></td></tr><tr><td>BSD&nbsp;</td><td><a href="http://opensource.org/licenses/BSD-3-Clause">BSD 3-clause</a>&nbsp;</td><td>org.scala-lang.modules # scala-xml_2.11 # 1.0.4&nbsp;</td><td></td></tr><tr><td>BSD&nbsp;</td><td><a href="http://www.opensource.org/licenses/bsd-license.php">BSD-Style</a>&nbsp;</td><td>com.thoughtworks.paranamer # paranamer # 2.6&nbsp;</td><td></td></tr><tr><td>BSD&nbsp;</td><td><a href="http://www.opensource.org/licenses/bsd-license.php">The BSD License</a>&nbsp;</td><td>org.codehaus.woodstox # stax2-api # 3.1.4&nbsp;</td><td></td></tr><tr><td>BSD&nbsp;</td><td><a href="http://www.opensource.org/licenses/bsd-license.html">The New BSD License</a>&nbsp;</td><td>org.reflections # reflections # 0.9.10&nbsp;</td><td></td></tr><tr><td>BSD&nbsp;</td><td><a href="http://github.com/slick/slick/blob/master/LICENSE.txt">Two-clause BSD-style license</a>&nbsp;</td><td>com.typesafe.slick # slick-codegen_2.11 # 3.1.1&nbsp;</td><td></td></tr><tr><td>BSD&nbsp;</td><td><a href="http://github.com/slick/slick/blob/master/LICENSE.txt">Two-clause BSD-style license</a>&nbsp;</td><td>com.typesafe.slick # slick-hikaricp_2.11 # 3.1.1&nbsp;</td><td></td></tr><tr><td>BSD&nbsp;</td><td><a href="http://github.com/slick/slick/blob/master/LICENSE.txt">Two-clause BSD-style license</a>&nbsp;</td><td>com.typesafe.slick # slick_2.11 # 3.1.1&nbsp;</td><td></td></tr><tr><td>CC0&nbsp;</td><td><a href="http://creativecommons.org/publicdomain/zero/1.0/">CC0</a>&nbsp;</td><td>org.reactivestreams # reactive-streams # 1.0.0&nbsp;</td><td></td></tr><tr><td>GPL with Classpath Extension&nbsp;</td><td><a href="https://glassfish.dev.java.net/nonav/public/CDDL+GPL.html">CDDL + GPLv2 with classpath exception</a>&nbsp;</td><td>javax.ws.rs # jsr311-api # 1.1.1&nbsp;</td><td></td></tr><tr><td>LGPL&nbsp;</td><td><a href="http://logback.qos.ch/license.html">EPL + GNU Lesser General Public License</a>&nbsp;</td><td>ch.qos.logback # logback-classic # 1.1.3&nbsp;</td><td></td></tr><tr><td>LGPL&nbsp;</td><td><a href="http://logback.qos.ch/license.html">EPL + GNU Lesser General Public License</a>&nbsp;</td><td>ch.qos.logback # logback-core # 1.1.3&nbsp;</td><td></td></tr><tr><td>LGPL&nbsp;</td><td><a href="http://www.gnu.org/licenses/lgpl.html">GNU Lesser General Public License</a>&nbsp;</td><td>com.google.code.findbugs # annotations # 2.0.1&nbsp;</td><td></td></tr><tr><td>LGPL&nbsp;</td><td><a href="http://www.gnu.org/licenses/lgpl.html">GNU Lesser General Public License</a>&nbsp;</td><td>com.google.code.findbugs # jsr305 # 2.0.1&nbsp;</td><td></td></tr><tr><td>MIT&nbsp;</td><td><a href="http://www.slf4j.org/license.html">MIT License</a>&nbsp;</td><td>org.slf4j # slf4j-api # 1.7.12&nbsp;</td><td></td></tr><tr><td>Mozilla&nbsp;</td><td><a href="http://h2database.com/html/license.html">MPL 2.0 or EPL 1.0</a>&nbsp;</td><td>com.h2database # h2 # 1.4.193&nbsp;</td><td></td></tr></tbody></table></body></html>


--------------------------------------------------------------------------------
/gnaf-extractor/src/main/scala/au/csiro/data61/gnaf/extractor/Extractor.scala:
--------------------------------------------------------------------------------
  1 | package au.csiro.data61.gnaf.extractor
  2 | 
  3 | import scala.concurrent.{ Await, ExecutionContext, Future }, ExecutionContext.Implicits.global
  4 | import scala.concurrent.duration.DurationInt
  5 | import scala.language.implicitConversions
  6 | import scala.util.{ Failure, Success }
  7 | 
  8 | import com.typesafe.config.{ ConfigFactory, ConfigValueFactory }
  9 | 
 10 | import au.csiro.data61.gnaf.db.GnafTables
 11 | import au.csiro.data61.gnaf.util.Gnaf._
 12 | import au.csiro.data61.gnaf.util.Gnaf.JsonProtocol._
 13 | import au.csiro.data61.gnaf.util.Util
 14 | import resource.managed
 15 | import slick.collection.heterogeneous.HNil
 16 | import slick.collection.heterogeneous.syntax.::
 17 | import spray.json.pimpAny
 18 | import scala.sys.SystemProperties
 19 | 
 20 | // Organize Imports deletes this, so make it easy to restore ...
 21 | // import slick.collection.heterogeneous.syntax.::
 22 | 
 23 | object Extractor {
 24 |   val log = Util.getLogger(getClass)
 25 |   
 26 |   val config = ConfigFactory.load
 27 |   
 28 |   object MyGnafTables extends {
 29 |     val profile = Util.getObject[slick.driver.JdbcProfile](config.getString("gnafDb.slickDriver")) // e.g. slick.driver.{H2Driver,PostgresDriver}
 30 |   } with GnafTables
 31 |   import MyGnafTables._
 32 |   import MyGnafTables.profile.api._
 33 | 
 34 |   /** result of command line option processing */
 35 |   case class CliOption(dburl: String, localityTimeout: Int, allTimeout: Int)
 36 |   val defaultCliOption = CliOption(config.getString("gnafDb.url"), config.getInt("gnafDb.localityTimeout"), config.getInt("gnafDb.allTimeout"))
 37 | 
 38 |   def main(args: Array[String]): Unit = {
 39 |     val parser = new scopt.OptionParser[CliOption]("gnaf-extractor") {
 40 |       head("gnaf-extractor", "0.x")
 41 |       note("Creates JSON from gnaf database to load into a search engine.")
 42 |       opt[String]('u', "dburl") action { (x, c) =>
 43 |         c.copy(dburl = x)
 44 |       } text (s"database URL, default ${defaultCliOption.dburl}")
 45 |       opt[Int]('l', "localityTimeout") action { (x, c) =>
 46 |         c.copy(localityTimeout = x)
 47 |       } text (s"timeout in minutes for all queries for a locality, default ${defaultCliOption.localityTimeout}")
 48 |       opt[Int]('a', "allTimeout") action { (x, c) =>
 49 |         c.copy(allTimeout = x)
 50 |       } text (s"timeout in minutes for all queries, default ${defaultCliOption.allTimeout}")
 51 |       help("help") text ("prints this usage text")
 52 |     }
 53 |     parser.parse(args, defaultCliOption) foreach run
 54 |     log.info("complete")
 55 |   }
 56 | 
 57 |   def run(c: CliOption) = {
 58 |     // configure global thread pool
 59 |     (new SystemProperties()) ++= Seq(
 60 |       ("scala.concurrent.context.minThreads", "4"), 
 61 |       ("scala.concurrent.context.numThreads", "4"), 
 62 |       ("scala.concurrent.context.maxThreads", "4")
 63 |     )
 64 |   
 65 |     val conf = config.withValue("gnafDb.url", ConfigValueFactory.fromAnyRef(c.dburl)) // CliOption.dburl overrides gnafDb.url
 66 |     for (db <- managed(Database.forConfig("gnafDb", conf))) {
 67 |       doAll(c)(db)
 68 |     }
 69 |   }
 70 | 
 71 |   val qAddressDetail = {
 72 |     def q(localityPid: Rep[String]) = for {
 73 |       ((((ad, lta), as), sl), adg) <- AddressDetail joinLeft
 74 |         LevelTypeAut on (_.levelTypeCode === _.code) joinLeft  // only 15 rows so keep in memory
 75 |         AddressSite on (_._1.addressSitePid === _.addressSitePid) joinLeft // ADDRESS_DETAIL.ADDRESS_SITE_PID is NON NULL, so no need for LEFT JOIN
 76 |         StreetLocality on (_._1._1.streetLocalityPid === _.streetLocalityPid) joinLeft
 77 |         AddressDefaultGeocode on (_._1._1._1.addressDetailPid === _.addressDetailPid)
 78 |       if (ad.localityPid === localityPid && ad.confidence > -1)
 79 |     } yield (
 80 |       ad,
 81 |       lta.map(_.name),
 82 |       as.map(_.addressSiteName),
 83 |       sl.map(sl => (sl.streetName, sl.streetTypeCode, sl.streetSuffixCode)),
 84 |       adg.map(adg => (adg.latitude, adg.longitude)))
 85 |     Compiled(q _)
 86 |   }
 87 | 
 88 |   val qLocalityAliasName = {
 89 |     def q(localityPid: Rep[String]) = for (la <- LocalityAlias if la.localityPid === localityPid) yield la.name
 90 |     Compiled(q _)
 91 |   }
 92 |   def localityVariant(localityPid: String)(implicit db: Database): Future[Seq[LocalityVariant]] =
 93 |     db.run(qLocalityAliasName(localityPid).result).map(_.map(name => LocalityVariant(name)))
 94 | 
 95 |   val qStreetLocalityAlias = {
 96 |     def q(streetLocalityPid: Rep[String]) = for (sla <- StreetLocalityAlias if sla.streetLocalityPid === streetLocalityPid) yield (sla.streetName, sla.streetTypeCode, sla.streetSuffixCode)
 97 |     Compiled(q _)
 98 |   }
 99 | 
100 |   def streetLocalityAlias(streetLocalityPid: Option[String])(implicit db: Database): Future[Seq[(String, Option[String], Option[String])]] = {
101 |     streetLocalityPid.map { pid =>
102 |       db.run(qStreetLocalityAlias(pid).result)
103 |     }.getOrElse(Future(Seq.empty))
104 |   }
105 | 
106 |   type FutStrMap = Future[Map[String, String]]
107 | 
108 |   def doAll(c: CliOption)(implicit db: Database) = {
109 |     // These code -> name mappings are all small enough to keep in memory
110 |     val stateMap: Future[Map[String, (String, String)]] = db.run((for (s <- State) yield s.statePid -> (s.stateAbbreviation, s.stateName)).result).map(_.toMap)
111 |     val flatTypeMap: FutStrMap = db.run((for (f <- FlatTypeAut) yield f.code -> f.name).result).map(_.toMap)
112 |     val streetTypeMap: FutStrMap = db.run((for (s <- StreetTypeAut) yield s.code -> s.name).result).map(_.toMap)
113 |     val streetSuffixMap: FutStrMap = db.run((for (s <- StreetSuffixAut) yield s.code -> s.name).result).map(_.toMap)
114 | 
115 |     val localities: Future[Seq[(String, String, String)]] = db.run((for (loc <- Locality if loc.localityClassCode === 'G') yield (loc.localityPid, loc.localityName, loc.statePid)).result)
116 |     val done: Future[Unit] = localities.flatMap { seq =>
117 |       log.info("got all localities")
118 |       val seqFut: Seq[Future[Unit]] = seq.map {
119 |         case (localityPid, localityName, statePid) =>
120 |           val locDone = doLocality(localityPid, localityName, statePid, stateMap, flatTypeMap, streetTypeMap, streetSuffixMap)
121 |           Await.result(locDone, c.localityTimeout.minute) // without this it runs out of memory before outputting anything!
122 |           locDone
123 |       }
124 |       Future.fold(seqFut)(())((_, _) => ())
125 |     }
126 |     Await.result(done, c.allTimeout.minute)
127 |     log info "all done"
128 |   }
129 | 
130 |   /*
131 |   When I try to stream all AddressDetail rows, I don't get any rows in a reasonable time (seems to hang but CPU is busy).
132 |   
133 |   http://stackoverflow.com/questions/24787119/how-to-set-h2-to-stream-resultset
134 |   H2 currently does not support server side cursors. However, it buffers large result sets to disk (as a separate file, or as a temporary table). The disadvantage is speed, but it should not be a memory usage problems.
135 |   
136 |   You can set the size of the when H2 will buffer to disk using set max_memory_rows. You can append that to the database URL: jdbc:h2:~/test;max_memory_rows=200000.
137 |   
138 |   A workaround is usually to use "keyset paging" as described in the presentation "Pagination Done the Right Way". That would mean running multiple queries instead of one.
139 |   
140 |   http://www.h2database.com/html/advanced.html
141 |   Before the result is returned to the application, all rows are read by the database. Server side cursors are not supported currently.
142 |   
143 |   http://www.h2database.com/javadoc/org/h2/engine/SysProperties.html?highlight=max_memory_rows&search=max_memory_rows#h2.maxMemoryRows
144 |   System property h2.maxMemoryRows (default: 40000 per GB of available RAM).
145 |   
146 |   So if we set -Xmx3G  and partition by LOCALITY_PID we should be OK:
147 |   There are 16398 LOCALITY rows and max ADDRESS_DETAILs for a LOCALITY is 95004.
148 |   SELECT LOCALITY_PID , count(*) cnt FROM ADDRESS_DETAIL group by LOCALITY_PID order by cnt desc limit 3;
149 |   
150 |   LOCALITY_PID    CNT Feb 2016    CNT Nov 2017  
151 |   VIC1634         95004           105960
152 |   NSW3749         44656            45502
153 |   QLD2772         34712            39162
154 |   
155 |   http://slick.typesafe.com/doc/3.1.1/dbio.html
156 |   Slick's Database.stream produces a `Reactive Stream` that can be consumed with a foreach that takes a callback for each row.
157 |   Since H2 is providing all the rows at once (see above):
158 |   - the callback is called for multiple rows at once
159 |   - concurrency is limited only by the number of threads
160 |   - all the other callbacks are queued on the thread pool, preventing anything else from running on this pool.
161 |   It's better to use Database.run to get all all the rows at once, allow H2 to release any resources, and to have some control over the
162 |   concurrency of processing the rows.
163 |  */
164 | 
165 |   def doLocality(
166 |     localityPid: String, localityName: String, statePid: String,
167 |     stateMap: Future[Map[String, (String, String)]], flatTypeMap: FutStrMap, streetTypeMap: FutStrMap, streetSuffixMap: FutStrMap
168 |   )(
169 |     implicit db: Database
170 |   ): Future[Unit] = {
171 |     val state = stateMap.map(_.apply(statePid))
172 |     val locVariant = localityVariant(localityPid)
173 | 
174 |     log.info(s"starting locality $localityName")
175 |     db.run(qAddressDetail(localityPid).result).flatMap { seq =>
176 |       log.info(s"got all addresses for locality $localityName")
177 | 
178 |       val seqFut: Seq[Future[Address]] = seq.map {
179 |         case (
180 |           // copied from AddressDetail.*
181 |           addressDetailPid :: dateCreated :: dateLastModified :: dateRetired :: buildingName :: lotNumberPrefix :: lotNumber :: lotNumberSuffix ::
182 |             flatTypeCode :: flatNumberPrefix :: flatNumber :: flatNumberSuffix ::
183 |             levelTypeCode :: levelNumberPrefix :: levelNumber :: levelNumberSuffix ::
184 |             numberFirstPrefix :: numberFirst :: numberFirstSuffix ::
185 |             numberLastPrefix :: numberLast :: numberLastSuffix ::
186 |             streetLocalityPid :: locationDescription :: localityPid :: aliasPrincipal :: postcode :: privateStreet :: legalParcelId :: confidence ::
187 |             addressSitePid :: levelGeocodedCode :: propertyPid :: gnafPropertyPid :: primarySecondary :: HNil,
188 |           levelTypeName,
189 |           addressSiteName,
190 |           street,
191 |           location
192 |           ) =>
193 | 
194 |           val addr: Future[Address] = for {
195 |             (stateAbbreviation, stateName) <- state
196 |             ftm <- flatTypeMap
197 |             stm <- streetTypeMap
198 |             ssm <- streetSuffixMap
199 |             locVar <- locVariant
200 |             sla <- streetLocalityAlias(streetLocalityPid)
201 |           } yield Address(
202 |             addressDetailPid, addressSiteName.flatten, buildingName,
203 |             flatTypeCode, flatTypeCode.map(ftm), PreNumSuf(flatNumberPrefix, flatNumber, flatNumberSuffix),
204 |             levelTypeCode, levelTypeName, PreNumSuf(levelNumberPrefix, levelNumber, levelNumberSuffix),
205 |             PreNumSuf(numberFirstPrefix, numberFirst, numberFirstSuffix),
206 |             PreNumSuf(numberLastPrefix, numberLast, numberLastSuffix),
207 |             street.map(s => Street(s._1, s._2, s._2.map(stm), s._3, s._3.map(ssm))),
208 |             localityName, stateAbbreviation, stateName, postcode,
209 |             aliasPrincipal, primarySecondary,
210 |             location.flatMap {
211 |               case (Some(lat), Some(lon)) => Some(Location(lat, lon))
212 |               case _                      => None
213 |             },
214 |             sla.map(s => Street(s._1, s._2, s._2.map(stm), s._3, s._3.map(ssm))),
215 |             locVar)
216 | 
217 |           addr.onComplete {
218 |             case Success(a) => println(a.toJson.compactPrint) // println appears to be synchronized
219 |             case Failure(e) => log.error(s"future address for $addressDetailPid failed", e)
220 |           }
221 | 
222 |           /*
223 |            * Trying to use small bounded thread pools I got:
224 |            * 12:50:59.843 [Pool-2-thread-2] ERROR au.com.data61.gnaf.indexer.Main. - future address for GAACT715082885 failed
225 |            * java.util.concurrent.RejectedExecutionException: Task slick.backend.DatabaseComponent$DatabaseDef$$anon$2@1dbaddc0 rejected from
226 |            * java.util.concurrent.ThreadPoolExecutor@2bc930eb[Running, pool size = 3, active threads = 3, queued tasks = 987, completed tasks = 10]
227 |            *         
228 |            * The only pool with a queue size of 987 and 3 threads is the slick pool configured in application.conf.
229 |            * I tried explicit flatMaps instead of for, with an explicit ExecutionContext, but it still used the slick pool!
230 |            */
231 |           addr
232 |       }
233 | 
234 |       val locDone = Future.fold(seqFut)(())((_, _) => ())
235 |       locDone.onComplete {
236 |         case Success(_) => log.info(s"completed locality $localityName")
237 |         case Failure(e) => log.error(s"future locality $localityName failed", e)
238 |       }
239 |       locDone
240 |     }
241 |   }
242 | 
243 | }
244 | 


--------------------------------------------------------------------------------
/gnaf-search/3rd-party-licenses.html:
--------------------------------------------------------------------------------
1 | <html><head><title>gnaf-search-licenses</title></head><body><h1>gnaf-search-licenses</h1><table border="0" cellspacing="0" cellpading="1">
2 |       <thead><tr><th>Category</th><th>License</th><th>Dependency</th><th>Notes</th></tr></thead>
3 |     <tbody><tr><td>Apache&nbsp;</td><td><a href="https://www.apache.org/licenses/LICENSE-2.0.txt">Apache 2</a>&nbsp;</td><td>ch.megard # akka-http-cors_2.11 # 0.1.2&nbsp;</td><td></td></tr><tr><td>Apache&nbsp;</td><td><a href="http://www.apache.org/licenses/LICENSE-2.0.txt">Apache 2</a>&nbsp;</td><td>io.spray # spray-json_2.11 # 1.3.2&nbsp;</td><td></td></tr><tr><td>Apache&nbsp;</td><td><a href="http://www.apache.org/licenses/LICENSE-2.0.txt">Apache 2</a>&nbsp;</td><td>joda-time # joda-time # 2.2&nbsp;</td><td></td></tr><tr><td>Apache&nbsp;</td><td><a href="http://www.apache.org/licenses/LICENSE-2.0.html">Apache 2.0 License</a>&nbsp;</td><td>com.typesafe.scala-logging # scala-logging_2.11 # 3.1.0&nbsp;</td><td></td></tr><tr><td>Apache&nbsp;</td><td><a href="LICENSE.txt">Apache License Version 2.0</a>&nbsp;</td><td>org.yaml # snakeyaml # 1.12&nbsp;</td><td></td></tr><tr><td>Apache&nbsp;</td><td><a href="http://www.apache.org/licenses/LICENSE-2.0">Apache License, Version 2.0</a>&nbsp;</td><td>com.typesafe # config # 1.3.0&nbsp;</td><td></td></tr><tr><td>Apache&nbsp;</td><td><a href="http://www.apache.org/licenses/LICENSE-2.0">Apache License, Version 2.0</a>&nbsp;</td><td>com.typesafe # ssl-config-akka_2.11 # 0.2.1&nbsp;</td><td></td></tr><tr><td>Apache&nbsp;</td><td><a href="http://www.apache.org/licenses/LICENSE-2.0">Apache License, Version 2.0</a>&nbsp;</td><td>com.typesafe # ssl-config-core_2.11 # 0.2.1&nbsp;</td><td></td></tr><tr><td>Apache&nbsp;</td><td><a href="http://www.apache.org/licenses/LICENSE-2.0">Apache License, Version 2.0</a>&nbsp;</td><td>com.typesafe.akka # akka-actor_2.11 # 2.4.6&nbsp;</td><td></td></tr><tr><td>Apache&nbsp;</td><td><a href="http://www.apache.org/licenses/LICENSE-2.0">Apache License, Version 2.0</a>&nbsp;</td><td>com.typesafe.akka # akka-http-core_2.11 # 2.4.6&nbsp;</td><td></td></tr><tr><td>Apache&nbsp;</td><td><a href="http://www.apache.org/licenses/LICENSE-2.0">Apache License, Version 2.0</a>&nbsp;</td><td>com.typesafe.akka # akka-http-experimental_2.11 # 2.4.6&nbsp;</td><td></td></tr><tr><td>Apache&nbsp;</td><td><a href="http://www.apache.org/licenses/LICENSE-2.0">Apache License, Version 2.0</a>&nbsp;</td><td>com.typesafe.akka # akka-http-spray-json-experimental_2.11 # 2.4.6&nbsp;</td><td></td></tr><tr><td>Apache&nbsp;</td><td><a href="http://www.apache.org/licenses/LICENSE-2.0">Apache License, Version 2.0</a>&nbsp;</td><td>com.typesafe.akka # akka-http-testkit_2.11 # 2.4.3&nbsp;</td><td></td></tr><tr><td>Apache&nbsp;</td><td><a href="http://www.apache.org/licenses/LICENSE-2.0">Apache License, Version 2.0</a>&nbsp;</td><td>com.typesafe.akka # akka-parsing_2.11 # 2.4.6&nbsp;</td><td></td></tr><tr><td>Apache&nbsp;</td><td><a href="http://www.apache.org/licenses/LICENSE-2.0">Apache License, Version 2.0</a>&nbsp;</td><td>com.typesafe.akka # akka-stream-testkit_2.11 # 2.4.3&nbsp;</td><td></td></tr><tr><td>Apache&nbsp;</td><td><a href="http://www.apache.org/licenses/LICENSE-2.0">Apache License, Version 2.0</a>&nbsp;</td><td>com.typesafe.akka # akka-stream_2.11 # 2.4.6&nbsp;</td><td></td></tr><tr><td>Apache&nbsp;</td><td><a href="http://www.apache.org/licenses/LICENSE-2.0">Apache License, Version 2.0</a>&nbsp;</td><td>com.typesafe.akka # akka-testkit_2.11 # 2.4.3&nbsp;</td><td></td></tr><tr><td>Apache&nbsp;</td><td><a href="http://www.apache.org/licenses/LICENSE-2.0.txt">The Apache Software License, Version 2.0</a>&nbsp;</td><td>com.fasterxml.jackson.core # jackson-annotations # 2.4.5&nbsp;</td><td></td></tr><tr><td>Apache&nbsp;</td><td><a href="http://www.apache.org/licenses/LICENSE-2.0.txt">The Apache Software License, Version 2.0</a>&nbsp;</td><td>com.fasterxml.jackson.core # jackson-core # 2.4.5&nbsp;</td><td></td></tr><tr><td>Apache&nbsp;</td><td><a href="http://www.apache.org/licenses/LICENSE-2.0.txt">The Apache Software License, Version 2.0</a>&nbsp;</td><td>com.fasterxml.jackson.core # jackson-databind # 2.4.5&nbsp;</td><td></td></tr><tr><td>Apache&nbsp;</td><td><a href="http://www.apache.org/licenses/LICENSE-2.0.txt">The Apache Software License, Version 2.0</a>&nbsp;</td><td>com.fasterxml.jackson.dataformat # jackson-dataformat-xml # 2.4.5&nbsp;</td><td></td></tr><tr><td>Apache&nbsp;</td><td><a href="http://www.apache.org/licenses/LICENSE-2.0.txt">The Apache Software License, Version 2.0</a>&nbsp;</td><td>com.fasterxml.jackson.dataformat # jackson-dataformat-yaml # 2.4.5&nbsp;</td><td></td></tr><tr><td>Apache&nbsp;</td><td><a href="http://www.apache.org/licenses/LICENSE-2.0.txt">The Apache Software License, Version 2.0</a>&nbsp;</td><td>com.fasterxml.jackson.datatype # jackson-datatype-joda # 2.4.5&nbsp;</td><td></td></tr><tr><td>Apache&nbsp;</td><td><a href="http://www.apache.org/licenses/LICENSE-2.0.txt">The Apache Software License, Version 2.0</a>&nbsp;</td><td>com.fasterxml.jackson.jaxrs # jackson-jaxrs-base # 2.4.5&nbsp;</td><td></td></tr><tr><td>Apache&nbsp;</td><td><a href="http://www.apache.org/licenses/LICENSE-2.0.txt">The Apache Software License, Version 2.0</a>&nbsp;</td><td>com.fasterxml.jackson.jaxrs # jackson-jaxrs-json-provider # 2.4.5&nbsp;</td><td></td></tr><tr><td>Apache&nbsp;</td><td><a href="http://www.apache.org/licenses/LICENSE-2.0.txt">The Apache Software License, Version 2.0</a>&nbsp;</td><td>com.fasterxml.jackson.module # jackson-module-jaxb-annotations # 2.4.5&nbsp;</td><td></td></tr><tr><td>Apache&nbsp;</td><td><a href="http://www.apache.org/licenses/LICENSE-2.0.txt">The Apache Software License, Version 2.0</a>&nbsp;</td><td>com.fasterxml.jackson.module # jackson-module-scala_2.11 # 2.4.2&nbsp;</td><td></td></tr><tr><td>Apache&nbsp;</td><td><a href="http://www.apache.org/licenses/LICENSE-2.0.txt">The Apache Software License, Version 2.0</a>&nbsp;</td><td>com.github.swagger-akka-http # swagger-akka-http_2.11 # 0.7.0&nbsp;</td><td></td></tr><tr><td>Apache&nbsp;</td><td><a href="http://www.apache.org/licenses/LICENSE-2.0.txt">The Apache Software License, Version 2.0</a>&nbsp;</td><td>com.google.guava # guava # 18.0&nbsp;</td><td></td></tr><tr><td>Apache&nbsp;</td><td><a href="http://www.apache.org/licenses/LICENSE-2.0.txt">The Apache Software License, Version 2.0</a>&nbsp;</td><td>io.swagger # swagger-annotations # 1.5.9&nbsp;</td><td></td></tr><tr><td>Apache&nbsp;</td><td><a href="http://www.apache.org/licenses/LICENSE-2.0.txt">The Apache Software License, Version 2.0</a>&nbsp;</td><td>io.swagger # swagger-core # 1.5.9&nbsp;</td><td></td></tr><tr><td>Apache&nbsp;</td><td><a href="http://www.apache.org/licenses/LICENSE-2.0.txt">The Apache Software License, Version 2.0</a>&nbsp;</td><td>io.swagger # swagger-jaxrs # 1.5.9&nbsp;</td><td></td></tr><tr><td>Apache&nbsp;</td><td><a href="http://www.apache.org/licenses/LICENSE-2.0.txt">The Apache Software License, Version 2.0</a>&nbsp;</td><td>io.swagger # swagger-models # 1.5.9&nbsp;</td><td></td></tr><tr><td>Apache&nbsp;</td><td><a href="http://www.apache.org/licenses/LICENSE-2.0.txt">The Apache Software License, Version 2.0</a>&nbsp;</td><td>io.swagger # swagger-scala-module_2.11 # 1.0.2&nbsp;</td><td></td></tr><tr><td>Apache&nbsp;</td><td><a href="http://www.apache.org/licenses/LICENSE-2.0.txt">The Apache Software License, Version 2.0</a>&nbsp;</td><td>javax.validation # validation-api # 1.1.0.Final&nbsp;</td><td></td></tr><tr><td>Apache&nbsp;</td><td><a href="http://www.apache.org/licenses/LICENSE-2.0.txt">The Apache Software License, Version 2.0</a>&nbsp;</td><td>org.apache.commons # commons-lang3 # 3.2.1&nbsp;</td><td></td></tr><tr><td>Apache&nbsp;</td><td><a href="http://www.apache.org/licenses/LICENSE-2.0.txt">The Apache Software License, Version 2.0</a>&nbsp;</td><td>org.apache.lucene # lucene-analyzers-common # 6.2.1&nbsp;</td><td></td></tr><tr><td>Apache&nbsp;</td><td><a href="http://www.apache.org/licenses/LICENSE-2.0.txt">The Apache Software License, Version 2.0</a>&nbsp;</td><td>org.apache.lucene # lucene-core # 6.2.1&nbsp;</td><td></td></tr><tr><td>Apache&nbsp;</td><td><a href="http://www.apache.org/licenses/LICENSE-2.0.txt">The Apache Software License, Version 2.0</a>&nbsp;</td><td>org.javassist # javassist # 3.18.2-GA&nbsp;</td><td></td></tr><tr><td>Apache&nbsp;</td><td><a href="http://www.apache.org/licenses/LICENSE-2.0.txt">The Apache Software License, Version 2.0</a>&nbsp;</td><td>org.json4s # json4s-ast_2.11 # 3.2.11&nbsp;</td><td></td></tr><tr><td>Apache&nbsp;</td><td><a href="http://www.apache.org/licenses/LICENSE-2.0.txt">The Apache Software License, Version 2.0</a>&nbsp;</td><td>org.json4s # json4s-core_2.11 # 3.2.11&nbsp;</td><td></td></tr><tr><td>Apache&nbsp;</td><td><a href="http://www.apache.org/licenses/LICENSE-2.0.txt">The Apache Software License, Version 2.0</a>&nbsp;</td><td>org.json4s # json4s-jackson_2.11 # 3.2.11&nbsp;</td><td></td></tr><tr><td>Apache&nbsp;</td><td><a href="http://www.apache.org/licenses/LICENSE-2.0.txt">The Apache Software License, Version 2.0</a>&nbsp;</td><td>org.json4s # json4s-native_2.11 # 3.2.11&nbsp;</td><td></td></tr><tr><td>Apache&nbsp;</td><td><a href="http://www.apache.org/licenses/LICENSE-2.0">the Apache License, ASL Version 2.0</a>&nbsp;</td><td>org.scalactic # scalactic_2.11 # 3.0.0&nbsp;</td><td></td></tr><tr><td>Apache&nbsp;</td><td><a href="http://www.apache.org/licenses/LICENSE-2.0">the Apache License, ASL Version 2.0</a>&nbsp;</td><td>org.scalatest # scalatest_2.11 # 3.0.0&nbsp;</td><td></td></tr><tr><td>BSD&nbsp;</td><td><a href="https://github.com/data61/gnaf/blob/master/LICENSE.txt">BSD</a>&nbsp;</td><td>au.csiro.data61.gnaf # gnaf-lucene_2.11 # 0.8-SNAPSHOT&nbsp;</td><td></td></tr><tr><td>BSD&nbsp;</td><td><a href="https://github.com/data61/gnaf/blob/master/LICENSE.txt">BSD</a>&nbsp;</td><td>au.csiro.data61.gnaf # gnaf-util_2.11 # 0.8-SNAPSHOT&nbsp;</td><td></td></tr><tr><td>BSD&nbsp;</td><td><a href="http://www.scala-lang.org/license.html">BSD 3-Clause</a>&nbsp;</td><td>org.scala-lang # scala-compiler # 2.11.8&nbsp;</td><td></td></tr><tr><td>BSD&nbsp;</td><td><a href="http://www.scala-lang.org/license.html">BSD 3-Clause</a>&nbsp;</td><td>org.scala-lang # scala-library # 2.11.8&nbsp;</td><td></td></tr><tr><td>BSD&nbsp;</td><td><a href="http://www.scala-lang.org/license.html">BSD 3-Clause</a>&nbsp;</td><td>org.scala-lang # scala-reflect # 2.11.8&nbsp;</td><td></td></tr><tr><td>BSD&nbsp;</td><td><a href="http://www.scala-lang.org/license.html">BSD 3-Clause</a>&nbsp;</td><td>org.scala-lang # scalap # 2.11.8&nbsp;</td><td></td></tr><tr><td>BSD&nbsp;</td><td><a href="http://opensource.org/licenses/BSD-3-Clause">BSD 3-clause</a>&nbsp;</td><td>org.scala-lang.modules # scala-java8-compat_2.11 # 0.7.0&nbsp;</td><td></td></tr><tr><td>BSD&nbsp;</td><td><a href="http://opensource.org/licenses/BSD-3-Clause">BSD 3-clause</a>&nbsp;</td><td>org.scala-lang.modules # scala-parser-combinators_2.11 # 1.0.4&nbsp;</td><td></td></tr><tr><td>BSD&nbsp;</td><td><a href="http://opensource.org/licenses/BSD-3-Clause">BSD 3-clause</a>&nbsp;</td><td>org.scala-lang.modules # scala-xml_2.11 # 1.0.4&nbsp;</td><td></td></tr><tr><td>BSD&nbsp;</td><td><a href="http://opensource.org/licenses/BSD-3-Clause">BSD 3-clause</a>&nbsp;</td><td>org.scala-lang.modules # scala-xml_2.11 # 1.0.5&nbsp;</td><td></td></tr><tr><td>BSD&nbsp;</td><td><a href="http://www.opensource.org/licenses/bsd-license.php">BSD-Style</a>&nbsp;</td><td>com.jsuereth # scala-arm_2.11 # 2.0.0-M1&nbsp;</td><td></td></tr><tr><td>BSD&nbsp;</td><td><a href="http://www.opensource.org/licenses/bsd-license.php">BSD-Style</a>&nbsp;</td><td>com.thoughtworks.paranamer # paranamer # 2.6&nbsp;</td><td></td></tr><tr><td>BSD&nbsp;</td><td><a href="http://www.opensource.org/licenses/bsd-license.php">The BSD License</a>&nbsp;</td><td>org.codehaus.woodstox # stax2-api # 3.1.4&nbsp;</td><td></td></tr><tr><td>BSD&nbsp;</td><td><a href="http://www.opensource.org/licenses/bsd-license.html">The New BSD License</a>&nbsp;</td><td>org.reflections # reflections # 0.9.10&nbsp;</td><td></td></tr><tr><td>CC0&nbsp;</td><td><a href="http://creativecommons.org/publicdomain/zero/1.0/">CC0</a>&nbsp;</td><td>org.reactivestreams # reactive-streams # 1.0.0&nbsp;</td><td></td></tr><tr><td>GPL with Classpath Extension&nbsp;</td><td><a href="https://glassfish.dev.java.net/nonav/public/CDDL+GPL.html">CDDL + GPLv2 with classpath exception</a>&nbsp;</td><td>javax.ws.rs # jsr311-api # 1.1.1&nbsp;</td><td></td></tr><tr><td>LGPL&nbsp;</td><td><a href="http://logback.qos.ch/license.html">EPL + GNU Lesser General Public License</a>&nbsp;</td><td>ch.qos.logback # logback-classic # 1.1.3&nbsp;</td><td></td></tr><tr><td>LGPL&nbsp;</td><td><a href="http://logback.qos.ch/license.html">EPL + GNU Lesser General Public License</a>&nbsp;</td><td>ch.qos.logback # logback-core # 1.1.3&nbsp;</td><td></td></tr><tr><td>LGPL&nbsp;</td><td><a href="http://www.gnu.org/licenses/lgpl.html">GNU Lesser General Public License</a>&nbsp;</td><td>com.google.code.findbugs # annotations # 2.0.1&nbsp;</td><td></td></tr><tr><td>LGPL&nbsp;</td><td><a href="http://www.gnu.org/licenses/lgpl.html">GNU Lesser General Public License</a>&nbsp;</td><td>com.google.code.findbugs # jsr305 # 2.0.1&nbsp;</td><td></td></tr><tr><td>MIT&nbsp;</td><td><a href="http://www.opensource.org/licenses/mit-license.php">MIT License</a>&nbsp;</td><td>com.github.scopt # scopt_2.11 # 3.3.0&nbsp;</td><td></td></tr><tr><td>MIT&nbsp;</td><td><a href="http://www.slf4j.org/license.html">MIT License</a>&nbsp;</td><td>org.slf4j # slf4j-api # 1.7.12&nbsp;</td><td></td></tr></tbody></table></body></html>


--------------------------------------------------------------------------------