├── project
│   ├── build.properties
│   └── plugins.sbt
├── version.sbt
├── .dockerignore
├── gnaf-ui
│   ├── html
│   │   ├── loading.gif
│   │   ├── index.html
│   │   └── index.css
│   └── README.md
├── gnaf-util
│   ├── README.md
│   ├── build.sbt
│   ├── src
│   │   └── main
│   │       └── scala
│   │           └── au
│   │               └── csiro
│   │                   └── data61
│   │                       └── gnaf
│   │                           └── util
│   │                               ├── Util.scala
│   │                               ├── Timer.scala
│   │                               └── Gnaf.scala
│   └── 3rd-party-licenses.html
├── gnaf-indexer
│   ├── build.sbt
│   ├── src
│   │   ├── test
│   │   │   └── resources
│   │   │       └── logback-test.xml
│   │   └── main
│   │       ├── resources
│   │       │   └── logback.xml
│   │       └── scala
│   │           └── au
│   │               └── csiro
│   │                   └── data61
│   │                       └── gnaf
│   │                           └── indexer
│   │                               └── Indexer.scala
│   ├── README.md
│   └── 3rd-party-licenses.html
├── gnaf-extractor
│   ├── build.sbt
│   ├── src
│   │   └── main
│   │       ├── resources
│   │       │   ├── logback.xml
│   │       │   └── application.conf
│   │       ├── script
│   │       │   ├── loadElasticsearch.sh
│   │       │   ├── loadElasticsearch.js
│   │       │   └── gnafMapping.json
│   │       └── scala
│   │           └── au
│   │               └── csiro
│   │                   └── data61
│   │                       └── gnaf
│   │                           └── extractor
│   │                               └── Extractor.scala
│   ├── README.md
│   └── 3rd-party-licenses.html
├── gnaf-test
│   ├── build.sbt
│   ├── src
│   │   ├── main
│   │   │   ├── resources
│   │   │   │   ├── logback.xml
│   │   │   │   └── application.conf
│   │   │   └── script
│   │   │       ├── diff.js
│   │   │       ├── summary.js
│   │   │       ├── run.sh
│   │   │       ├── Maps.js
│   │   │       ├── searchLucene.js
│   │   │       └── searchEs.js
│   │   └── test
│   │       ├── resources
│   │       │   └── logback-test.xml
│   │       └── scala
│   │           └── au
│   │               └── csiro
│   │                   └── data61
│   │                       └── gnaf
│   │                           └── test
│   │                               └── MainTest.scala
│   ├── package.json
│   ├── README.md
│   └── 3rd-party-licenses.html
├── .gitignore
├── gnaf-db
│   ├── src
│   │   └── main
│   │       └── script
│   │           ├── constraint.sed
│   │           └── createGnafDb.sh
│   ├── build.sbt
│   └── 3rd-party-licenses.html
├── gnaf-lucene
│   ├── build.sbt
│   ├── src
│   │   ├── test
│   │   │   ├── resources
│   │   │   │   └── logback-test.xml
│   │   │   └── scala
│   │   │       └── au
│   │   │           └── csiro
│   │   │               └── data61
│   │   │                   └── gnaf
│   │   │                       └── lucene
│   │   │                           └── GnafLuceneTest.scala
│   │   └── main
│   │       └── scala
│   │           └── au
│   │               └── csiro
│   │                   └── data61
│   │                       └── gnaf
│   │                           └── lucene
│   │                               ├── LuceneUtil.scala
│   │                               └── GnafLucene.scala
│   ├── 3rd-party-licenses.html
│   └── README.md
├── gnaf-contrib
│   ├── src
│   │   ├── test
│   │   │   └── resources
│   │   │       └── logback-test.xml
│   │   └── main
│   │       ├── resources
│   │       │   ├── logback.xml
│   │       │   └── application.conf
│   │       └── scala
│   │           └── au
│   │               └── csiro
│   │                   └── data61
│   │                       └── gnaf
│   │                           └── contrib
│   │                               ├── db
│   │                               │   └── ContribTables.scala
│   │                               └── service
│   │                                   └── ContribService.scala
│   ├── build.sbt
│   ├── README.md
│   └── 3rd-party-licenses.html
├── gnaf-search
│   ├── src
│   │   ├── test
│   │   │   └── resources
│   │   │       └── logback-test.xml
│   │   └── main
│   │       ├── resources
│   │       │   ├── logback.xml
│   │       │   └── application.conf
│   │       └── scala
│   │           └── au
│   │               └── csiro
│   │                   └── data61
│   │                       └── gnaf
│   │                           └── search
│   │                               └── Search.scala
│   ├── README.md
│   ├── build.sbt
│   └── 3rd-party-licenses.html
├── gnaf-db-service
│   ├── src
│   │   ├── test
│   │   │   └── resources
│   │   │       └── logback-test.xml
│   │   └── main
│   │       ├── resources
│   │       │   ├── logback.xml
│   │       │   └── application.conf
│   │       └── scala
│   │           └── au
│   │               └── csiro
│   │                   └── data61
│   │                       └── gnaf
│   │                           └── db
│   │                               └── service
│   │                                   └── DbService.scala
│   ├── build.sbt
│   ├── README.md
│   └── 3rd-party-licenses.html
├── src
│   └── main
│       └── script
│           ├── checkupdates.sh
│           └── run.sh
├── Dockerfile
├── LICENSE.txt
├── README.md
└── template.yaml
/project/build.properties:
--------------------------------------------------------------------------------
1 | sbt.version=0.13.12
2 | 
--------------------------------------------------------------------------------
/version.sbt:
--------------------------------------------------------------------------------
1 | version in 
ThisBuild := "1.1-SNAPSHOT" 2 | -------------------------------------------------------------------------------- /.dockerignore: -------------------------------------------------------------------------------- 1 | indexDir/ 2 | gnaf-db/data/ 3 | addresses.gz 4 | -------------------------------------------------------------------------------- /gnaf-ui/html/loading.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/data61/gnaf/HEAD/gnaf-ui/html/loading.gif -------------------------------------------------------------------------------- /gnaf-util/README.md: -------------------------------------------------------------------------------- 1 | # gnaf-util 2 | 3 | ## Introduction 4 | 5 | This project produces a library of common code used by the other gnaf sub-projects. 6 | 7 | -------------------------------------------------------------------------------- /gnaf-indexer/build.sbt: -------------------------------------------------------------------------------- 1 | name := "gnaf-indexer" 2 | 3 | libraryDependencies ++= Seq( 4 | "com.github.scopt" %% "scopt" % "3.3.0", 5 | "com.jsuereth" %% "scala-arm" % "2.0.0-M1" 6 | ) 7 | -------------------------------------------------------------------------------- /gnaf-extractor/build.sbt: -------------------------------------------------------------------------------- 1 | name := "gnaf-extractor" 2 | 3 | libraryDependencies ++= Seq( 4 | "com.github.scopt" %% "scopt" % "3.3.0", 5 | "com.jsuereth" %% "scala-arm" % "2.0.0-M1" 6 | ) 7 | -------------------------------------------------------------------------------- /gnaf-test/build.sbt: -------------------------------------------------------------------------------- 1 | name := "gnaf-test" 2 | 3 | libraryDependencies ++= Seq( 4 | "com.github.scopt" %% "scopt" % "3.3.0", 5 | "com.jsuereth" %% "scala-arm" % "2.0.0-M1", 6 | "org.scalatest" %% "scalatest" % "3.0.0" % "test" 7 | ) 8 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | *.log 2 | *~ 3 | 4 | # sbt specific 5 | target/ 6 | /project/project/ 7 | /project/target/ 8 | .cache-main 9 | .cache-tests 10 | 11 | # Scala-IDE specific 12 | bin/ 13 | test-bin/ 14 | .classpath 15 | .project 16 | .settings/ 17 | .worksheet 18 | 19 | 20 | -------------------------------------------------------------------------------- /gnaf-db/src/main/script/constraint.sed: -------------------------------------------------------------------------------- 1 | # --regexp-extended 2 | /ALTER TABLE/ { 3 | h 4 | d 5 | } 6 | /CONSTRAINT/ { 7 | H 8 | s~ *CONSTRAINT ([A-Z0-9_)]+) .*~SELECT 'Adding constraint \1 ...' 
AS Progress, CURRENT_TIME() AS Time;\n~p 9 | g 10 | } 11 | 12 | 13 | -------------------------------------------------------------------------------- /gnaf-lucene/build.sbt: -------------------------------------------------------------------------------- 1 | name := "gnaf-lucene" 2 | 3 | libraryDependencies ++= Seq( 4 | "com.github.scopt" %% "scopt" % "3.3.0", 5 | "com.jsuereth" %% "scala-arm" % "2.0.0-M1", 6 | "org.scalatest" %% "scalatest" % "3.0.0" % "test" 7 | ) 8 | 9 | libraryDependencies ++= Seq( 10 | "lucene-core", 11 | "lucene-analyzers-common" 12 | ) map ("org.apache.lucene" % _ % "6.2.1") 13 | -------------------------------------------------------------------------------- /gnaf-db/build.sbt: -------------------------------------------------------------------------------- 1 | name := "gnaf-db" 2 | 3 | libraryDependencies ++= Seq( 4 | "com.h2database" % "h2" % "1.4.193" // or postgres or whatever, % "runtime" should be enough, but sbt slick.codegen needs it on compile classpath 5 | ) 6 | 7 | libraryDependencies ++= Seq( 8 | "slick-codegen", // only needed when generating slick mapping 9 | "slick", 10 | "slick-hikaricp" 11 | ) map ("com.typesafe.slick" %% _ % "3.1.1") 12 | -------------------------------------------------------------------------------- /gnaf-util/build.sbt: -------------------------------------------------------------------------------- 1 | name := "gnaf-util" 2 | 3 | libraryDependencies ++= Seq( 4 | "io.spray" %% "spray-json" % "1.3.2", 5 | "com.typesafe.scala-logging" %% "scala-logging" % "3.1.0", 6 | "org.slf4j" % "slf4j-api" % "1.7.12", 7 | "ch.qos.logback" % "logback-classic" % "1.1.3" 8 | // "org.scala-lang" % "scala-reflect" % "2.11.8", // Multiple dependencies with the same organization/name but different versions. To avoid conflict, pick one version 9 | // "org.scala-lang.modules" %% "scala-xml" % "1.0.4" // as above 10 | ) 11 | -------------------------------------------------------------------------------- /gnaf-contrib/src/test/resources/logback-test.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | %d{HH:mm:ss.SSS} [%thread] %-5level %logger{36} - %msg%n 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | -------------------------------------------------------------------------------- /gnaf-search/src/test/resources/logback-test.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | %d{HH:mm:ss.SSS} [%thread] %-5level %logger{36} - %msg%n 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | -------------------------------------------------------------------------------- /gnaf-test/src/main/resources/logback.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | gnaf-test.log 6 | 7 | %d{HH:mm:ss.SSS} [%thread] %-5level %logger{36} - %msg%n 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | -------------------------------------------------------------------------------- /gnaf-db-service/src/test/resources/logback-test.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | %d{HH:mm:ss.SSS} [%thread] %-5level %logger{36} - %msg%n 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | -------------------------------------------------------------------------------- /gnaf-test/src/test/resources/logback-test.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | %d{HH:mm:ss.SSS} [%thread] %-5level 
%logger{36} - %msg%n 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | -------------------------------------------------------------------------------- /gnaf-db-service/build.sbt: -------------------------------------------------------------------------------- 1 | name := "gnaf-db-service" 2 | 3 | libraryDependencies ++= Seq( 4 | "ch.megard" %% "akka-http-cors" % "0.1.2", 5 | "com.github.swagger-akka-http" %% "swagger-akka-http" % "0.7.0", // adding swagger brings in all the horrible old javax.ws & Jackson dependencies! 6 | "io.swagger" % "swagger-annotations" % "1.5.9" 7 | ) 8 | 9 | libraryDependencies ++= Seq( 10 | "akka-actor", 11 | "akka-stream", 12 | "akka-http-experimental", 13 | "akka-http-spray-json-experimental", 14 | "akka-http-testkit" 15 | ) map ("com.typesafe.akka" %% _ % "2.4.3") 16 | -------------------------------------------------------------------------------- /gnaf-indexer/src/test/resources/logback-test.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | %d{HH:mm:ss.SSS} [%thread] %-5level %logger{36} - %msg%n 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | -------------------------------------------------------------------------------- /gnaf-lucene/src/test/resources/logback-test.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | %d{HH:mm:ss.SSS} [%thread] %-5level %logger{36} - %msg%n 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | -------------------------------------------------------------------------------- /gnaf-contrib/src/main/resources/logback.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | %d{HH:mm:ss.SSS} [%thread] %-5level %logger{36} - %msg%n 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | -------------------------------------------------------------------------------- /gnaf-indexer/src/main/resources/logback.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | %d{HH:mm:ss.SSS} [%thread] %-5level %logger{36} - %msg%n 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | -------------------------------------------------------------------------------- /gnaf-search/src/main/resources/logback.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | %d{HH:mm:ss.SSS} [%thread] %-5level %logger{36} - %msg%n 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | -------------------------------------------------------------------------------- /gnaf-db-service/README.md: -------------------------------------------------------------------------------- 1 | # gnaf-db-service 2 | 3 | ## Introduction 4 | This project provides a [Scala](http://scala-lang.org/) [RESTful](https://en.wikipedia.org/wiki/Representational_state_transfer) JSON 5 | web service providing access to the G-NAF database. 6 | 7 | This is a stand-alone webapp and does not run in a servlet container. 8 | 9 | ## Configuration 10 | 11 | Configuration is in [application.conf](src/main/resources/application.conf) and most settings can be overridden with environment variables. 12 | 13 | ## Running and Usage 14 | 15 | See `gnaf/src/main/script/run.sh`. 
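The `${?VAR}` entries in `application.conf` are Typesafe Config optional substitutions: the environment variable wins only when it is set, otherwise the value defined above it is kept. A minimal sketch of how the `gnafDb` block resolves — the `gnafDb`, `url` and `user` keys are from this project's config, but the `ConfigDemo` object itself is illustrative only, not part of this project:

    import com.typesafe.config.ConfigFactory

    object ConfigDemo extends App {
      val cfg = ConfigFactory.load().getConfig("gnafDb")
      // "url" is the value from application.conf unless the GNAF_JDBC_URL
      // environment variable is set, in which case `url = ${?GNAF_JDBC_URL}` wins.
      println(s"url  = ${cfg.getString("url")}")
      println(s"user = ${cfg.getString("user")}")
    }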
16 | -------------------------------------------------------------------------------- /gnaf-extractor/src/main/resources/logback.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | gnaf-extractor.log 6 | 7 | %d{HH:mm:ss.SSS} [%thread] %-5level %logger{36} - %msg%n 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | -------------------------------------------------------------------------------- /gnaf-db-service/src/main/resources/logback.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | %d{HH:mm:ss.SSS} [%thread] %-5level %logger{36} - %msg%n 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | -------------------------------------------------------------------------------- /gnaf-search/README.md: -------------------------------------------------------------------------------- 1 | # gnaf-search 2 | 3 | ## Introduction 4 | 5 | This project provides a JSON web service to search the [Lucene](https://lucene.apache.org/) index created by `gnaf-indexer`. 6 | Users should note the [suggested preprocessing](../gnaf-lucene/README.md#suggested-preprocessing-for-client-applications) for 7 | query strings. 8 | 9 | ## Configuration 10 | 11 | Configuration is in [application.conf](src/main/resources/application.conf) and most settings can be overridden with environment variables. 12 | Command line options take precedence over the above, use `--help` for details. 13 | 14 | ## Running and Usage 15 | 16 | See `gnaf/src/main/script/run.sh`. 17 | -------------------------------------------------------------------------------- /gnaf-test/package.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "gnaf-test", 3 | "version": "1.0.0", 4 | "description": "## Introduction", 5 | "main": "src/main/script/searchLucene.js", 6 | "dependencies": { 7 | "fs": "^0.0.2", 8 | "request": "^2.74.0" 9 | }, 10 | "devDependencies": {}, 11 | "scripts": { 12 | "test": "echo \"Error: no test specified\" && exit 1" 13 | }, 14 | "repository": { 15 | "type": "git", 16 | "url": "git+https://github.com/data61/gnaf.git" 17 | }, 18 | "author": "", 19 | "license": "BSD-3-Clause", 20 | "bugs": { 21 | "url": "https://github.com/data61/gnaf/issues" 22 | }, 23 | "homepage": "https://github.com/data61/gnaf#readme" 24 | } 25 | -------------------------------------------------------------------------------- /gnaf-search/src/main/resources/application.conf: -------------------------------------------------------------------------------- 1 | akka { 2 | loglevel = INFO 3 | } 4 | 5 | gnafSearch { 6 | 7 | indexDir = "./indexDir" 8 | 9 | // validation limits 10 | bulk = 50 11 | numHits = 1000 12 | fuzzyMinLength = 2 13 | fuzzyMaxEdits = 2 14 | fuzzyPrefixLength = 0 15 | 16 | interface = "0.0.0.0" 17 | port = 9040 18 | 19 | indexDir = ${?GNAF_SEARCH_INDEX_DIR} 20 | 21 | numHits = ${?GNAF_SEARCH_NUM_HITS} 22 | fuzzyMinLength = ${?GNAF_SEARCH_FUZZY_MIN_LENGTH} 23 | fuzzyMaxEdits = ${?GNAF_SEARCH_FUZZY_MAX_EDITS} 24 | fuzzyPrefixLength = ${?GNAF_SEARCH_FUZZY_PREFIX_LENGTH} 25 | 26 | interface = ${?GNAF_SEARCH_INTERFACE} 27 | port = ${?GNAF_SEARCH_PORT} 28 | } 29 | 30 | -------------------------------------------------------------------------------- /gnaf-indexer/README.md: -------------------------------------------------------------------------------- 1 | # gnaf-indexer 2 | 3 | ## Introduction 4 | 5 | This project loads JSON address data from `gnaf-extractor` into a 
[Lucene](https://lucene.apache.org/) index.
6 | Originally Elasticsearch was used, but it was found that significant tweaks to scoring were required for good results,
7 | and these were most easily achieved in raw Lucene (which also provided significant speed improvements).
8 | 
9 | ## Configuration
10 | 
11 | Configuration is in [application.conf](src/main/resources/application.conf) and most settings can be overridden with environment variables.
12 | The index directory can also be set with a command-line option (overriding the above; use `--help` for details).
13 | 
14 | ## Running and Usage
15 | 
16 | See `gnaf/src/main/script/run.sh`.
17 | 
--------------------------------------------------------------------------------
/gnaf-ui/html/index.html:
--------------------------------------------------------------------------------
[HTML markup stripped by extraction; only the page title and heading "G-NAF" are recoverable]
13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | -------------------------------------------------------------------------------- /project/plugins.sbt: -------------------------------------------------------------------------------- 1 | // I got it saying: Source code has generated in /home/neil/sw/gnaf/target/scala-2.11/src_managed/main/au/com/data61/gnaf/db/Tables.scala 2 | // but this file was not actually created, so I'm giving up on this plugin for now. 3 | // addSbtPlugin("com.github.tototoshi" % "sbt-slick-codegen" % "1.2.0") 4 | 5 | // required by above 6 | // libraryDependencies += "com.h2database" % "h2" % "1.4.191" 7 | 8 | addSbtPlugin("io.spray" % "sbt-revolver" % "0.8.0") 9 | 10 | addSbtPlugin("com.typesafe.sbteclipse" % "sbteclipse-plugin" % "4.0.0") 11 | 12 | addSbtPlugin("org.scala-sbt.plugins" % "sbt-onejar" % "0.8") 13 | 14 | addSbtPlugin("net.virtual-void" % "sbt-dependency-graph" % "0.8.2") 15 | 16 | addSbtPlugin("com.typesafe.sbt" % "sbt-license-report" % "1.1.0") 17 | 18 | addSbtPlugin("com.github.gseitz" % "sbt-release" % "1.0.3") 19 | -------------------------------------------------------------------------------- /gnaf-test/src/test/scala/au/csiro/data61/gnaf/test/MainTest.scala: -------------------------------------------------------------------------------- 1 | package au.csiro.data61.gnaf.test 2 | 3 | import org.scalatest.FlatSpec 4 | import org.scalatest.Matchers 5 | import Main._ 6 | 7 | class MainTest extends FlatSpec with Matchers { 8 | 9 | val s = "some test string" 10 | 11 | val typo = "\\S{2}~".r 12 | 13 | "mkTypo" should "make one random typo and not in the first two chars of a word" in { 14 | val s = Seq(Some("the quick brown fox"), None, Some("jumped over the lazy"), Some("fence")) 15 | (0 to 100).foreach { _ => 16 | (s zip mkTypo(s)).count { case (a, b) => 17 | val notEq = a != b 18 | if (notEq) { 19 | log.debug(b.toString) 20 | b.isDefined should be (true) 21 | typo.findFirstIn(b.get).isDefined should be (true) 22 | } 23 | notEq 24 | } should be(1) 25 | } 26 | } 27 | 28 | } -------------------------------------------------------------------------------- /gnaf-contrib/build.sbt: -------------------------------------------------------------------------------- 1 | name := "gnaf-contrib" 2 | 3 | libraryDependencies ++= Seq( 4 | "com.h2database" % "h2" % "1.4.193", // or postgres or whatever, % "runtime" should be enough, but sbt slick.codegen needs it on compile classpath 5 | "ch.megard" %% "akka-http-cors" % "0.1.2", 6 | "com.github.swagger-akka-http" %% "swagger-akka-http" % "0.7.0", // adding swagger brings in all the horrible old javax.ws & Jackson dependencies! 7 | "io.swagger" % "swagger-annotations" % "1.5.9" 8 | ) 9 | 10 | libraryDependencies ++= Seq( 11 | "slick-codegen", // only needed when generating slick mapping 12 | "slick", 13 | "slick-hikaricp" 14 | ) map ("com.typesafe.slick" %% _ % "3.1.1") 15 | 16 | libraryDependencies ++= Seq( 17 | "akka-actor", 18 | "akka-stream", 19 | "akka-http-experimental", 20 | "akka-http-spray-json-experimental", 21 | "akka-http-testkit" 22 | ) map ("com.typesafe.akka" %% _ % "2.4.3") 23 | 24 | -------------------------------------------------------------------------------- /src/main/script/checkupdates.sh: -------------------------------------------------------------------------------- 1 | #! /bin/bash 2 | 3 | # This script simply checks the current data and determines whether there are updates that need to be applied to 4 | # our production environment. 
5 | 6 | jsonUrl=http://www.data.gov.au/api/3/action/package_show?id=19432f89-dc3a-4ef3-b943-5326ef1dbecc 7 | prodUrl=http://gnaf.nationalmap.nicta.com.au/v2/version 8 | 9 | last_modified=$( curl -sL $jsonUrl | jq -r '.result.resources[] | select(.format == "ZIP") | .last_modified' ) 10 | 11 | existing_last_modified=$(curl -sL $prodUrl | jq -r '.["gnaf-version"]' || echo None_Found) 12 | 13 | echo "Last modified date in production: $existing_last_modified"; 14 | echo "Last modified date from data.gov.au: $last_modified"; 15 | 16 | if [[ "$last_modified" != "$existing_last_modified" ]]; then 17 | echo "New data found!"; 18 | exit 0 19 | else 20 | echo "No new data found, exiting with exit code 1"; 21 | exit 1; 22 | fi 23 | -------------------------------------------------------------------------------- /gnaf-ui/README.md: -------------------------------------------------------------------------------- 1 | # gnaf-ui 2 | 3 | ## Introduction 4 | This project consists of static files providing a demonstration web user interface using Elasticsearch and the gnaf-service. 5 | It uses ECMAScript 6 and so only runs in some modern browsers (Chrome, Firefox, Edge, not yet Safari). 6 | 7 | ## Configuration 8 | 9 | The function `initBaseUrl` in `index.js` determines the URLs used to access the servers depending on the protocol used to serve the webapp. 10 | If the `file:` protocol is used (`index.html` was opened as a file rather than from a web server) then then `http://localhost is used to access the servers. 11 | Otherwise the protocol and host used to serve the webapp is used. 12 | 13 | ## Running and Usage 14 | 15 | Cors access to servers isn't working from a `file:` URL. 16 | 17 | To use python's simple web server to serve the UI over HTTP, run from the html directory: `python3 -m http.server`. Access the UI at: http://localhost:8000/. 18 | -------------------------------------------------------------------------------- /gnaf-util/src/main/scala/au/csiro/data61/gnaf/util/Util.scala: -------------------------------------------------------------------------------- 1 | package au.csiro.data61.gnaf.util 2 | 3 | import com.typesafe.scalalogging.Logger 4 | import org.slf4j.LoggerFactory 5 | 6 | object Util { 7 | def loader = getClass.getClassLoader // or Thread.currentThread.getContextClassLoader 8 | 9 | /** Get a Scala singleton Object. 10 | * @param fqn object's fully qualified name 11 | * @return object as type T 12 | */ 13 | def getObject[T](fqn: String): T = { 14 | val m = scala.reflect.runtime.universe.runtimeMirror(loader) 15 | m.reflectModule(m.staticModule(fqn)).instance.asInstanceOf[T] 16 | } 17 | 18 | /** 19 | * It appears that configuring a logger name containing a '$' in logback.xml doesn't work, so convert Scala object names ending in '$' to use '.' instead. 
20 | */ 21 | def logName(c: Class[_]) = c.getName.replace('$', '.') 22 | 23 | def getLogger(c: Class[_]) = Logger(LoggerFactory.getLogger(logName(c))) 24 | } 25 | -------------------------------------------------------------------------------- /gnaf-db-service/src/main/resources/application.conf: -------------------------------------------------------------------------------- 1 | gnafDb = { 2 | connectionPool = HikariCP // this is the default 3 | 4 | slickDriver = slick.driver.H2Driver 5 | url = "jdbc:h2:file:~/gnaf" 6 | driver = org.h2.Driver 7 | 8 | // slickDriver = slick.driver.PostgresDriver 9 | // url = "jdbc:postgresql://localhost/gnaf" 10 | // driver = org.postgresql.Driver 11 | 12 | readOnly = true 13 | user = "READONLY" 14 | password = "READONLY" 15 | numThreads = 4 16 | queueSize = 100 17 | maxConnections = 10 18 | minConnections = 1 19 | connectionTimeout = 10000 20 | initializationFailFast = false 21 | 22 | slickDriver = ${?GNAF_SLICK_DRIVER} // optional override by environment variable 23 | url = ${?GNAF_JDBC_URL} 24 | driver = ${?GNAF_JDBC_DRIVER} 25 | user = ${?GNAF_JDBC_USER} 26 | password = ${?GNAF_JDBC_PASSWORD} 27 | } 28 | 29 | akka { 30 | loglevel = INFO 31 | } 32 | 33 | http { 34 | interface = "0.0.0.0" 35 | port = 9000 36 | 37 | interface = ${?GNAF_DB_SERVICE_INTERFACE} 38 | port = ${?GNAF_DB_SERVICE_PORT} 39 | } 40 | -------------------------------------------------------------------------------- /gnaf-extractor/src/main/resources/application.conf: -------------------------------------------------------------------------------- 1 | gnafDb = { 2 | connectionPool = HikariCP // this is the default 3 | 4 | slickDriver = slick.driver.H2Driver 5 | url = "jdbc:h2:file:~/gnaf;max_memory_rows=600000" // requires db admin rights 6 | driver = org.h2.Driver 7 | 8 | // slickDriver = slick.driver.PostgresDriver 9 | // url = "jdbc:postgresql://localhost/gnaf" 10 | // driver = org.postgresql.Driver 11 | 12 | readOnly = true 13 | user = "gnaf" // "READONLY" 14 | password = "gnaf" // "READONLY" 15 | numThreads = 4 16 | queueSize = 600000 17 | maxConnections = 20 18 | minConnections = 4 19 | connectionTimeout = 120000 20 | initializationFailFast = false 21 | 22 | slickDriver = ${?GNAF_SLICK_DRIVER} // optional override by environment variable 23 | url = ${?GNAF_JDBC_URL} 24 | driver = ${?GNAF_JDBC_DRIVER} 25 | user = ${?GNAF_JDBC_USER} 26 | password = ${?GNAF_JDBC_PASSWORD} 27 | 28 | localityTimeout = 60 // timeout in minutes for all queries for a locality 29 | allTimeout = 1000 // timeout in minutes for all queries 30 | } 31 | -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | # Build 2 | FROM ubuntu:16.04 as builder 3 | 4 | WORKDIR / 5 | 6 | RUN apt-get update 7 | 8 | RUN apt-get -y install apt-transport-https 9 | 10 | RUN echo "deb https://dl.bintray.com/sbt/debian /" | tee -a /etc/apt/sources.list.d/sbt.list 11 | RUN echo "deb http://apt.postgresql.org/pub/repos/apt/ xenial-pgdg main" 12 | RUN apt-key adv --keyserver hkp://p80.pool.sks-keyservers.net:80 --recv-keys B97B0AFCAA1A47F044F244A07FCC7D46ACCC4CF8 13 | RUN apt-key adv --keyserver hkp://keyserver.ubuntu.com:80 --recv 2EE0EA64E40A89B84B2DF73499E82A75642AC823 14 | 15 | RUN apt-get update 16 | 17 | RUN apt-get -y install openjdk-8-jre sbt jq postgresql-client-9.5 wget curl zip time 18 | 19 | ADD . 
/ 20 | 21 | RUN /bin/bash src/main/script/run.sh 22 | 23 | # Run 24 | FROM openjdk:8-jre 25 | 26 | WORKDIR / 27 | 28 | COPY --from=builder /indexDir /indexDir 29 | COPY --from=builder /gnaf-search/target/scala-2.11/gnaf-search_2.11-1.1-SNAPSHOT-one-jar.jar /gnaf-search/target/scala-2.11/gnaf-search_2.11-1.1-SNAPSHOT-one-jar.jar 30 | 31 | EXPOSE 9040 32 | 33 | CMD ["java", "-jar", "/gnaf-search/target/scala-2.11/gnaf-search_2.11-1.1-SNAPSHOT-one-jar.jar"] 34 | -------------------------------------------------------------------------------- /gnaf-contrib/src/main/resources/application.conf: -------------------------------------------------------------------------------- 1 | gnafContribDb = { 2 | connectionPool = HikariCP // this is the default 3 | 4 | slickDriver = slick.driver.H2Driver 5 | url = "jdbc:h2:file:~/gnafContrib" 6 | driver = org.h2.Driver 7 | 8 | // slickDriver = slick.driver.PostgresDriver 9 | // url = "jdbc:postgresql://localhost/gnafContrib" 10 | // driver = org.postgresql.Driver 11 | 12 | user = "gnaf" 13 | password = "gnaf" 14 | numThreads = 4 15 | maxConnections = 10 // 1 for each of above + 1 for each concurrently run Future + 2 spare 16 | minConnections = 1 17 | connectionTimeout = 10000 18 | initializationFailFast = false 19 | 20 | slickDriver = ${?GNAF_CONTRIB_SLICK_DRIVER} // optional override by environment variable 21 | url = ${?GNAF_CONTRIB_JDBC_URL} 22 | driver = ${?GNAF_CONTRIB_JDBC_DRIVER} 23 | user = ${?GNAF_CONTRIB_JDBC_USER} 24 | password = ${?GNAF_CONTRIB_JDBC_PASSWORD} 25 | } 26 | 27 | akka { 28 | loglevel = INFO 29 | } 30 | 31 | http { 32 | interface = "0.0.0.0" 33 | port = 9010 34 | 35 | interface = ${?GNAF_CONTRIB_SERVICE_INTERFACE} 36 | port = ${?GNAF_CONTRIB_SERVICE_PORT} 37 | } 38 | 39 | -------------------------------------------------------------------------------- /gnaf-test/src/main/resources/application.conf: -------------------------------------------------------------------------------- 1 | // not sure if we should move this to gnaf-common 2 | gnafDb = { 3 | connectionPool = HikariCP // this is the default 4 | 5 | slickDriver = slick.driver.H2Driver 6 | url = "jdbc:h2:file:~/gnaf" // ;max_memory_rows=100000 // requires db admin rights 7 | driver = org.h2.Driver 8 | 9 | // slickDriver = slick.driver.PostgresDriver 10 | // url = "jdbc:postgresql://localhost/gnaf" 11 | // driver = org.postgresql.Driver 12 | 13 | readOnly = true 14 | user = "READONLY" 15 | password = "READONLY" 16 | numThreads = 4 17 | queueSize = 987654 // failed with ~1000, so Future callbacks must go on this queue too 18 | maxConnections = 10 // 1 for each of above + 1 for each concurrently run Future + 2 spare 19 | minConnections = 1 20 | connectionTimeout = 10000 21 | initializationFailFast = false 22 | 23 | slickDriver = ${?GNAF_SLICK_DRIVER} // optional override by environment variable 24 | url = ${?GNAF_JDBC_URL} 25 | driver = ${?GNAF_JDBC_DRIVER} 26 | user = ${?GNAF_JDBC_USER} 27 | password = ${?GNAF_JDBC_PASSWORD} 28 | } 29 | 30 | gnafTest = { 31 | sampleSize = 100 32 | sampleSize = ${?GNAF_TEST_SAMPLE_SIZE} 33 | } -------------------------------------------------------------------------------- /gnaf-test/src/main/script/diff.js: -------------------------------------------------------------------------------- 1 | var fs = require('fs'); 2 | 3 | /** 4 | * Usage: node src/main/node/diff.js otherDir files ... 5 | */ 6 | 7 | // 0 -> node; 1 -> src/main/script/diff.js; 2 -> otherDir; 3 -> files ... 
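// e.g. (hypothetical file names): node src/main/script/diff.js otherRun stats0.json stats1.json
// adds up the histograms in ./statsN.json, subtracts those in otherRun/statsN.json,
// and prints the resulting difference histogram with its total sample count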
8 | var otherDir = process.argv[2]; 9 | 10 | var m = new Map(); 11 | for (i = 3; i < process.argv.length; ++i) { 12 | var stats = JSON.parse(fs.readFileSync(process.argv[i], "utf8")); 13 | for (desc in stats.histogram) { 14 | var o = stats.histogram[desc]; 15 | for (p in o) histAdd(m, p, o[p]); 16 | } 17 | stats = JSON.parse(fs.readFileSync(otherDir + '/' + process.argv[i], "utf8")); 18 | for (desc in stats.histogram) { 19 | var o = stats.histogram[desc]; 20 | for (p in o) histAdd(m, p, -o[p]); 21 | } 22 | } 23 | 24 | var sum = 0; 25 | for (i of m.values()) sum += i; 26 | console.log(JSON.stringify({ samples: sum, histogram: mapToArr(m) })); 27 | 28 | /** Add v occurrences of k to a histogram map. 29 | * 30 | * @param m histogram map: k -> occurrence count of k 31 | * @param k key 32 | * @param v new occurrences of k to add 33 | */ 34 | function histAdd(m, k, v) { 35 | var n = m.get(k); 36 | m.set(k, n ? n + v : v); 37 | } 38 | 39 | function mapToArr(m) { 40 | var a = []; 41 | for (e of m) a.push(e); 42 | return a; 43 | } 44 | -------------------------------------------------------------------------------- /gnaf-search/build.sbt: -------------------------------------------------------------------------------- 1 | name := "gnaf-search" 2 | 3 | // resourceGenerators in Compile += Def.task { 4 | // val file = (resourceManaged in Compile).value / "demo" / "myapp.properties" 5 | // val contents = "name=%s\nversion=%s".format(name.value,version.value) 6 | // IO.write(file, contents) 7 | // Seq(file) 8 | // }.taskValue 9 | 10 | // mappings in (Compile, packageBin) += { 11 | // (resourceManaged in Compile).value / "demo" / "myapp.properties" -> "demo/myapp.properties" 12 | // } 13 | 14 | mappings in (Compile, packageBin) += { 15 | new File("gnaf-db/target/generated/version.json") -> "version.json" 16 | } 17 | 18 | libraryDependencies ++= Seq( 19 | "com.github.scopt" %% "scopt" % "3.3.0", 20 | "com.jsuereth" %% "scala-arm" % "2.0.0-M1", 21 | "ch.megard" %% "akka-http-cors" % "0.1.2", 22 | "com.github.swagger-akka-http" %% "swagger-akka-http" % "0.7.0", // adding swagger brings in all the horrible old javax.ws & Jackson dependencies! 23 | "io.swagger" % "swagger-annotations" % "1.5.9", 24 | "org.scalatest" %% "scalatest" % "3.0.0" % "test" 25 | ) 26 | 27 | libraryDependencies ++= Seq( 28 | "akka-actor", 29 | "akka-stream", 30 | "akka-http-experimental", 31 | "akka-http-spray-json-experimental", 32 | "akka-http-testkit" 33 | ) map ("com.typesafe.akka" %% _ % "2.4.3") 34 | 35 | -------------------------------------------------------------------------------- /gnaf-extractor/src/main/script/loadElasticsearch.sh: -------------------------------------------------------------------------------- 1 | #! /bin/bash 2 | 3 | set -ex 4 | GNAF=$PWD 5 | 6 | DIR=tmp 7 | # DIR=/srv/gnaf/data # for http://gnaf.it.csiro.au/ (no space in user home dir) 8 | 9 | if false 10 | then 11 | 12 | # run Scala program, takes about 25min with a SSD 13 | rm -f gnaf-indexer.log 14 | mkdir -p $DIR 15 | time java -Xmx3G -jar target/scala-2.11/gnaf-extractor_2.11-0.1-SNAPSHOT-one-jar.jar | gzip > $DIR/out.gz 16 | mv gnaf-indexer.log $DIR 17 | 18 | fi 19 | 20 | ( 21 | cd $DIR 22 | 23 | # transform output of Scala program to suit Elasticsearch 'bulk' API, takes about 15min with a SSD (was 32min using jq) 24 | time zcat out.gz | node $GNAF/src/main/script/loadElasticsearch.js > bulk 25 | 26 | # split 'bulk' file into chunks not too big for a POST request 27 | rm -f chunk-??? 
28 | split -l10000 -a3 bulk chunk- 29 | ) 30 | 31 | # backup old index? (for cluster.name: neilsElasSrch set in elasticsearch.yml) 32 | # tar cvfz index1.tar.gz -C /var/lib/elasticsearch/neilsElasSrch/ nodes 33 | 34 | # delete any old index 35 | curl -XDELETE 'localhost:9200/gnaf/' 36 | 37 | # create new index with custom field mappings 38 | curl -XPUT 'localhost:9200/gnaf/' --data-binary @src/main/script/gnafMapping.json 39 | 40 | # load the chunks using the Elasticsearch 'bulk' API, takes about 37min with a SSD 41 | time for i in $DIR/chunk-??? 42 | do 43 | echo $i 44 | curl -s -XPOST localhost:9200/_bulk --data-binary @$i 45 | done 46 | 47 | echo "all done" 48 | 49 | 50 | -------------------------------------------------------------------------------- /gnaf-util/3rd-party-licenses.html: -------------------------------------------------------------------------------- 1 | gnaf-util-licenses

gnaf-util-licenses

Category | License | Dependency | Notes
Apache | Apache 2 | io.spray # spray-json_2.11 # 1.3.2 |
Apache | Apache 2.0 License | com.typesafe.scala-logging # scala-logging_2.11 # 3.1.0 |
BSD | BSD 3-Clause | org.scala-lang # scala-library # 2.11.8 |
BSD | BSD 3-Clause | org.scala-lang # scala-reflect # 2.11.8 |
LGPL | EPL + GNU Lesser General Public License | ch.qos.logback # logback-classic # 1.1.3 |
LGPL | EPL + GNU Lesser General Public License | ch.qos.logback # logback-core # 1.1.3 |
MIT | MIT License | org.slf4j # slf4j-api # 1.7.12 |
-------------------------------------------------------------------------------- /gnaf-test/src/main/script/summary.js: -------------------------------------------------------------------------------- 1 | var fs = require('fs'); 2 | 3 | /** 4 | * Usage: node src/main/node/summary.js files ... 5 | * 6 | * The test results (our input) are keyed by a description of the test data. 7 | * By default we sum the data for all descriptions except 'nofuzTypo', which is excluded because data 8 | * potentially containing typos should be searched with 'fuz'. 9 | * If a "-{desc}" precedes files then we sum only the descriptions matching this {desc}. 10 | */ 11 | var argIdx = 2; // 0 -> node; 1 -> src/main/script/summary.js; 2 -> [-desc] files ... 12 | var descMatch = process.argv[argIdx].startsWith('-') ? process.argv[argIdx++].substring(1) : null; 13 | var descPred = descMatch ? desc => desc == descMatch : desc => desc != 'nofuzTypo'; 14 | 15 | var m = new Map(); 16 | for (; argIdx < process.argv.length; ++argIdx) { 17 | var stats = JSON.parse(fs.readFileSync(process.argv[argIdx], "utf8")); 18 | for (desc in stats.histogram) { 19 | if (descPred(desc)) { 20 | var o = stats.histogram[desc]; 21 | for (p in o) histAdd(m, p, o[p]); 22 | } 23 | } 24 | } 25 | 26 | var sum = 0; 27 | for (i of m.values()) sum += i; 28 | console.log(JSON.stringify({ samples: sum, histogram: mapToArr(m) })); 29 | 30 | /** Add v occurrences of k to a histogram map. 31 | * 32 | * @param m histogram map: k -> occurrence count of k 33 | * @param k key 34 | * @param v new occurrences of k to add 35 | */ 36 | function histAdd(m, k, v) { 37 | var n = m.get(k); 38 | m.set(k, n ? n + v : v); 39 | } 40 | 41 | function mapToArr(m) { 42 | var a = []; 43 | for (e of m) a.push(e); 44 | return a; 45 | } 46 | -------------------------------------------------------------------------------- /gnaf-util/src/main/scala/au/csiro/data61/gnaf/util/Timer.scala: -------------------------------------------------------------------------------- 1 | package au.csiro.data61.gnaf.util 2 | 3 | /** Accumulates time since constructed or reset. 4 | */ 5 | class Timer { 6 | private var t0 = 0L // start of currently measured time period 7 | private var elapsed = 0L // sum of previous time periods ended by stop/elapsedSecs 8 | 9 | reset 10 | 11 | def reset = { 12 | elapsed = 0L 13 | start 14 | } 15 | 16 | /** `start` and `stop` need not be used - used to discard (not accumulate) the time between `stop` and `start`. */ 17 | def start = t0 = System.currentTimeMillis 18 | 19 | def stop = { 20 | val t = System.currentTimeMillis 21 | elapsed += (t - t0) 22 | t0 = t // so subsequent `start` isn't required 23 | } 24 | 25 | /** Get accumulated seconds. 26 | * 27 | * Also does `stop`, so time between `elapsedSecs` and a subsequent `start` would not be accumulated. 28 | */ 29 | def elapsedSecs: Float = { 30 | stop 31 | elapsed * 1e-3f 32 | } 33 | 34 | } 35 | 36 | object Timer { 37 | 38 | private lazy val log = Util.getLogger(getClass) 39 | 40 | def apply() = new Timer() 41 | 42 | /** Log elapsed time as info. 43 | * 44 | * Usage: 45 | * {{{ 46 | * val a: A = timed("it took {} secs") { 47 | * ... 
48 |    *   new A()
49 |    * }
50 |    * }}}
51 |    *
52 |    * @param msg contains "{}" which is replaced by the elapsed time in secs
53 |    * @param action thunk to execute and time
54 |    */
55 |   def timed[T](msg: String)(action: => T) = {
56 |     val t = Timer()
57 |     val x = action
58 |     log.info(msg, t.elapsedSecs.toString)
59 |     x
60 |   }
61 | }
--------------------------------------------------------------------------------
/gnaf-contrib/README.md:
--------------------------------------------------------------------------------
1 | # gnaf-contrib
2 | 
3 | ## Introduction
4 | This project provides a [Scala](http://scala-lang.org/) [RESTful](https://en.wikipedia.org/wiki/Representational_state_transfer) JSON
5 | web service providing access to the
6 | gnafContrib database of user-supplied geocodes.
7 | 
8 | This is a stand-alone webapp and does not run in a servlet container.
9 | On startup the database schema is created if it doesn't already exist.
10 | 
11 | ## Configuration
12 | 
13 | Configuration is in [application.conf](src/main/resources/application.conf) and most settings can be overridden with environment variables.
14 | 
15 | ## Running and Usage
16 | 
17 | See `gnaf/src/main/script/run.sh`.
18 | 
19 | 
20 | ### Generate Slick bindings
21 | 
22 | The Slick bindings can be written by hand, but it's quicker to generate them from a manually created database table:
23 | 
24 | Create and connect to a new database with the DB URL `jdbc:h2:file:~/gnafContrib`, username `gnaf` and password `gnaf`.
25 | Create a table from which the bindings will be generated:
26 | 
27 |     CREATE TABLE ADDRESS_SITE_GEOCODE (
28 |       id long IDENTITY,                       -- auto-inc primary key
29 |       contrib_status varchar(15) NOT NULL,    -- ‘SUBMITTED’|‘PUBLISHED’
30 |       address_site_geocode_pid varchar(15),   -- set to correct a gnaf geocode, null to add a new one
31 |       date_created date NOT NULL,
32 |       version int NOT NULL,                   -- optimistic locking row version
33 |       address_site_pid varchar(15) NOT NULL,
34 |       geocode_type_code varchar(4) NOT NULL,
35 |       longitude numeric(11,8) NOT NULL,
36 |       latitude numeric(10,8) NOT NULL
37 |     );
38 | 
39 | Disconnect the SQL client from the database then, from the top-level gnaf directory:
40 | 
41 |     sbt
42 |     > project gnafContrib
43 |     > console
44 |     slick.codegen.SourceCodeGenerator.main(
45 |       Array("slick.driver.H2Driver", "org.h2.Driver", "jdbc:h2:file:~/gnafContrib", "generated", "au.csiro.data61.gnaf.contrib.db", "gnaf", "gnaf")
46 |     )
47 | 
48 | This generates code in: `generated/au/csiro/data61/gnaf/contrib/db/Tables.scala`.
49 | The source file `src/main/scala/au/csiro/data61/gnaf/contrib/db/ContribTables.scala` is a very minor modification of this generated code.
50 | 
--------------------------------------------------------------------------------
/gnaf-extractor/src/main/script/loadElasticsearch.js:
--------------------------------------------------------------------------------
1 | var readline = require('readline');
2 | 
3 | var d61Num = a => [ a.prefix, a.number.toString(), a.suffix ].filter(a => a != null && a != "D61_NULL" && a != "-1").join("");
4 | 
5 | var d61NumLast = a => a.number == -1 ? "" : '-' + d61Num(a);
6 | 
7 | var d61StreetNum = a => a.numberFirst.number == -1 ? "" : d61Num(a.numberFirst) + d61NumLast(a.numberLast);
8 | 
9 | /** Each inner array gets indexed as a separate Lucene "value" in the "d61Address" field. 
10 | * Although Lucene just concatenates all the values into the field there is a big position increment between the values 11 | * ("position_increment_gap": 100 set in gnafMapping.json) that stops phrase searches and shingles (n-grams) matching across values. 12 | */ 13 | var d61Address = a => 14 | [ 15 | [ a.addressSiteName, a.buildingName ], 16 | [ a.flatTypeName, d61Num(a.flat) ], 17 | [ a.levelTypeName, d61Num(a.level) ], 18 | [ d61StreetNum(a), a.street.name, a.street.typeCode, a.street.suffixName ], 19 | [ a.localityName, a.stateAbbreviation, a.postcode ] 20 | ].concat( 21 | a.streetVariant.map( x => [ d61StreetNum(a), x.name, x.typeCode, x.suffixName ]), 22 | a.localityVariant.map( x => [ x.localityName, a.stateAbbreviation, a.postcode ]) 23 | ).map(x => x.filter(x => x != "" && x != null && x != "D61_NULL").join(" ")).filter(x => x != ""); 24 | 25 | var d61AddressNoAlias = a => 26 | [ 27 | a.addressSiteName, a.buildingName, 28 | a.flatTypeName, d61Num(a.flat), 29 | a.levelTypeName, d61Num(a.level), 30 | d61StreetNum(a), a.street.name, a.street.typeCode, a.street.suffixName, 31 | a.localityName, a.stateAbbreviation, a.postcode 32 | ].filter(x => x != "" && x != null && x != "D61_NULL").join(" "); 33 | 34 | var rl = readline.createInterface({ 35 | input: process.stdin, 36 | output: process.stdout, 37 | terminal: false 38 | }); 39 | 40 | rl.on('line', function (l) { 41 | var a = JSON.parse(l); 42 | a["d61Address"] = d61Address(a); 43 | a["d61AddressNoAlias"] = d61AddressNoAlias(a); 44 | console.log( 45 | JSON.stringify({ index: { _index: "gnaf", _type: "gnaf", _id: a.addressDetailPid } }) // one line of elasticsearch indexing metadata 46 | + '\n' + JSON.stringify(a) // next line is document to index 47 | ); 48 | }); -------------------------------------------------------------------------------- /gnaf-ui/html/index.css: -------------------------------------------------------------------------------- 1 | body { 2 | font-family: verdana, helvetica, arial, sans-serif; 3 | font-size: 14px; 4 | color: #404040; 5 | } 6 | 7 | h1, h2, h3, h4, .ui-tabs-nav .ui-state-active a { 8 | color: #98bf21; 9 | } 10 | 11 | /* .example { */ 12 | /* padding-left: 10px; */ 13 | /* font-size: small; */ 14 | /* } */ 15 | 16 | .label, label { 17 | color: #98bf21; 18 | font-weight: bold; 19 | margin-right: 5px; 20 | vertical-align: top; 21 | } 22 | 23 | .example { 24 | padding-left: 10px; 25 | font-size: small; 26 | vertical-align: top; 27 | } 28 | 29 | .multi-line { 30 | display: inline-block; 31 | } 32 | 33 | #bulkLookup .header a { 34 | margin-left: 10px; 35 | } 36 | 37 | #bulkAddresses { 38 | height: 250px; 39 | width: 500px; 40 | } 41 | 42 | 43 | /* .error { */ 44 | /* color: red; */ 45 | /* margin-top: 15px; */ 46 | /* font-family: monospace; */ 47 | /* white-space: pre; */ 48 | /* } */ 49 | 50 | button { 51 | color: white; 52 | background-color: #98bf21; 53 | border: 1px outset #98bf21; 54 | margin-top: 10px; 55 | font-weight: bold; 56 | } 57 | 58 | table { 59 | border: 1px solid #98bf21; 60 | border-collapse: collapse; 61 | margin-top: 5px; 62 | } 63 | 64 | tr:nth-child(odd) { 65 | background: #EAF2D3; 66 | } 67 | 68 | th { 69 | text-align: center; 70 | padding: 3px 3px 3px 3px; 71 | background-color: #A7C942; 72 | color: white; 73 | } 74 | 75 | td { 76 | vertical-align: top; 77 | border: 1px solid #98bf21; 78 | padding: 2px 5px 2px 5px; 79 | } 80 | 81 | div.location { 82 | margin-bottom: 10px; 83 | } 84 | 85 | div.location a, #addressNearMe .refresh, .contribStatus .delete { 86 | margin-left: 5px; 87 
| } 88 | 89 | #addressInput { 90 | width: 500px; 91 | } 92 | 93 | #addressLookupResult, #addressNearMeResult, #bulkResult, .addrType, .gnafGeocodes, .contribGeocodes { 94 | margin-top: 15px; 95 | } 96 | 97 | .contribStatus a { 98 | 99 | } 100 | .contribGeocodes .add input { 101 | width: 150px; 102 | } 103 | 104 | .contribGeocodes .geocodeTypeCode select { 105 | width: 500px; 106 | } 107 | 108 | .showGeoDetail { 109 | margin-left: 10px; 110 | } 111 | 112 | #streetFilter, #address { 113 | width: 300px; 114 | } 115 | 116 | -------------------------------------------------------------------------------- /gnaf-test/src/main/script/run.sh: -------------------------------------------------------------------------------- 1 | #! /bin/bash 2 | 3 | # Note: to count total searches performed: 4 | # for i in address*.json; do jq '.|length' $i; done | awk '{ sum += $1 } END { print sum*6 }' 5 | 6 | # Note: searchLucene.js performs 6 searches for each address: 7 | # { fuz, noFuz } * { query, queryTypo, queryPostcodeBeforeState } 8 | 9 | version=`sed 's/.*"\(.*\)"/\1/' ../version.sbt` 10 | scalaVersion=2.11 11 | 12 | search="src/main/script/searchLucene.js" # "src/main/script/searchEs.js" 13 | url="http://localhost:9040/bulkSearch" # "http://localhost:9200/gnaf/_msearch" 14 | skip="false" 15 | sampleSize=200 16 | addrDir=. 17 | statsDir=. 18 | 19 | while getopts "u:n:sh" opt 20 | do 21 | case $opt in 22 | a) addrDir=$OPTARG ;; 23 | o) statsDir=$OPTARG ;; 24 | u) url=$OPTARG ;; 25 | n) sampleSize=$OPTARG ;; 26 | s) skip="true" ;; 27 | h|"?") cat < $afile 53 | wait # for previous node process 54 | [[ -n "$url" ]] && node $search $url $afile > $sfile & 55 | else 56 | # re-run with same test data as before 57 | node $search $url $afile > $sfile 58 | fi 59 | done 60 | wait # for last node process 61 | 62 | node src/main/script/summary.js $statsDir/stats*.json 63 | -------------------------------------------------------------------------------- /gnaf-indexer/src/main/scala/au/csiro/data61/gnaf/indexer/Indexer.scala: -------------------------------------------------------------------------------- 1 | package au.csiro.data61.gnaf.indexer 2 | 3 | import java.io.File 4 | 5 | import scala.io.Source 6 | 7 | import org.apache.lucene.document.{ Document, DoublePoint, Field } 8 | 9 | import au.csiro.data61.gnaf.lucene.GnafLucene._ 10 | import au.csiro.data61.gnaf.lucene.LuceneUtil.directory 11 | import au.csiro.data61.gnaf.util.Gnaf.Address 12 | import au.csiro.data61.gnaf.util.Gnaf.JsonProtocol.addressFormat 13 | import au.csiro.data61.gnaf.util.Util.getLogger 14 | import resource.managed 15 | import spray.json.pimpString 16 | 17 | 18 | object Indexer { 19 | val log = getLogger(getClass) 20 | 21 | case class CliOption(indexDir: File) 22 | val defaultCliOption = CliOption(new File("./indexDir")) 23 | 24 | def main(args: Array[String]) = { 25 | val parser = new scopt.OptionParser[CliOption]("gnaf-indexer") { 26 | head("gnaf-lucene-indexer", "0.x") 27 | note("Load GNAF JSON into a Lucene index") 28 | opt[File]('i', "indexDir") action { (x, c) => 29 | c.copy(indexDir = x) 30 | } text (s"Lucene index directory, default ${defaultCliOption.indexDir}") 31 | help("help") text ("prints this usage text") 32 | } 33 | parser.parse(args, defaultCliOption) foreach run 34 | log.info("done") 35 | } 36 | 37 | def addrToDoc(line: String) = { 38 | val addr = line.parseJson.convertTo[Address] 39 | val (d61Address, noneCount, d61AddressNoAlias) = addr.toD61Address 40 | val doc = new Document 41 | doc.add(new Field(F_JSON, line, 
storedNotIndexedFieldType)) 42 | for (l <- addr.location) doc.add(new DoublePoint(F_LOCATION, l.lat.toDouble, l.lon.toDouble)) 43 | for (a <- d61Address) doc.add(new Field(F_ADDRESS, a, addressFieldType)) 44 | for { 45 | f <- addr.flat.toOptStr if addr.level.toOptStr.isEmpty 46 | n <- addr.numberFirst.toOptStr 47 | } doc.add(new Field(F_ADDRESS, f + BIGRAM_SEPARATOR + n, addressFieldType)) // explicitly add flat + street num bigram without extra unigrams 48 | for (i <- 0 until noneCount) doc.add(new Field(F_MISSING_DATA, MISSING_DATA_TOKEN, missingDataFieldType)) 49 | doc.add(new Field(F_ADDRESS_NOALIAS, d61AddressNoAlias, storedNotIndexedFieldType)) 50 | 51 | doc 52 | } 53 | 54 | def run(c: CliOption) = { 55 | for { 56 | indexer <- managed(mkIndexer(directory(c.indexDir))) 57 | line <- Source.fromInputStream(System.in, "UTF-8").getLines 58 | } { 59 | indexer.addDocument(addrToDoc(line)) 60 | } 61 | } 62 | } -------------------------------------------------------------------------------- /gnaf-indexer/3rd-party-licenses.html: -------------------------------------------------------------------------------- 1 | gnaf-indexer-licenses

gnaf-indexer-licenses

Category | License | Dependency | Notes
Apache | Apache 2 | io.spray # spray-json_2.11 # 1.3.2 |
Apache | Apache 2.0 License | com.typesafe.scala-logging # scala-logging_2.11 # 3.1.0 |
Apache | The Apache Software License, Version 2.0 | org.apache.lucene # lucene-analyzers-common # 6.2.1 |
Apache | The Apache Software License, Version 2.0 | org.apache.lucene # lucene-core # 6.2.1 |
BSD | BSD | au.csiro.data61.gnaf # gnaf-lucene_2.11 # 0.8-SNAPSHOT |
BSD | BSD | au.csiro.data61.gnaf # gnaf-util_2.11 # 0.8-SNAPSHOT |
BSD | BSD 3-Clause | org.scala-lang # scala-library # 2.11.8 |
BSD | BSD 3-Clause | org.scala-lang # scala-reflect # 2.11.8 |
BSD | BSD-Style | com.jsuereth # scala-arm_2.11 # 2.0.0-M1 |
LGPL | EPL + GNU Lesser General Public License | ch.qos.logback # logback-classic # 1.1.3 |
LGPL | EPL + GNU Lesser General Public License | ch.qos.logback # logback-core # 1.1.3 |
MIT | MIT License | com.github.scopt # scopt_2.11 # 3.3.0 |
MIT | MIT License | org.slf4j # slf4j-api # 1.7.12 |
-------------------------------------------------------------------------------- /gnaf-extractor/README.md: -------------------------------------------------------------------------------- 1 | # gnaf-extractor 2 | 3 | ## Introduction 4 | This project queries the gnaf database to produce JSON address data (for consumption by gnaf-indexer). 5 | `src/main/script` contains obsolete scripts to load the output into Elasticsearch. 6 | 7 | 8 | ## H2 Result Set Spooling 9 | If an [H2](http://www.h2database.com/) result set contains more than 10 | [MAX_MEMORY_ROWS](http://www.h2database.com/html/grammar.html?highlight=max_memory_rows&search=MAX_MEMORY_ROWS#set_max_memory_rows), 11 | it is spooled to disk before the first row is provided to the client. 12 | The default is 40000 rows per GB of available RAM and setting a non-default value requires database admin rights (which we prefer to avoid using). 13 | Analysis in comments in `Extractor.scala` shows that it needs to handle result sets up to 95,004 rows, so allocating 3GB of heap (with `java -Xmx3G`) should avoid spooling. 14 | 15 | ## Configuration 16 | 17 | Configuration is in [application.conf](src/main/resources/application.conf) and most settings can be overridden with environment variables. 18 | The database URL can also be set with a command line option (overriding the above, use `--help` for details). 19 | 20 | ## Running and Usage 21 | 22 | See `gnaf/src/main/script/run.sh`. 23 | 24 | ## To Do 25 | 26 | Add Code/Name from the _AUT tables as synonyms (e.g. so ST will match STREET) to the phrase suggester. 27 | The current indexed term is the full name (which may contain spaces), so we need to add the abbreviation (which does not contain spaces). 28 | A difference in spaces alters the number of tokens and all the following term positions resulting in problems with phrase search. 29 | See https://www.elastic.co/guide/en/elasticsearch/guide/current/multi-word-synonyms.html, which suggest using "Simple Contraction". 30 | However we're using shingles/ngrams rather than phrase search, so do we have the same problem? Yes I think so. 31 | We should do contraction to the single term abreviation. 32 | Possible negative consequences? Synonyms create the risk of spurious matches. The tables contain some unused entries (e.g. the STREET_TYPE_AUT (AWLK, AIRWALK)) and many rarely used entries; using them all as synonyms increases the risk. e.g. ATM has small edit distance from ATMA, ATKA, ATEA (street names), so contracting "AUTOMATIC TELLER MACHINE" to "ATM" could result in these street names matching AUTOMATIC TELLER MACHINEs. 33 | Perhaps we need to be quite selective in the use of synonyms. 34 | 35 | Other synonyms: "St" for "Saint", "Mt" for "Mount"? 36 | The "Example Queries" section shows that this should be handled already by the inclusion of street (locality) aliases. 37 | 38 | At some cost in terms of speed, we could prioritize primary over secondary addresses and principle over alias addresses. But maybe the default higher weight given to shorter docs is already enough? 
39 | -------------------------------------------------------------------------------- /gnaf-lucene/src/main/scala/au/csiro/data61/gnaf/lucene/LuceneUtil.scala: -------------------------------------------------------------------------------- 1 | package au.csiro.data61.gnaf.lucene 2 | 3 | import java.io.Closeable 4 | import scala.util.Try 5 | import org.apache.lucene.analysis.{ Analyzer, TokenStream } 6 | import org.apache.lucene.analysis.tokenattributes.CharTermAttribute 7 | import org.apache.lucene.document.Document 8 | import org.apache.lucene.index.DirectoryReader 9 | import org.apache.lucene.search.{ IndexSearcher, Query, ScoreDoc } 10 | import org.apache.lucene.store.Directory 11 | import au.csiro.data61.gnaf.util.Timer 12 | import au.csiro.data61.gnaf.util.Util.getLogger 13 | import java.io.File 14 | import org.apache.lucene.store.FSDirectory 15 | import org.apache.lucene.analysis.tokenattributes.CharTermAttribute 16 | 17 | 18 | /** 19 | * Non GNAF specific code for Lucene indexing and searching. 20 | * 21 | * simplified from: https://github.csiro.au/bac003/social-watch/blob/master/analytics/src/main/scala/org/t3as/socialWatch/analytics/LuceneUtil.scala 22 | */ 23 | object LuceneUtil { 24 | val log = getLogger(getClass) 25 | 26 | def tokenIter(ts: TokenStream): Iterator[String] = { 27 | ts.reset 28 | Iterator.continually { 29 | val more = ts.incrementToken 30 | if (!more) { 31 | ts.end 32 | ts.close 33 | } 34 | more 35 | }.takeWhile(identity).map(_ => ts.getAttribute(classOf[CharTermAttribute]).toString) 36 | } 37 | 38 | def tokenIter(analyzer: Analyzer, fieldName: String, text: String): Iterator[String] 39 | = tokenIter(analyzer.tokenStream(fieldName, text)) 40 | 41 | def directory(indexDir: File) = FSDirectory.open(indexDir.toPath) 42 | 43 | class Searcher[Hit, Results]( 44 | directory: Directory, 45 | toHit: (ScoreDoc, Document) => Hit, // convert score and map of fields to Hit 46 | toResults: (Int, Float, Seq[Hit], Option[String]) => Results // convert totalHits, elapsedSecs, Seq[Hit], Option[error] to Results 47 | ) extends Closeable { 48 | val log = getLogger(getClass) 49 | 50 | val searcher = open 51 | protected def open = new IndexSearcher(DirectoryReader.open(directory)) 52 | 53 | log.debug(s"Searcher: numDocs = ${searcher.getIndexReader.numDocs}") 54 | 55 | def search(q: Query, numHits: Int = 20) = { 56 | val timer = Timer() 57 | 58 | val result = for { 59 | topDocs <- Try { 60 | searcher.search(q, numHits) 61 | } 62 | hits <- Try { 63 | topDocs.scoreDocs map { scoreDoc => toHit(scoreDoc, searcher.doc(scoreDoc.doc)) } 64 | } 65 | } yield toResults(topDocs.totalHits, timer.elapsedSecs.toFloat, hits, None) 66 | 67 | result.recover { case e => toResults(0, timer.elapsedSecs.toFloat, List(), Some(e.getMessage)) }.get 68 | } 69 | 70 | def close = searcher.getIndexReader.close 71 | } 72 | 73 | } -------------------------------------------------------------------------------- /gnaf-db/3rd-party-licenses.html: -------------------------------------------------------------------------------- 1 | gnaf-db-licenses

gnaf-db-licenses

2 | 3 |
CategoryLicenseDependencyNotes
Apache Apache 2 io.spray # spray-json_2.11 # 1.3.2 
Apache Apache 2.0 License com.typesafe.scala-logging # scala-logging_2.11 # 3.1.0 
Apache Apache License, Version 2.0 com.typesafe # config # 1.2.1 
Apache The Apache Software License, Version 2.0 com.zaxxer # HikariCP-java6 # 2.3.7 
Apache The Apache Software License, Version 2.0 org.javassist # javassist # 3.19.0-GA 
BSD BSD au.csiro.data61.gnaf # gnaf-util_2.11 # 0.8-SNAPSHOT 
BSD BSD 3-Clause org.scala-lang # scala-library # 2.11.8 
BSD BSD 3-Clause org.scala-lang # scala-reflect # 2.11.8 
BSD Two-clause BSD-style license com.typesafe.slick # slick-codegen_2.11 # 3.1.1 
BSD Two-clause BSD-style license com.typesafe.slick # slick-hikaricp_2.11 # 3.1.1 
BSD Two-clause BSD-style license com.typesafe.slick # slick_2.11 # 3.1.1 
CC0 CC0 org.reactivestreams # reactive-streams # 1.0.0 
LGPL EPL + GNU Lesser General Public License ch.qos.logback # logback-classic # 1.1.3 
LGPL EPL + GNU Lesser General Public License ch.qos.logback # logback-core # 1.1.3 
MIT MIT License org.slf4j # slf4j-api # 1.7.12 
Mozilla MPL 2.0 or EPL 1.0 com.h2database # h2 # 1.4.193 
-------------------------------------------------------------------------------- /gnaf-lucene/3rd-party-licenses.html: -------------------------------------------------------------------------------- 1 | gnaf-lucene-licenses

gnaf-lucene-licenses

2 | 3 |
CategoryLicenseDependencyNotes
Apache Apache 2 io.spray # spray-json_2.11 # 1.3.2 
Apache Apache 2.0 License com.typesafe.scala-logging # scala-logging_2.11 # 3.1.0 
Apache The Apache Software License, Version 2.0 org.apache.lucene # lucene-analyzers-common # 6.2.1 
Apache The Apache Software License, Version 2.0 org.apache.lucene # lucene-core # 6.2.1 
Apache the Apache License, ASL Version 2.0 org.scalactic # scalactic_2.11 # 3.0.0 
Apache the Apache License, ASL Version 2.0 org.scalatest # scalatest_2.11 # 3.0.0 
BSD BSD au.csiro.data61.gnaf # gnaf-util_2.11 # 0.8-SNAPSHOT 
BSD BSD 3-Clause org.scala-lang # scala-library # 2.11.8 
BSD BSD 3-Clause org.scala-lang # scala-reflect # 2.11.8 
BSD BSD 3-clause org.scala-lang.modules # scala-parser-combinators_2.11 # 1.0.4 
BSD BSD 3-clause org.scala-lang.modules # scala-xml_2.11 # 1.0.5 
BSD BSD-Style com.jsuereth # scala-arm_2.11 # 2.0.0-M1 
LGPL EPL + GNU Lesser General Public License ch.qos.logback # logback-classic # 1.1.3 
LGPL EPL + GNU Lesser General Public License ch.qos.logback # logback-core # 1.1.3 
MIT MIT License com.github.scopt # scopt_2.11 # 3.3.0 
MIT MIT License org.slf4j # slf4j-api # 1.7.12 
-------------------------------------------------------------------------------- /gnaf-util/src/main/scala/au/csiro/data61/gnaf/util/Gnaf.scala: -------------------------------------------------------------------------------- 1 | package au.csiro.data61.gnaf.util 2 | 3 | import spray.json.DefaultJsonProtocol 4 | 5 | object Gnaf { 6 | 7 | def join(s: Seq[Option[String]], delim: String): Option[String] = { 8 | val r = s.flatten.filter(_.nonEmpty).mkString(delim) 9 | if (r.nonEmpty) Some(r) else None 10 | } 11 | def d61Num(n: Option[Int]) = n.map(_.toString) 12 | 13 | case class PreNumSuf(prefix: Option[String], number: Option[Int], suffix: Option[String]) { 14 | def toOptStr = join(Seq(prefix, d61Num(number), suffix), "") 15 | } 16 | 17 | case class Street(name: String, typeCode: Option[String], typeName: Option[String], suffixCode: Option[String], suffixName: Option[String]) 18 | case class LocalityVariant(localityName: String) 19 | case class Location(lat: BigDecimal, lon: BigDecimal) 20 | case class Address(addressDetailPid: String, addressSiteName: Option[String], buildingName: Option[String], 21 | flatTypeCode: Option[String], flatTypeName: Option[String], flat: PreNumSuf, 22 | levelTypeCode: Option[String], levelTypeName: Option[String], level: PreNumSuf, 23 | numberFirst: PreNumSuf, numberLast: PreNumSuf, 24 | street: Option[Street], localityName: String, stateAbbreviation: String, stateName: String, postcode: Option[String], 25 | aliasPrincipal: Option[Char], primarySecondary: Option[Char], 26 | location: Option[Location], streetVariant: Seq[Street], localityVariant: Seq[LocalityVariant]) { 27 | 28 | def toD61Address = { 29 | val streetNum = numberFirst.toOptStr.map(f => f + numberLast.toOptStr.map("-" + _).getOrElse("")) 30 | val seqNoAlias = Seq( 31 | Seq( addressSiteName, buildingName ), // each inner Seq optionally produces a String in the final Seq 32 | Seq( flatTypeName, flat.toOptStr ), 33 | Seq( levelTypeName, level.toOptStr ), 34 | Seq( streetNum, street.map(_.name), street.flatMap(_.typeCode), street.flatMap(_.suffixName) ), 35 | Seq( Some(localityName), Some(stateAbbreviation), postcode ) 36 | ) 37 | val seqWithAlias = seqNoAlias ++ 38 | streetVariant.map(v => Seq( streetNum, Some(v.name), v.typeCode, v.suffixName )) ++ 39 | localityVariant.map(v => Seq( Some(v.localityName), Some(stateAbbreviation), postcode )) 40 | val d61Address = seqWithAlias.map(inner => join(inner, " ")).flatten 41 | val seqNoAlias2 = seqNoAlias.map(inner => join(inner, " ")) 42 | val noneCount = (streetNum +: seqNoAlias2).count(_.isEmpty) // count each empty streetNum and inner seq: site/building, flat, level 43 | val d61AddressNoAlias = join(seqNoAlias.map(inner => join(inner, " ")), " ").getOrElse("") 44 | (d61Address, noneCount, d61AddressNoAlias) 45 | } 46 | } 47 | 48 | object JsonProtocol extends DefaultJsonProtocol { 49 | implicit val preNumSufFormat = jsonFormat3(PreNumSuf) 50 | implicit val streetFormat = jsonFormat5(Street) 51 | implicit val locVarFormat = jsonFormat1(LocalityVariant) 52 | implicit val locationFormat = jsonFormat2(Location) 53 | implicit val addressFormat = jsonFormat21(Address) 54 | } 55 | 56 | } -------------------------------------------------------------------------------- /gnaf-test/src/main/script/Maps.js: -------------------------------------------------------------------------------- 1 | 'use strict'; 2 | 3 | var identity = x => x; 4 | 5 | function mapToObj(m, f = identity) { 6 | var a = {}; 7 | for (var [k, v] of m) a[k.toString()] = f(v); 8 | return a; 9 | } 10 | 
11 | var ctorMap = () => new Map(); 12 | var ctorArr = () => []; 13 | 14 | 15 | 16 | /** Map where values are a container e.g. an array or another map */ 17 | class MapCont { 18 | 19 | /** 20 | * @param ctor constructor for a new container value in the map 21 | */ 22 | constructor(ctor) { 23 | this.ctor = ctor; 24 | this.m = ctorMap(); 25 | } 26 | 27 | /** 28 | * Get container for k, constructing and adding it if it doesn't exist. 29 | * @param k 30 | */ 31 | get(k) { 32 | var a = this.m.get(k); 33 | if (!a) { 34 | a = this.ctor(); 35 | this.m.set(k, a); 36 | } 37 | return a; 38 | } 39 | 40 | /** convert to object (e.g. for JSON serialization) */ 41 | object(f = identity) { return mapToObj(this.m, f); } 42 | } 43 | 44 | 45 | 46 | /** Map of maps: k1 -> k2 -> v */ 47 | class MapMap extends MapCont { 48 | 49 | constructor(ctor = ctorMap) { 50 | super(ctor); 51 | } 52 | 53 | get2(k1, k2) { 54 | return this.get(k1).get(k2); 55 | } 56 | 57 | set2(k1, k2, v) { 58 | this.get(k1).set(k2, v); 59 | } 60 | 61 | object(f = identity) { return super.object(v => mapToObj(v, f)); } 62 | } 63 | 64 | 65 | 66 | /** Map of histograms, where the histograms are maps: k2 -> count */ 67 | class MapHist extends MapMap { 68 | inc(k1, k2) { 69 | var m2 = this.get(k1); 70 | var n = m2.get(k2); 71 | m2.set(k2, n ? n + 1 : 1); 72 | } 73 | } 74 | 75 | 76 | 77 | /** Map of arrays */ 78 | class MapArr extends MapCont { 79 | 80 | constructor() { 81 | super(ctorArr); 82 | } 83 | 84 | append(k1, v) { 85 | this.get(k1).push(v); 86 | } 87 | } 88 | 89 | var ctorMapArr = () => new MapArr(); 90 | 91 | 92 | 93 | class MapMapCont extends MapMap { 94 | 95 | constructor(ctor) { // must provide object() 96 | super(ctor); 97 | } 98 | 99 | object(f = identity) { return mapToObj(this.m, v => v.object(f)); } 100 | } 101 | 102 | 103 | 104 | module.exports = { 105 | identity: identity, 106 | mapToObj: mapToObj, 107 | ctorMapArr: ctorMapArr, 108 | MapCont: MapCont, 109 | MapMap: MapMap, 110 | MapHist: MapHist, 111 | MapArr: MapArr, 112 | MapMapCont: MapMapCont 113 | } 114 | 115 | // examples 116 | //var m = new MapMap(); 117 | //m.set2("sally", 1, "fred") 118 | //m.set2("sally", 1, "sally") 119 | //m.set2("sally", 2, "fred") 120 | //console.log('MapMap.object:', m.object()); 121 | // 122 | //var h = new MapHist(); 123 | //h.inc("fred", 1); 124 | //h.inc("sally", 2); 125 | //h.inc("sally", 2); 126 | //console.log('MapHist.object:', h.object()); 127 | // 128 | //var a = new MapArr(); 129 | //a.append("sally", "fred"); 130 | //a.append("sally", "sue"); 131 | //console.log('MapArr.object:', a.object()); 132 | // 133 | //var c = new MapMapCont(ctorMapArr); 134 | //c.get("sally").append("george", "fred"); 135 | //c.get("sally").append("george", "sue"); 136 | //console.log('MapMapCont.object:', c.object()); 137 | 138 | 139 | -------------------------------------------------------------------------------- /gnaf-test/README.md: -------------------------------------------------------------------------------- 1 | # gnaf-test 2 | 3 | ## Introduction 4 | 5 | This sub-project provides end to end evaluation by comparing address lookups with known correct results from the database. 6 | This approach is motivated by the fact that search tuning must be evaluated across a wide range of test data. 7 | 8 | ## Project Structure 9 | 10 | 1. 
The [Scala](http://scala-lang.org/) command line program `gnaf-test` extracts from the database random selections of addresses with user-selected characteristics, 11 | such as using street or locality aliases, street number prefixes, suffixes or ranges, unit or level numbers, 12 | out-of-order elements (postcode before state) or intentional errors (to test fuzzy matching). 13 | It outputs JSON containing the search input as an address string and the correct result as a G-NAF address without aliases or errors. 14 | The `addressDetailPid` is not useful as the correct result because G-NAF contains addresses that are not unique (at least over the fields used here). 15 | 2. A [node.js](https://nodejs.org/en/) program [src/main/script/searchLucene.js](src/main/script/searchLucene.js) takes the above JSON, performs bulk lookups using the `gnaf-search` web service, 16 | computes the histogram of how often the correct result is the top hit (index 0), 17 | next hit (index 1) etc., or not in the top N hits (index -1). 18 | Where it's not the top hit, the problematic input address is output for further investigation. 19 | The histogram and problematic input addresses are output as JSON. 20 | 3. A [node.js](https://nodejs.org/en/) program [src/main/script/summary.js](src/main/script/summary.js) aggregates the above output into a single histogram. 21 | 4. A [bash](https://www.gnu.org/software/bash/) script [src/main/script/run.sh](src/main/script/run.sh) runs all of the above. 22 | 23 | ## Configuration 24 | 25 | Configuration is in [application.conf](src/main/resources/application.conf) and most settings can be overridden with environment variables. 26 | 27 | ## Dependencies 28 | 29 | - install [node.js](https://nodejs.org/en/) and `npm`. 30 | The Ubuntu packaged versions are too old, but up-to-date packages are available [here](https://github.com/nodesource/distributions). 31 | - run `npm install` to install node package dependencies 32 | 33 | ## Results 34 | 35 | Overall results: 36 | 37 | node src/main/script/summary.js stats*.json 38 | {"samples":4780,"histogram":[["0",4764],["1",7],["8",1],["-1",8]]} 39 | 40 | A potential error reported for test addresses using street and locality aliases is (using [jq](https://stedolan.github.io/jq/) to filter and format): 41 | 42 | jq .errors stats-localityAlias-streetAlias.json 43 | "nofuz": { 44 | "-1": [ 45 | "MAIDENWELL-BUNYA MOUNTAINS ROAD PIMPIMBUDGEE QLD 4615" 46 | ] 47 | } 48 | 49 | A non-fuzzy search gets the same score for all the top 10 hits and they are all correct matches, just not the one we were looking for. 50 | Unfortunately G-NAF contains many duplicates with inconsistent usage of the main name and aliases. 51 | Most reported potential errors are similarly not actual errors.
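This tie handling is implemented by `findHitIndices` in [src/main/script/searchLucene.js](src/main/script/searchLucene.js); distilled (the `hitIndex` name is just for this illustration):

    // a correct hit that ties the top score counts as the top hit (index 0)
    var aboutEqual = (a, b) => Math.abs(a - b) < Math.max(a, b) * 1.e-6;
    var hitIndex = (hits, correctAddress) => {
      var j = hits.findIndex(h => h.d61AddressNoAlias.indexOf(correctAddress) != -1);
      return j > 0 && aboutEqual(hits[j].score, hits[0].score) ? 0 : j;
    };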
52 | 53 | Baseline for the following comparisons: 54 | 55 | node src/main/script/summary.js -nofuz stats*.json 56 | {"samples":956,"histogram":[["0",955],["-1",1]]} 57 | 58 | Inputting a field out of order (postcode before state) loses bigram matches but only introduced one additional potential error: 59 | 60 | node src/main/script/summary.js -nofuzPostcodeBeforeState stats*.json 61 | {"samples":956,"histogram":[["0",954],["-1",2]]} 62 | 63 | Adding a single-character error and fuzzy matching also only introduced one additional potential error over the baseline: 64 | 65 | node src/main/script/summary.js -fuzTypo stats*.json 66 | {"samples":956,"histogram":[["0",954],["1",1],["-1",1]]} 67 | 68 |
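To read these summaries, the first histogram entry is the count of lookups whose correct result ranked first. A hypothetical helper (not part of this repo) to print that as a success rate:

    // successRate.js (hypothetical): given a file of summary.js output, print
    // the fraction of samples where the correct result was the top hit
    var fs = require('fs');
    var s = JSON.parse(fs.readFileSync(process.argv[2], 'utf8'));
    var top = s.histogram.find(e => e[0] === '0');
    console.log(((top ? top[1] : 0) / s.samples).toFixed(4));

For the overall results above this prints 0.9967 (4764 of 4780).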

gnaf-extractor-licenses

2 | 3 |
CategoryLicenseDependencyNotes
Apache Apache 2 io.spray # spray-json_2.11 # 1.3.2 
Apache Apache 2.0 License com.typesafe.scala-logging # scala-logging_2.11 # 3.1.0 
Apache Apache License, Version 2.0 com.typesafe # config # 1.2.1 
Apache The Apache Software License, Version 2.0 com.zaxxer # HikariCP-java6 # 2.3.7 
Apache The Apache Software License, Version 2.0 org.javassist # javassist # 3.19.0-GA 
BSD BSD au.csiro.data61.gnaf # gnaf-db_2.11 # 0.8-SNAPSHOT 
BSD BSD au.csiro.data61.gnaf # gnaf-util_2.11 # 0.8-SNAPSHOT 
BSD BSD 3-Clause org.scala-lang # scala-library # 2.11.8 
BSD BSD 3-Clause org.scala-lang # scala-reflect # 2.11.8 
BSD BSD-Style com.jsuereth # scala-arm_2.11 # 2.0.0-M1 
BSD Two-clause BSD-style license com.typesafe.slick # slick-codegen_2.11 # 3.1.1 
BSD Two-clause BSD-style license com.typesafe.slick # slick-hikaricp_2.11 # 3.1.1 
BSD Two-clause BSD-style license com.typesafe.slick # slick_2.11 # 3.1.1 
CC0 CC0 org.reactivestreams # reactive-streams # 1.0.0 
LGPL EPL + GNU Lesser General Public License ch.qos.logback # logback-classic # 1.1.3 
LGPL EPL + GNU Lesser General Public License ch.qos.logback # logback-core # 1.1.3 
MIT MIT License com.github.scopt # scopt_2.11 # 3.3.0 
MIT MIT License org.slf4j # slf4j-api # 1.7.12 
Mozilla MPL 2.0 or EPL 1.0 com.h2database # h2 # 1.4.193 
-------------------------------------------------------------------------------- /gnaf-lucene/README.md: -------------------------------------------------------------------------------- 1 | # gnaf-lucene 2 | 3 | ## Introduction 4 | 5 | This project produces a library of common code for indexing and searching G-NAF with [Lucene](https://lucene.apache.org/) 6 | and is used by `gnaf-indexer` and `gnaf-search`. 7 | 8 | ## Search Techniques 9 | 10 | ### Indexing 11 | 12 | The following G-NAF data is formatted into an array of strings (one array element per bullet point): 13 | 14 | - site name, building name (commas not included) 15 | - unit/flat, 16 | - level, 17 | - street (number ranges are formatted with a minus separator and no space, e.g. "2-4 Reed Street South"), 18 | - locality, state abbreviation, postcode; 19 | 20 | plus: 21 | 22 | - one array element for each street alias; and 23 | - one array element for each locality alias: locality alias, state abbreviation, postcode 24 | 25 | These strings are indexed into the same Lucene field using `WhitespaceTokenizer`, `LowerCaseFilter` and `ShingleFilter`, producing unigram and bigram tokens. 26 | Bigrams provide a reward for terms appearing in the above order. 27 | A PositionIncrementGap is used to prevent bigrams going across string boundaries, so that only ordering within each string is rewarded, not between them. 28 | 29 | A case where this indexing scheme doesn't work well is a user query for "2 17 SMITH STREET". We understand the 2 represents a unit/flat number, because if it were a level number it would need some text to indicate that. The flat number and street number appear in separate array elements, so "2 17" will not produce a bigram match. The "2" will only score as a unigram match to any "2", e.g. possibly a level or street number. In the case that an address has a flat number and a street number but no level, a flat number/street number bigram is added to the index specifically to handle queries of this form. 30 | 31 | A search for a street address with no flat specified should score a match to the street address with no flat higher than one with a spurious match to a flat. More generally, it is desirable to add a slight boost (less than the score increment for a correct match) to results with missing data for: site/building, flat, level, and street number. This is facilitated by adding a MISSING_DATA_TOKEN to the field F_MISSING_DATA for each missing data element from this list. 32 | 33 | ### Searching 34 | 35 | Query tokenization and filtering are as discussed above (under Indexing). 36 | Bigram term matches are boosted by a factor of 3 to reward correct ordering. 37 | MISSING_DATA_TOKEN is added to the query boosted by 0.05 to slightly boost results for each missing data element. 38 | 39 | ### Scoring 40 | 41 | Analysis of results using `gnaf-test` has shown that Lucene's default scoring based on language models doesn't work well with address data.
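As context for the overrides listed below, a minimal sketch (illustrative only, not the project's actual `AddressSimilarity`) of how these factors can be neutralized in Lucene 6.x:

    import org.apache.lucene.index.FieldInvertState
    import org.apache.lucene.search.similarities.ClassicSimilarity

    // neutralize the three TF-IDF factors described below
    class AddressSimilaritySketch extends ClassicSimilarity {
      override def lengthNorm(state: FieldInvertState): Float = 1.0f // aliases not penalized
      override def tf(freq: Float): Float = if (freq > 0f) 1.0f else 0.0f // repeats not rewarded
      override def idf(docFreq: Long, docCount: Long): Float = 1.0f // common names not penalized
    }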
42 | 43 | `AddressSimilarity` is used to override the default scoring: 44 | 45 | - length norm is disabled so that multiple aliases are not penalized 46 | - term frequency is disabled so that a matching street and locality name isn't unduly rewarded 47 | - document frequency is disabled so that common street names are not penalized 48 | 49 | `MissingDataSimilarity` overrides the scoring for the field F_MISSING_DATA: 50 | 51 | - length norm is disabled so that multiple tokens are not penalized 52 | - term frequency is enabled so that multiple tokens score more 53 | - document frequency is disabled (it's a constant as we only have one unique token) 54 | 55 | #### Suggested preprocessing for client applications 56 | 57 | People often use "2 / 12 BLAH STREET" for "UNIT 2 12 BLAH STREET" (which corresponds to the indexed format). 58 | Bigrams will provide a high score for "2 12 BLAH" but not for "2 / 12 BLAH", so "/" in the input should be replaced with a space. 59 | Similarly, any commas in the input should also be replaced with a space. 60 | 61 | The only non-alphanumeric characters worth keeping are '-' as a number range separator and the few characters that may appear 62 | in names, such as "-" and "'". 63 | -------------------------------------------------------------------------------- /LICENSE.txt: -------------------------------------------------------------------------------- 1 | CSIRO Open Source Software License Agreement (variation of the BSD / MIT License) 2 | Copyright (c) 2016, Commonwealth Scientific and Industrial Research Organisation (CSIRO) ABN 41 687 119 230. 3 | All rights reserved. CSIRO is willing to grant you a license to this G_NAF Search on the following terms, except where otherwise indicated for third party material. 4 | Redistribution and use of this software in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 5 | * Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 6 | * Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 7 | * Neither the name of CSIRO nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission of CSIRO. 8 | EXCEPT AS EXPRESSLY STATED IN THIS AGREEMENT AND TO THE FULL EXTENT PERMITTED BY APPLICABLE LAW, THE SOFTWARE IS PROVIDED "AS-IS". CSIRO MAKES NO REPRESENTATIONS, WARRANTIES OR CONDITIONS OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO ANY REPRESENTATIONS, WARRANTIES OR CONDITIONS REGARDING THE CONTENTS OR ACCURACY OF THE SOFTWARE, OR OF TITLE, MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NON-INFRINGEMENT, THE ABSENCE OF LATENT OR OTHER DEFECTS, OR THE PRESENCE OR ABSENCE OF ERRORS, WHETHER OR NOT DISCOVERABLE. 9 | TO THE FULL EXTENT PERMITTED BY APPLICABLE LAW, IN NO EVENT SHALL CSIRO BE LIABLE ON ANY LEGAL THEORY (INCLUDING, WITHOUT LIMITATION, IN AN ACTION FOR BREACH OF CONTRACT, NEGLIGENCE OR OTHERWISE) FOR ANY CLAIM, LOSS, DAMAGES OR OTHER LIABILITY HOWSOEVER INCURRED.
WITHOUT LIMITING THE SCOPE OF THE PREVIOUS SENTENCE THE EXCLUSION OF LIABILITY SHALL INCLUDE: LOSS OF PRODUCTION OR OPERATION TIME, LOSS, DAMAGE OR CORRUPTION OF DATA OR RECORDS; OR LOSS OF ANTICIPATED SAVINGS, OPPORTUNITY, REVENUE, PROFIT OR GOODWILL, OR OTHER ECONOMIC LOSS; OR ANY SPECIAL, INCIDENTAL, INDIRECT, CONSEQUENTIAL, PUNITIVE OR EXEMPLARY DAMAGES, ARISING OUT OF OR IN CONNECTION WITH THIS AGREEMENT, ACCESS OF THE SOFTWARE OR ANY OTHER DEALINGS WITH THE SOFTWARE, EVEN IF CSIRO HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH CLAIM, LOSS, DAMAGES OR OTHER LIABILITY. 10 | APPLICABLE LEGISLATION SUCH AS THE AUSTRALIAN CONSUMER LAW MAY APPLY REPRESENTATIONS, WARRANTIES, OR CONDITIONS, OR IMPOSES OBLIGATIONS OR LIABILITY ON CSIRO THAT CANNOT BE EXCLUDED, RESTRICTED OR MODIFIED TO THE FULL EXTENT SET OUT IN THE EXPRESS TERMS OF THIS CLAUSE ABOVE "CONSUMER GUARANTEES". TO THE EXTENT THAT SUCH CONSUMER GUARANTEES CONTINUE TO APPLY, THEN TO THE FULL EXTENT PERMITTED BY THE APPLICABLE LEGISLATION, THE LIABILITY OF CSIRO UNDER THE RELEVANT CONSUMER GUARANTEE IS LIMITED (WHERE PERMITTED AT CSIRO'S OPTION) TO ONE OF FOLLOWING REMEDIES OR SUBSTANTIALLY EQUIVALENT REMEDIES: 11 | (a) THE REPLACEMENT OF THE SOFTWARE, THE SUPPLY OF EQUIVALENT SOFTWARE, OR SUPPLYING RELEVANT SERVICES AGAIN; 12 | (b) THE REPAIR OF THE SOFTWARE; 13 | (c) THE PAYMENT OF THE COST OF REPLACING THE SOFTWARE, OF ACQUIRING EQUIVALENT SOFTWARE, HAVING THE RELEVANT SERVICES SUPPLIED AGAIN, OR HAVING THE SOFTWARE REPAIRED. 14 | IN THIS CLAUSE, CSIRO INCLUDES ANY THIRD PARTY AUTHOR OR OWNER OF ANY PART OF THE SOFTWARE OR MATERIAL DISTRIBUTED WITH IT. CSIRO MAY ENFORCE ANY RIGHTS ON BEHALF OF THE RELEVANT THIRD PARTY. 15 | Third Party Components 16 | The following third party components are distributed with the Software. You agree to comply with the license terms for these components as part of accessing the Software. Other third party software may also be identified in separate files distributed with the Software. 17 | ___________________________________________________________________ 18 | 19 | Please refer to the file: 3rd-party-licenses.html 20 | ___________________________________________________________________ 21 | 22 | 23 | -------------------------------------------------------------------------------- /gnaf-test/3rd-party-licenses.html: -------------------------------------------------------------------------------- 1 | gnaf-test-licenses

gnaf-test-licenses

2 | 3 |
CategoryLicenseDependencyNotes
Apache Apache 2 io.spray # spray-json_2.11 # 1.3.2 
Apache Apache 2.0 License com.typesafe.scala-logging # scala-logging_2.11 # 3.1.0 
Apache Apache License, Version 2.0 com.typesafe # config # 1.2.1 
Apache The Apache Software License, Version 2.0 com.zaxxer # HikariCP-java6 # 2.3.7 
Apache The Apache Software License, Version 2.0 org.javassist # javassist # 3.19.0-GA 
Apache the Apache License, ASL Version 2.0 org.scalactic # scalactic_2.11 # 3.0.0 
Apache the Apache License, ASL Version 2.0 org.scalatest # scalatest_2.11 # 3.0.0 
BSD BSD au.csiro.data61.gnaf # gnaf-db_2.11 # 0.8-SNAPSHOT 
BSD BSD au.csiro.data61.gnaf # gnaf-util_2.11 # 0.8-SNAPSHOT 
BSD BSD 3-Clause org.scala-lang # scala-library # 2.11.8 
BSD BSD 3-Clause org.scala-lang # scala-reflect # 2.11.8 
BSD BSD 3-clause org.scala-lang.modules # scala-parser-combinators_2.11 # 1.0.4 
BSD BSD 3-clause org.scala-lang.modules # scala-xml_2.11 # 1.0.5 
BSD BSD-Style com.jsuereth # scala-arm_2.11 # 2.0.0-M1 
BSD Two-clause BSD-style license com.typesafe.slick # slick-codegen_2.11 # 3.1.1 
BSD Two-clause BSD-style license com.typesafe.slick # slick-hikaricp_2.11 # 3.1.1 
BSD Two-clause BSD-style license com.typesafe.slick # slick_2.11 # 3.1.1 
CC0 CC0 org.reactivestreams # reactive-streams # 1.0.0 
LGPL EPL + GNU Lesser General Public License ch.qos.logback # logback-classic # 1.1.3 
LGPL EPL + GNU Lesser General Public License ch.qos.logback # logback-core # 1.1.3 
MIT MIT License com.github.scopt # scopt_2.11 # 3.3.0 
MIT MIT License org.slf4j # slf4j-api # 1.7.12 
Mozilla MPL 2.0 or EPL 1.0 com.h2database # h2 # 1.4.193 
-------------------------------------------------------------------------------- /gnaf-test/src/main/script/searchLucene.js: -------------------------------------------------------------------------------- 1 | var request = require('request'); 2 | var fs = require('fs'); 3 | var maps = require('./Maps.js'); 4 | 5 | Array.prototype.flatMap = function(f) { 6 | return this.map(f).flatten(); 7 | } 8 | Array.prototype.flatten = function() { 9 | return Array.prototype.concat.apply([], this); 10 | } 11 | 12 | /** 13 | * Usage: node src/main/script/searchLucene.js url input.json 14 | * Input: a JSON array of test cases (as produced by gnaf-test). Performs bulk lookups using the gnaf-search web service. 15 | * TODO: add proper command line option handling, add options to set numHits and bulk 16 | */ 17 | var url = process.argv[2]; // 'http://localhost:9040/bulkSearch' 18 | var path = process.argv[3]; // 0 -> node; 1 -> src/main/script/searchLucene.js; 2 -> url; 3 -> input.json 19 | var numHits = 10; 20 | 21 | var addr = JSON.parse(fs.readFileSync(path, "utf8")); 22 | // console.log('addr', addr); 23 | 24 | var bulk = Math.floor(50/3); 25 | var batches = []; 26 | for (var i = 0; i < addr.length; i += bulk) batches.push(addr.slice(i, Math.min(i + bulk, addr.length))); 27 | // console.log('bulk', bulk, 'batch sizes', batches.map(b => b.length)); 28 | 29 | /** 30 | * return array[i] = index j where hits[i].hits[j].d61AddressNoAlias contains qBatch[i].tc.address 31 | * exception: if j > 0 and hits[i].hits[j].score == hits[i].hits[0].score (i.e. hit is first equal score) then return array[i] = 0 32 | */ 33 | var findHitIndices = (qBatch, hits) => qBatch.map((q, i) => { 34 | var h = hits[i]; 35 | var j = h.hits.findIndex(h => h.d61AddressNoAlias.indexOf(q.tc.address) != -1); 36 | return j > 0 && aboutEqual(h.hits[j].score, h.hits[0].score) ? 0 : j; 37 | // h.hits[j].score == h.hits[0].score instead of aboutEqual appears to work just as well here 38 | }); 39 | 40 | var aboutEqual = (a, b) => Math.abs(a - b) < Math.max(a, b) * 1.e-6; 41 | 42 | 43 | /** 44 | * each input test case contains 3 different queries 45 | * @param tc a test case 46 | */ 47 | var queries = tc => [ 48 | {tc: tc, qstr: tc.query, desc: ''}, 49 | {tc: tc, qstr: tc.queryPostcodeBeforeState, desc: 'PostcodeBeforeState'}, 50 | {tc: tc, qstr: tc.queryTypo, desc:'Typo'} 51 | ]; 52 | 53 | var bulkQueryParam = (addresses, maxEdits) => ({addresses: addresses, numHits: numHits, fuzzy: { minLength: 5, maxEdits: maxEdits, prefixLength: 2} }); 54 | 55 | var done = (histMap, errMap) => console.log(JSON.stringify({ histogram: histMap.object(), errors: errMap.object() })); 56 | 57 | /** 58 | * Process a batch and on completion recursively do the next.
59 | * @param iter provides next batch 60 | * @param histMap test description -> histogram 61 | * where histogram is (index of correct hit (0 in best case) -> occurrence count for this index) 62 | * @param errMap test description -> index of correct hit -> array of addresses with this index 63 | */ 64 | function doBatch(iter, histMap, errMap) { 65 | var x = iter.next(); 66 | if (x.done) done(histMap, errMap); 67 | else { 68 | var batch = x.value; 69 | 70 | // array of batch.length * 3: { tc: tc, qstr: query address string, desc: description } 71 | var qBatch = batch.flatMap(queries); 72 | // console.log('batch.length', batch.length, 'qBatch.length', qBatch.length); // , 'qBatch', qBatch); 73 | var qAddr = qBatch.map(x => x.qstr); // array of query addresses from qBatch 74 | 75 | function responseHandler(qp, hits) { 76 | var idxs = findHitIndices(qBatch, hits); 77 | // console.log('idxs', idxs); 78 | // histogram(histMap, idxs); 79 | // console.log('histMap', histMap); 80 | idxs.forEach((v, i) => { 81 | var q = qBatch[i]; 82 | var desc = (qp.fuzzy.maxEdits == 0 ? 'nofuz' : 'fuz') + q.desc; 83 | histMap.inc(desc, v); 84 | if (v != 0) errMap.get(desc).append(v, q.qstr); 85 | }); 86 | if (qp.fuzzy.maxEdits == 0) { 87 | // on completing response for maxEdits == 0, do request with maxEdits == 2 88 | doRequest(bulkQueryParam(qAddr, 2)); 89 | } else { 90 | // on completing response for maxEdits == 2 recurse to do next batch 91 | doBatch(iter, histMap, errMap); 92 | } 93 | }; 94 | 95 | function doRequest(qp) { 96 | request.post( { url: url, json: true, body: qp }, (error, response, hits) => { 97 | if (error) console.log('error', error); 98 | else responseHandler(qp, hits); 99 | }); 100 | }; 101 | 102 | // do request with qp.fuzzyMaxEdits == 0 103 | doRequest(bulkQueryParam(qAddr, 0)); 104 | }; 105 | } 106 | 107 | 108 | doBatch(batches[Symbol.iterator](), new maps.MapHist(), new maps.MapMapCont(maps.ctorMapArr)); 109 | 110 | 111 | -------------------------------------------------------------------------------- /gnaf-contrib/src/main/scala/au/csiro/data61/gnaf/contrib/db/ContribTables.scala: -------------------------------------------------------------------------------- 1 | package au.csiro.data61.gnaf.contrib.db 2 | // AUTO-GENERATED Slick data model 3 | 4 | /** Slick data model trait for extension, choice of backend or usage in the cake pattern. (Make sure to initialize this late.) */ 5 | trait ContribTables { 6 | val profile: slick.driver.JdbcProfile 7 | import profile.api._ 8 | import slick.model.ForeignKeyAction 9 | // NOTE: GetResult mappers for plain SQL are only generated for tables where Slick knows how to map the types of all columns. 10 | import slick.jdbc.{GetResult => GR} 11 | 12 | /** DDL for all tables. Call .create to execute. 
*/ 13 | lazy val schema: profile.SchemaDescription = AddressSiteGeocode.schema 14 | @deprecated("Use .schema instead of .ddl", "3.0") 15 | def ddl = schema 16 | 17 | /** Entity class storing rows of table AddressSiteGeocode 18 | * @param id Database column ID SqlType(BIGINT), AutoInc, PrimaryKey 19 | * @param contribStatus Database column CONTRIB_STATUS SqlType(VARCHAR), Length(15,true) 20 | * @param addressSiteGeocodePid Database column ADDRESS_SITE_GEOCODE_PID SqlType(VARCHAR), Length(15,true) 21 | * @param dateCreated Database column DATE_CREATED SqlType(DATE) 22 | * @param version Database column VERSION SqlType(INTEGER) 23 | * @param addressSitePid Database column ADDRESS_SITE_PID SqlType(VARCHAR), Length(15,true) 24 | * @param geocodeTypeCode Database column GEOCODE_TYPE_CODE SqlType(VARCHAR), Length(4,true) 25 | * @param longitude Database column LONGITUDE SqlType(DECIMAL) 26 | * @param latitude Database column LATITUDE SqlType(DECIMAL) */ 27 | case class AddressSiteGeocodeRow(id: Option[Long], contribStatus: String, addressSiteGeocodePid: Option[String], dateCreated: java.sql.Date, version: Int, addressSitePid: String, geocodeTypeCode: String, longitude: scala.math.BigDecimal, latitude: scala.math.BigDecimal) 28 | /** GetResult implicit for fetching AddressSiteGeocodeRow objects using plain SQL queries */ 29 | implicit def GetResultAddressSiteGeocodeRow(implicit e0: GR[Long], e1: GR[String], e2: GR[Option[String]], e3: GR[java.sql.Date], e4: GR[Int], e5: GR[scala.math.BigDecimal]): GR[AddressSiteGeocodeRow] = GR{ 30 | prs => import prs._ 31 | AddressSiteGeocodeRow.tupled((<<[Option[Long]], <<[String], <<?[String], <<[java.sql.Date], <<[Int], <<[String], <<[String], <<[scala.math.BigDecimal], <<[scala.math.BigDecimal])) 32 | } 33 | /** Table description of table ADDRESS_SITE_GEOCODE. Objects of this class serve as prototypes for rows in queries. */ 34 | class AddressSiteGeocode(_tableTag: Tag) extends profile.api.Table[AddressSiteGeocodeRow](_tableTag, "ADDRESS_SITE_GEOCODE") { 35 | def * = (Rep.Some(id), contribStatus, addressSiteGeocodePid, dateCreated, version, addressSitePid, geocodeTypeCode, longitude, latitude) <> (AddressSiteGeocodeRow.tupled, AddressSiteGeocodeRow.unapply) 36 | /** Maps whole row to an option. Useful for outer joins. */ 37 | def ? = (Rep.Some(id), Rep.Some(contribStatus), addressSiteGeocodePid, Rep.Some(dateCreated), Rep.Some(version), Rep.Some(addressSitePid), Rep.Some(geocodeTypeCode), Rep.Some(longitude), Rep.Some(latitude)).shaped.<>({r=>import r._; _1.map(_=> AddressSiteGeocodeRow.tupled((_1, _2.get, _3, _4.get, _5.get, _6.get, _7.get, _8.get, _9.get)))}, (_:Any) => throw new Exception("Inserting into ?
projection not supported.")) 38 | 39 | /** Database column ID SqlType(BIGINT), AutoInc, PrimaryKey */ 40 | val id: Rep[Long] = column[Long]("ID", O.AutoInc, O.PrimaryKey) 41 | /** Database column CONTRIB_STATUS SqlType(VARCHAR), Length(15,true) */ 42 | val contribStatus: Rep[String] = column[String]("CONTRIB_STATUS", O.Length(15,varying=true)) 43 | /** Database column ADDRESS_SITE_GEOCODE_PID SqlType(VARCHAR), Length(15,true) */ 44 | val addressSiteGeocodePid: Rep[Option[String]] = column[Option[String]]("ADDRESS_SITE_GEOCODE_PID", O.Length(15,varying=true)) 45 | /** Database column DATE_CREATED SqlType(DATE) */ 46 | val dateCreated: Rep[java.sql.Date] = column[java.sql.Date]("DATE_CREATED") 47 | /** Database column VERSION SqlType(INTEGER) */ 48 | val version: Rep[Int] = column[Int]("VERSION") 49 | /** Database column ADDRESS_SITE_PID SqlType(VARCHAR), Length(15,true) */ 50 | val addressSitePid: Rep[String] = column[String]("ADDRESS_SITE_PID", O.Length(15,varying=true)) 51 | /** Database column GEOCODE_TYPE_CODE SqlType(VARCHAR), Length(4,true) */ 52 | val geocodeTypeCode: Rep[String] = column[String]("GEOCODE_TYPE_CODE", O.Length(4,varying=true)) 53 | /** Database column LONGITUDE SqlType(DECIMAL) */ 54 | val longitude: Rep[scala.math.BigDecimal] = column[scala.math.BigDecimal]("LONGITUDE") 55 | /** Database column LATITUDE SqlType(DECIMAL) */ 56 | val latitude: Rep[scala.math.BigDecimal] = column[scala.math.BigDecimal]("LATITUDE") 57 | } 58 | /** Collection-like TableQuery object for table AddressSiteGeocode */ 59 | lazy val AddressSiteGeocode = new TableQuery(tag => new AddressSiteGeocode(tag)) 60 | } 61 | -------------------------------------------------------------------------------- /gnaf-lucene/src/test/scala/au/csiro/data61/gnaf/lucene/GnafLuceneTest.scala: -------------------------------------------------------------------------------- 1 | package au.csiro.data61.gnaf.lucene 2 | 3 | import org.apache.lucene.document.{ Document, DoublePoint, Field } 4 | import org.apache.lucene.search.ScoreDoc 5 | import org.apache.lucene.store.{ Directory, RAMDirectory } 6 | import org.scalatest.{ Finders, FlatSpec, Matchers } 7 | 8 | import GnafLucene._ 9 | import LuceneUtil.Searcher 10 | import au.csiro.data61.gnaf.util.Util.getLogger 11 | import resource.managed 12 | 13 | /** 14 | * More a test bed for: 18 | * than a conventional unit test. 
19 | */ 20 | class GnafLuceneTest extends FlatSpec with Matchers { 21 | val log = getLogger(getClass) 22 | 23 | val s = "some test string" 24 | 25 | "countOccurences" should "count" in { 26 | for { 27 | (x, n) <- Seq((" ", 2), ("in", 1), (",", 0)) 28 | } countOccurrences(s, x) should be(n) 29 | 30 | countOccurrences("", "some") should be(0) 31 | } 32 | 33 | it should "throw AssertionError on empty find string" in { 34 | a [AssertionError] should be thrownBy { 35 | countOccurrences(s, "") 36 | } 37 | } 38 | 39 | case class Hit(id: Int, score: Float, d61Address: List[String], d61AddressNoAlias: String) 40 | def toHit(scoreDoc: ScoreDoc, doc: Document) = { 41 | Hit(scoreDoc.doc, scoreDoc.score, doc.getValues(F_ADDRESS).toList, doc.get(F_ADDRESS_NOALIAS)) 42 | } 43 | 44 | case class Result(totalHits: Int, elapsedSecs: Float, hits: Seq[Hit], error: Option[String]) 45 | def toResult(totalHits: Int, elapsedSecs: Float, hits: Seq[Hit], error: Option[String]) 46 | = Result(totalHits, elapsedSecs, hits, error) 47 | 48 | def mkSearcher(dir: Directory) = { 49 | val s = new Searcher(dir, toHit, toResult) 50 | s.searcher.setSimilarity(GnafSimilarity) 51 | s 52 | } 53 | 54 | def mkDoc(addr: (Seq[String], Option[String], Int, String, Double, Double)) = { 55 | val d = new Document 56 | for (a <- addr._1) { 57 | log.debug(s"mkDoc: add: $a") 58 | d.add(new Field(F_ADDRESS, a, addressFieldType)) 59 | } 60 | for (a <- addr._2) { 61 | log.debug(s"mkDoc: add: $a") 62 | d.add(new Field(F_ADDRESS, a, addressFieldType)) 63 | } 64 | for (i <- 0 until addr._3) d.add(new Field(F_MISSING_DATA, MISSING_DATA_TOKEN, missingDataFieldType)) 65 | d.add(new Field(F_ADDRESS_NOALIAS, addr._4, storedNotIndexedFieldType)) 66 | d.add(new DoublePoint(F_LOCATION, addr._5, addr._6)) 67 | d 68 | } 69 | 70 | "searcher" should "find" in { 71 | for (dir <- managed(new RAMDirectory)) { 72 | for (indexer <- managed(mkIndexer(dir))) { 73 | Seq( // v noneCount = number of fields with missing data: streetNo, build/site, flat, level 74 | (Seq("3204 INVERNESS ROAD", "DUMGREE QLD 4715"), None, 3, "3204 INVERNESS ROAD DUMGREE QLD 4715", 0.5d, 10.5d), 75 | (Seq("INVERNESS ROAD", "DUMGREE QLD 4715"), None, 4, "INVERNESS ROAD DUMGREE QLD 4715", 0.7d, 11.5d), 76 | (Seq("FLAT 1", "2400 INVERNESS ROAD", "DUMGREE QLD 4715"), Some("1" + BIGRAM_SEPARATOR + "2400"), 2, "FLAT 1 2400 INVERNESS ROAD DUMGREE QLD 4715", 0d, 10d) 77 | ).foreach(a => indexer.addDocument(mkDoc(a))) 78 | } // indexer.close 79 | 80 | for (searcher <- managed(mkSearcher(dir))) { 81 | // addr: String, numHits: Int, minFuzzyLength: Int, fuzzyMaxEdits: Int, fuzzyPrefixLength: Int 82 | { 83 | val q = QueryParam("INVERNESS ROAD DUMGREE QLD 4715", 3, None, None).toQuery 84 | val r = searcher.search(q, 3) 85 | log.debug(r.toString) 86 | for (h <- r.hits) { 87 | log.debug(h.toString) 88 | log.debug(searcher.searcher.explain(q, h.id).toString) 89 | } 90 | r.hits(0).d61AddressNoAlias should be("INVERNESS ROAD DUMGREE QLD 4715") 91 | // Lucene docId is 0, 1, 2 in order that docs are indexed 92 | r.hits.map(_.id) should be(Seq(1, 0, 2)) // in order of decreasing noneCount: 4, 3, 2 93 | } 94 | 95 | { 96 | val q = QueryParam("FLAT 1", 0, None, None).toQuery 97 | val r = searcher.search(q, 1) 98 | log.debug(r.toString) 99 | val h = r.hits(0) 100 | log.debug(h.toString) 101 | log.debug(searcher.searcher.explain(q, h.id).toString) 102 | 103 | val q2 = QueryParam("1 2400", 0, None, None).toQuery 104 | val r2 = searcher.search(q2, 1) 105 | log.debug(r2.toString) 106 | val h2 = r2.hits(0) 107 | 
log.debug(h2.toString) 108 | log.debug(searcher.searcher.explain(q2, h2.id).toString) 109 | } 110 | 111 | { 112 | val q = DoublePoint.newRangeQuery(F_LOCATION, Array[Double](-0.25, 9.75), Array[Double](0.75, 10.75)) 113 | val r = searcher.search(q, 3) 114 | log.debug(r.toString) 115 | r.hits.map(_.id).toSet should be(Set(0, 2)) // doc 1 not in box 116 | } 117 | } // searcher.close 118 | } // dir.close 119 | } 120 | } -------------------------------------------------------------------------------- /gnaf-extractor/src/main/script/gnafMapping.json: -------------------------------------------------------------------------------- 1 | { 2 | 3 | "settings": { "index": { 4 | "number_of_shards" : "4", 5 | "number_of_replicas" : "0", 6 | "refresh_interval": "60s", 7 | "query.default_field": "d61Address", 8 | "analysis": { 9 | "analyzer": { 10 | "d61Whitespace": { 11 | "tokenizer": "whitespace", 12 | "filter": [ "lowercase" ] 13 | }, 14 | "d61Shingle": { 15 | "tokenizer": "whitespace", 16 | "filter": [ "lowercase", "filter_shingle" ] 17 | } 18 | }, 19 | "filter": { 20 | "filter_shingle": { 21 | "type": "shingle", 22 | "max_shingle_size": 2, 23 | "min_shingle_size": 2, 24 | "output_unigrams": "true" 25 | } 26 | } 27 | } 28 | } }, 29 | 30 | "mappings": { "gnaf": { 31 | "_all": { "enabled": false }, 32 | "properties": { 33 | // comments not normally allowed in JSON, but this doesn't appear to break Elasticsearch 34 | "d61AddressNoAlias": { "type": "string", "index": "no" }, 35 | "addressDetailPid": { "type": "string", "index": "not_analyzed" }, 36 | "addressSiteName": { "type": "string", "analyzer": "d61Whitespace", "null_value": "D61_NULL" }, 37 | "buildingName": { "type": "string", "analyzer": "d61Whitespace", "null_value": "D61_NULL" }, 38 | 39 | "flatTypeCode": { "type": "string", "index": "not_analyzed", "null_value": "D61_NULL" }, 40 | "flatTypeName": { "type": "string", "analyzer": "d61Whitespace", "null_value": "D61_NULL" }, 41 | "flat": { "type": "object", "properties": { 42 | "prefix": { "type": "string", "index": "not_analyzed", "null_value": "D61_NULL" }, 43 | "number": { "type": "integer", "null_value": -1 }, 44 | "suffix": { "type": "string", "index": "not_analyzed", "null_value": "D61_NULL" } 45 | } }, 46 | 47 | "levelTypeCode": { "type": "string", "index": "not_analyzed", "null_value": "D61_NULL" }, 48 | "levelTypeName": { "type": "string", "analyzer": "d61Whitespace", "null_value": "D61_NULL" }, // UPPER GROUND FLOOR 49 | "level": { "type": "object", "properties": { 50 | "prefix": { "type": "string", "index": "not_analyzed", "null_value": "D61_NULL" }, 51 | "number": { "type": "integer", "null_value": -1 }, 52 | "suffix": { "type": "string", "index": "not_analyzed", "null_value": "D61_NULL" } 53 | } }, 54 | 55 | "numberFirst": { "type": "object", "properties": { 56 | "prefix": { "type": "string", "index": "not_analyzed", "null_value": "D61_NULL" }, 57 | "number": { "type": "integer", "null_value": -1 }, 58 | "suffix": { "type": "string", "index": "not_analyzed", "null_value": "D61_NULL" } 59 | } }, 60 | 61 | "numberLast": { "type": "object", "properties": { 62 | "prefix": { "type": "string", "index": "not_analyzed" }, 63 | "number": { "type": "integer", "null_value": -1 }, 64 | "suffix": { "type": "string", "index": "not_analyzed" } 65 | } }, 66 | 67 | "street": { "type": "object", "properties": { 68 | "name": { "type": "string", "analyzer": "d61Whitespace", "null_value": "D61_NULL" }, 69 | "typeCode": { "type": "string", "analyzer": "d61Whitespace", "null_value": "D61_NULL" }, // 
reversed from other lookup tables! 70 | "typeName": { "type": "string", "index": "not_analyzed", "null_value": "D61_NULL" }, // code is long, name is short abbreviation 71 | "suffixCode": {"type": "string", "index": "not_analyzed", "null_value": "D61_NULL" }, 72 | "suffixName": { "type": "string", "analyzer": "d61Whitespace", "null_value": "D61_NULL" } 73 | } }, 74 | 75 | "localityName": { "type": "string", "analyzer": "d61Whitespace", "null_value": "D61_NULL" }, 76 | "stateAbbreviation": { "type": "string", "index": "not_analyzed", "null_value": "D61_NULL" }, 77 | "stateName": { "type": "string", "analyzer": "d61Whitespace", "null_value": "D61_NULL" }, 78 | "postcode": { "type": "string", "index": "not_analyzed", "null_value": "D61_NULL" }, // string in GNAF, not a bad choice as not used as a number 79 | 80 | "aliasPrincipal": { "type": "string", "index": "not_analyzed", "null_value": "0" }, // TODO: in H2 & Scala this is Option[Char] so "0" used rather than "D61_NULL" 81 | "primarySecondary": { "type": "string", "index": "not_analyzed", "null_value": "0" }, // however if we convert the null here we could use the latter 82 | 83 | "location": { "type": "geo_point" }, 84 | 85 | "streetVariant": { "type": "nested", "properties": { 86 | "name": { "type": "string", "analyzer": "d61Whitespace", "null_value": "D61_NULL" }, 87 | "typeCode": { "type": "string", "index": "not_analyzed", "null_value": "D61_NULL" }, 88 | "typeName": { "type": "string", "analyzer": "d61Whitespace", "null_value": "D61_NULL" }, 89 | "suffixCode": {"type": "string", "index": "not_analyzed", "null_value": "D61_NULL" }, 90 | "suffixName": { "type": "string", "analyzer": "d61Whitespace", "null_value": "D61_NULL" } 91 | } }, 92 | 93 | "localityVariant": { "type": "nested", "properties": { 94 | "localityName": { "type": "string", "analyzer": "d61Whitespace", "null_value": "D61_NULL" } 95 | } } 96 | } } 97 | } 98 | 99 | } -------------------------------------------------------------------------------- /src/main/script/run.sh: -------------------------------------------------------------------------------- 1 | #! /bin/bash 2 | # script to run the whole thing 3 | # executable documentation 4 | # you might not want to run all this each time 5 | 6 | set -ex 7 | version=`sed 's/.*"\(.*\)"/\1/' version.sbt` 8 | scalaVersion=2.11 9 | 10 | # optional recovery from gnaf-extractor connection timeout after successful population of database 11 | if [[ "$1" != "skip" ]]; then 12 | 13 | # === Delete/Create database === 14 | 15 | if [[ -f ~/gnaf.mv.db ]]; then 16 | rm -f ~/gnaf-old.mv.db 17 | mv ~/gnaf{,-old}.mv.db 18 | fi 19 | # rm -rf gnaf-db/data/unzipped 20 | 21 | # create SQL load script 22 | ( cd gnaf-db; src/main/script/createGnafDb.sh;) 23 | 24 | if [[ $? -eq 5 ]]; then echo "no new data found, cancelling build"; exit 0; fi 25 | 26 | # build scala projects 27 | # 1. above gnaf-db/src/main/script/createGnafDb.sh creates gnaf-db/target/generated/version.json 28 | # 2. this version.json file is included in the gnaf-search jar by gnaf-search/build.sbt (so we need to build after running the above script) 29 | # 3. h2 (used below) is downloaded by the build if necessary, so we need build before running h2 30 | sbt one-jar 31 | 32 | # run h2 with postgres protocol, remembering its PID 33 | # get h2 version 34 | h2ver=$( sed --quiet --regexp-extended '/com.h2database/s/.*"h2"[^"]*"([^"]*)".*/\1/p' gnaf-db/build.sbt ) 35 | echo $h2ver 36 | 37 | java -Xmx3G -jar ~/.ivy2/cache/com.h2database/h2/jars/h2-${h2ver}.jar -web -pg & 38 | H2_PID=$! 
39 | sleep 10 40 | 41 | # set psql gnaf password to gnaf 42 | [[ -r ~/.pgpass ]] && grep -q gnaf ~/.pgpass || { 43 | echo "localhost:5435:~/gnaf:gnaf:gnaf" >> ~/.pgpass 44 | chmod 600 ~/.pgpass 45 | } 46 | 47 | # run load script using Postgres client, takes about 90 minutes with a SSD 48 | # see gnaf-db/README.md for an alternative method using the h2 client 49 | psql --host=localhost --port=5435 --username=gnaf --dbname=~/gnaf < gnaf-db/data/createGnafDb.sql 50 | 51 | # attempt to avoid gnaf-extractor failing below with: java.sql.SQLTimeoutException: Timeout after 10000ms of waiting for a connection 52 | sleep 10 53 | 54 | # stop h2 55 | kill $H2_PID 56 | wait 57 | 58 | fi 59 | 60 | # === Extract JSON address data and load into Lucene === 61 | 62 | # takes about 23 min 63 | time java -Xmx3G -jar gnaf-extractor/target/scala-${scalaVersion}/gnaf-extractor_${scalaVersion}-${version}-one-jar.jar | gzip > addresses.gz 64 | 65 | # takes about 13 min 66 | time zcat addresses.gz | java -jar gnaf-indexer/target/scala-${scalaVersion}/gnaf-indexer_${scalaVersion}-${version}-one-jar.jar 67 | 68 | # 69 | # 70 | ## === demo gnaf-search and gnaf-test === 71 | # 72 | #java -jar gnaf-search/target/scala-${scalaVersion}/gnaf-search_${scalaVersion}-${version}-one-jar.jar & 73 | #SEARCH_PID=$! 74 | #sleep 15 # we could wait for it to log a message 75 | # 76 | #echo "gnaf-search: swagger.json ..." 77 | #curl http://localhost:9040/api-docs/swagger.json 78 | #curl -X POST --header 'Content-Type: application/json' --header 'Accept: application/json' -d '{ 79 | # "addr": "137-~45 CHEVALLUM SCHOOL ROAD PALMWOODS QLD 4555", 80 | # "numHits": 3, 81 | # "fuzzy": { 82 | # "maxEdits": 2, 83 | # "minLength": 5, 84 | # "prefixLength": 2 85 | # } 86 | #}' 'http://localhost:9040/search' 87 | #echo 88 | # 89 | ## takes about 12 min 90 | ## gnaf-search must be running 91 | ## gnaf-db-service must not be running (both use the gnaf database in embedded mode, to run at the same time they would need 92 | ## to use different databases or not use embedded mode). 93 | #echo "gnaf-test ..." 94 | #cd gnaf-test 95 | #npm install 96 | #time src/main/script/run.sh 97 | #cd .. 98 | # 99 | ## === demo gnaf-db-service === 100 | # 101 | #java -jar gnaf-db-service/target/scala-${scalaVersion}/gnaf-db-service_${scalaVersion}-${version}-one-jar.jar & 102 | #DB_PID=$! 103 | #sleep 15 104 | # 105 | #echo "gnaf-db-service: swagger.json ..." 106 | #curl http://localhost:9000/api-docs/swagger.json 107 | #echo "get geocode types and descriptions ..." 108 | #curl 'http://localhost:9000/gnaf/geocodeType' 109 | #echo "get type of address e.g. RURAL, often missing, for an addressDetailPid ..." 110 | #curl 'http://localhost:9000/gnaf/addressType/GANSW716635201' 111 | #echo "get all geocodes for an addressDetailPid, almost always 1, sometimes 2, never more ..." 112 | #curl 'http://localhost:9000/gnaf/addressGeocode/GASA_414912543' 113 | #echo 114 | # 115 | # 116 | ## === demo gnaf-contrib === 117 | # 118 | #java -jar gnaf-contrib/target/scala-${scalaVersion}/gnaf-contrib_${scalaVersion}-${version}-one-jar.jar & 119 | #CONTRIB_PID=$! 120 | #sleep 15 121 | # 122 | #echo "gnaf-contrib: swagger.json ..." 123 | #curl http://localhost:9010/api-docs/swagger.json 124 | #echo "add contributed geocode for an addressSite ..." 
125 | #curl -XPOST 'http://localhost:9010/contrib/' -H 'Content-Type:application/json' -d '{ 126 | # "contribStatus":"Submitted","addressSitePid":"712279621","geocodeTypeCode":"EM", 127 | # "longitude":149.1213974,"latitude":-35.280994199999995,"dateCreated":0,"version":0 128 | #}' 129 | #echo "list contributed geocodes for an addressSite ..." 130 | #curl 'http://localhost:9010/contrib/712279621' 131 | ## there are also delete and update methods 132 | # 133 | ## === Stop JSON web services === 134 | # 135 | #kill $SEARCH_PID 136 | #kill $DB_PID 137 | #kill $CONTRIB_PID 138 | #wait 139 | -------------------------------------------------------------------------------- /gnaf-db/src/main/script/createGnafDb.sh: -------------------------------------------------------------------------------- 1 | #! /bin/bash 2 | # script to download and unpack GNAF and write a SQL script to load it. 3 | set -ex 4 | 5 | baseDir=$PWD 6 | scriptDir=$baseDir/src/main/script 7 | dataDir=$baseDir/data 8 | mkdir -p $dataDir 9 | 10 | # JSON URL from near top-right of: http://www.data.gov.au/dataset/geocoded-national-address-file-g-naf 11 | jsonUrl=http://www.data.gov.au/api/3/action/package_show?id=19432f89-dc3a-4ef3-b943-5326ef1dbecc 12 | # get data URL for current version from JSON 13 | curl -sL $jsonUrl > meta.json 14 | dataUrl=$( jq -r '.result.resources[] | select(.format == "ZIP") | .url' meta.json ) 15 | last_modified=$( jq -r '.result.resources[] | select(.format == "ZIP") | .last_modified' meta.json ) 16 | 17 | # download ZIP data file unless already done 18 | zip=$dataDir/${dataUrl##*/} 19 | [[ -f "$zip" ]] || ( cd $dataDir; wget "$dataUrl" ) 20 | 21 | unzipped=$dataDir/unzipped 22 | # get dir path where the zip file's */Extras/ will be extracted (contains release month so releases don't clobber each other) 23 | # get path from zip, discard leading info up to time and following spaces, keep the rest apart from the trailing / 24 | # maybe a bit too brittle? 25 | gnafExtras="$unzipped/$( unzip -l "$zip" '*/Extras/' | sed -rn '/Extras/s~^.*[0-9][0-9]:[0-9][0-9] *(.*)/$~\1~p' )" 26 | # unzip unless $gnafExtras already exists 27 | [[ -d "$gnafExtras" ]] || ( mkdir -p $unzipped; cd $unzipped; unzip $zip ) 28 | # get dir path parent of Standard/ 29 | gnafData="$unzipped/$( unzip -l "$zip" '*/Standard/' | sed -rn '/Standard/s~^.*[0-9][0-9]:[0-9][0-9] *(.*)/Standard/$~\1~p' )" 30 | 31 | mkdir -p target/generated 32 | cat > target/generated/version.json < $dataDir/createGnafDb.sql 128 | 129 | cat <<-'EoF' 130 | 131 | Start H2 database engine with: java -jar h2*.jar -web -pg 132 | Create an empty database by connecting to a new dburl e.g. jdbc:h2:file:~/gnaf (specify 'gnaf' as the username and password). 133 | In the SQL input area enter: RUNSCRIPT FROM 'data/createGnafDb.sql' 134 | or paste in the content of this file (to get progress feedback lacking with RUNSCRIPT). 135 | After an hour (with SSD) you should have a GNAF database. 136 | EoF 137 | 138 | 139 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # gnaf 2 | 3 | ## Introduction 4 | This project: 5 | 6 | - loads the [G-NAF data set](http://www.data.gov.au/dataset/geocoded-national-address-file-g-naf) into a relational database and search engine; 7 | - provides JSON web services to access the database and search engine; and 8 | - provides a demonstration web user interface using the web services. 
9 | 10 | Users of `gnaf-search` should note the [suggested preprocessing](gnaf-lucene/README.md#suggested-preprocessing-for-client-applications) for 11 | query strings. 12 | 13 | ## Project Structure 14 | These sub-directories contain sub-projects: 15 | 16 | 1. gnaf-util: common code 17 | 2. gnaf-db: scripts to load the [G-NAF data set](http://www.data.gov.au/dataset/geocoded-national-address-file-g-naf) into a relational database 18 | and [Slick](http://slick.typesafe.com/) "Functional Relational Mapping" bindings for the database. 19 | The README.md discusses the H2 database and G-NAF data. 20 | 3. gnaf-extractor: queries the database to produce JSON address data 21 | 4. gnaf-lucene: common code for indexing and searching G-NAF with [Lucene](https://lucene.apache.org/). 22 | The README.md discusses the search techniques used. 23 | 5. gnaf-indexer: loads JSON address data into a [Lucene](https://lucene.apache.org/) index 24 | 6. gnaf-search: JSON web service to search the [Lucene](https://lucene.apache.org/) index 25 | 7. gnaf-test: queries the database to produce test address data with many variations, plus scripts to perform bulk lookups of the test data and evaluate results 26 | 8. gnaf-db-service: JSON web service providing access to the G-NAF database 27 | 9. gnaf-contrib: a JSON web service providing access to the gnafContrib database of user-supplied geocodes 28 | 10. gnaf-ui: static files providing a demonstration web user interface using gnaf-search, gnaf-db-service and gnaf-contrib. 29 | 30 | Nature of Sub-projects: 31 | 32 | - 1, 2 & 4 produce a jar file of library code used by other sub-projects 33 | - 3, 5 & 7 produce command line programs packaged as a [onejar](https://github.com/sbt/sbt-onejar). 34 | This is a jar file containing all dependencies and is run simply with: `java -jar {filename.jar}` 35 | - 6, 8 & 9 produce JSON web services also packaged as a [onejar](https://github.com/sbt/sbt-onejar). 36 | These are run as above (not in a servlet container). They produce [Swagger](http://swagger.io/) API documentation at `/api-docs/swagger.json`. 37 | 38 | The top level directory provides: 39 | - the [sbt](http://www.scala-sbt.org/) build for the [Scala](http://scala-lang.org/) code in projects 1-9 (no build is required for 10); and 40 | - [src/main/script/run.sh](src/main/script/run.sh) to run everything, but first: 41 | - take a look, as it's intended as executable documentation and you may not wish to run it all each time 42 | - install tools 43 | 44 | ## Install Tools 45 | 46 | To run the Scala code, install: 47 | - a JRE e.g. from openjdk-8 (version 8 or higher is required by some dependencies); 48 | - the build tool [sbt](http://www.scala-sbt.org/). 49 | 50 | To develop [Scala](http://scala-lang.org/) code, install: 51 | - the above items (you may prefer to install the full JDK instead of just the JRE, but I think the JRE is sufficient); 52 | - the [Scala IDE](http://scala-ide.org/download/current.html). 53 | 54 | ### Dependencies 55 | 56 | - scripts assume a *nix environment 57 | - [gnaf-db/src/main/script/createGnafDb.sh](gnaf-db/src/main/script/createGnafDb.sh) requires [jq](https://stedolan.github.io/jq/) 58 | - [src/main/script/run.sh](src/main/script/run.sh) requires: 59 | - `jq` (because it runs `createGnafDb.sh`) 60 | - the Postgres client `psql` to load the database (see [gnaf-db](gnaf-db) for an alternative method using the h2 client); 61 | - `node` and `npm` to run [gnaf-test](gnaf-test) (see its README).
62 | - the `/version` endpoint provided by `gnaf-search` reports the software and data version, but relies on a file created by `createGnafDb.sh` 63 | being available when gnaf-search is built. `run.sh` does things in the right order for this to work. 64 | 65 | ## Running and Usage 66 | 67 | See [src/main/script/run.sh](src/main/script/run.sh). 68 | 69 | ## Build 70 | 71 | Automatic builds are available at: https://t3as-jenkins.it.csiro.au/ (only within the CSIRO network). 72 | 73 | The command: 74 | 75 | sbt clean test one-jar dumpLicenseReport 76 | 77 | from the project's top level directory cleans out previous build products, runs unit tests, 78 | builds one-jar files and creates license reports on dependencies. 79 | 80 | ## Develop With Eclipse 81 | 82 | The command: 83 | 84 | sbt update-classifiers eclipse 85 | 86 | uses the [sbteclipse](https://github.com/typesafehub/sbteclipse/wiki/Using-sbteclipse) plugin to create the .project and .classpath files required by Eclipse (with source attachments for dependencies). 87 | 88 | ## Software License 89 | 90 | This software is released under the CSIRO BSD license - see `LICENSE.txt`. 91 | Each of the sub-projects lists its dependencies and their licenses in `3rd-party-licenses.html`. 92 | 93 | ## Data License 94 | 95 | Incorporates or developed using G-NAF ©PSMA Australia Limited licensed by the Commonwealth of Australia under the 96 | [Open Geo-coded National Address File (G-NAF) End User Licence Agreement](http://data.gov.au/dataset/19432f89-dc3a-4ef3-b943-5326ef1dbecc/resource/09f74802-08b1-4214-a6ea-3591b2753d30/download/20160226---EULA---Open-G-NAF.pdf). 97 | 98 | -------------------------------------------------------------------------------- /gnaf-test/src/main/script/searchEs.js: -------------------------------------------------------------------------------- 1 | var request = require('request'); 2 | var fs = require('fs'); 3 | var maps = require('./Maps.js'); 4 | 5 | Array.prototype.flatMap = function(f) { 6 | return this.map(f).flatten(); 7 | } 8 | Array.prototype.flatten = function() { 9 | return Array.prototype.concat.apply([], this); 10 | } 11 | 12 | /** 13 | * Usage: node src/main/script/searchEs.js url input.json 14 | * Input: a JSON array of test cases (as produced by gnaf-test). Performs bulk lookups using the Elasticsearch index created by gnaf-indexer.
15 | * TODO: add proper command line option handling, add options to set numHits and bulk
16 | */
17 | var url = process.argv[2]; // 'http://localhost:9200/gnaf/_msearch'
18 | var path = process.argv[3]; // 0 -> node; 1 -> src/main/script/searchEs.js; 2 -> url; 3 -> input.json
19 | var numHits = 10;
20 |
21 | var addr = JSON.parse(fs.readFileSync(path, "utf8"));
22 | // console.log('addr', addr);
23 |
24 | var bulk = 10;
25 | var batches = [];
26 | for (var i = 0; i < addr.length; i += bulk) batches.push(addr.slice(i, Math.min(i + bulk, addr.length)));
27 | // console.log('batches', batches);
28 |
29 | /** return array[i] = index j where esHits.responses[i].hits.hits[j].fields.d61AddressNoAlias[0] contains qBatch[i].tc.address */
30 | var findHitIndices = (qBatch, esHits) => qBatch.map( (q, i) =>
31 | esHits.responses[i].hits.hits.findIndex(h => h.fields.d61AddressNoAlias[0].indexOf(q.tc.address) != -1)
32 | );
33 |
34 | /**
35 | * @return non-fuzzy elasticsearch query
36 | *
37 | * @param qstr a query address string
38 | *
39 | * If we don't specify "fields" we get _source.d61AddressNoAlias as a String,
40 | * however if we do specify "fields" _source is omitted and we get fields.d61AddressNoAlias as an array of Strings (with just 1 element).
41 | */
42 | var esNoFuz = qstr =>
43 | ({
44 | query:{ match:{ d61Address: qstr }},
45 | fields:[ "d61AddressNoAlias" ],
46 | size:numHits
47 | });
48 |
49 | /**
50 | * @return fuzzy elasticsearch query
51 | *
52 | * @param qstr a query address string
53 | */
54 | var esFuz = qstr =>
55 | ({
56 | query:{ match:{ d61Address:{ query: qstr, fuzziness: 2, prefix_length: 2 }}},
57 | // rescore:{ query:{ rescore_query:{ match:{ d61Address:{ query: qstr }}}, query_weight: 0 }}, why did I think this was a good idea???
58 | fields: [ "d61AddressNoAlias" ],
59 | size: numHits
60 | });
61 |
62 | /** @return array elements for a non-fuzzy and a fuzzy search */
63 | var mkEs = (tc, qstr, desc) =>
64 | [
65 | { tc: tc, qstr: qstr, qes: esNoFuz(qstr), desc: 'nofuz' + desc },
66 | { tc: tc, qstr: qstr, qes: esFuz(qstr), desc: 'fuz' + desc }
67 | ];
68 |
69 | /**
70 | * 6 combinations of queries: 3 different queries, each with and without fuzzy search
71 | * @return array of { tc: tc, qstr: query address string, qes: elasticsearch query, desc: description }
72 | * @param tc a test case
73 | */
74 | var queries = tc => [
75 | mkEs(tc, tc.query, ''),
76 | mkEs(tc, tc.queryPostcodeBeforeState, 'PostcodeBeforeState'),
77 | mkEs(tc, tc.queryTypo, 'Typo')
78 | ].flatten();
79 |
80 |
81 | // comparator to sort by score then shortest d61AddressNoAlias first
82 | var scoreThenLength = (a, b) =>
83 | b._score != a._score ? b._score - a._score
84 | : a.fields.d61AddressNoAlias[0].length - b.fields.d61AddressNoAlias[0].length;
85 |
86 | // sort each esHits.responses[i].hits.hits according to comparator cmp
87 | var sortHits = (esHits, cmp) => {
88 | esHits.responses.forEach(r => r.hits.hits.sort(cmp));
89 | return esHits;
90 | };
91 |
92 | var done = (histMap, errMap) => console.log(JSON.stringify({ histogram: histMap.object(), errors: errMap.object() }));
93 |
94 | /**
95 | * Process a batch and on completion recursively do the next.
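 * Note: requests are serialized; the next bulk request is issued only from the previous response's callback, so at most one _msearch request is in flight at a time.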
96 | * @param iter provides the next batch
97 | * @param histMap test description -> histogram
98 | * where histogram is (index of correct hit (0 in best case) -> occurrence count for this index)
99 | * @param errMap test description -> index of correct hit -> array of addresses with this index
100 | */
101 | function doBatch(iter, histMap, errMap) {
102 | var x = iter.next();
103 | if (x.done) done(histMap, errMap);
104 | else {
105 | var batch = x.value;
106 |
107 | // array of batch.length * 6:
108 | // { tc: tc, qstr: query address string, qes: elasticsearch query, desc: description }
109 | var qBatch = batch.flatMap(queries);
110 | // console.log('qBatch', qBatch);
111 |
112 | var esBulk = qBatch.flatMap(q => [ '{}', JSON.stringify(q.qes) ]).join('\n') + '\n';
113 | // console.log('esBulk', esBulk);
114 |
115 | request.post( { url: url, body: esBulk }, (error, response, body) => {
116 | if (error) console.log('error', error)
117 | else {
118 | // console.log('statusCode', response.statusCode, 'body', body);
119 | var esHits = sortHits(JSON.parse(body), scoreThenLength);
120 | // console.log('esHits', JSON.stringify(esHits));
121 | var idxs = findHitIndices(qBatch, esHits);
122 | // console.log('idxs', idxs);
123 | // histogram(histMap, idxs);
124 | // console.log('histMap', histMap);
125 | idxs.forEach((v, i) => {
126 | var q = qBatch[i];
127 | histMap.inc(q.desc, v);
128 | if (v != 0) errMap.get(q.desc).append(v, q.qstr);
129 | });
130 | doBatch(iter, histMap, errMap);
131 | }
132 | });
133 | };
134 | }
135 |
136 |
137 | doBatch(batches[Symbol.iterator](), new maps.MapHist(), new maps.MapMapCont(maps.ctorMapArr));
138 |
139 |
140 |
-------------------------------------------------------------------------------- /template.yaml: --------------------------------------------------------------------------------
1 | AWSTemplateFormatVersion: '2010-09-09'
2 | Description: 'This stack hosts the gnaf-search service in a docker container'
3 | Parameters:
4 | sslcertificate:
5 | Type: 'String'
6 | Description: 'The ARN for the SSL certificate to use on the load balancer to handle https traffic. See Amazon Certificate Manager to find this'
7 | Resources:
8 | gnafelbsg:
9 | Type: 'AWS::EC2::SecurityGroup'
10 | Properties:
11 | GroupDescription: 'Allows Load Balancer Ingress on 80 and 443 from public'
12 | SecurityGroupIngress:
13 | -
14 | IpProtocol: 'tcp'
15 | FromPort: '80'
16 | ToPort: '80'
17 | CidrIp: '0.0.0.0/0'
18 | -
19 | IpProtocol: 'tcp'
20 | FromPort: '443'
21 | ToPort: '443'
22 | CidrIp: '0.0.0.0/0'
23 | gnafec2sg:
24 | Type: 'AWS::EC2::SecurityGroup'
25 | Properties:
26 | GroupDescription: 'Allows access to Instances on port 80 from Load Balancer'
27 | SecurityGroupIngress:
28 | -
29 | IpProtocol: 'tcp'
30 | FromPort: '80'
31 | ToPort: '80'
32 | SourceSecurityGroupId: !GetAtt gnafelbsg.GroupId
33 | gnafelb: # No LoadBalancerName specified to allow for potential replacements if CloudFormation changes are needed.
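  # Classic ELB listening on 80 (HTTP) and 443 (HTTPS, using the certificate ARN passed in as the sslcertificate parameter); both listeners forward to port 80 on the instances.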
34 | Type: 'AWS::ElasticLoadBalancing::LoadBalancer' 35 | Properties: 36 | AvailabilityZones: 37 | Fn::GetAZs: 'ap-southeast-2' 38 | CrossZone: True 39 | AccessLoggingPolicy: 40 | Enabled: True 41 | S3BucketName: 'gnaf-logs' 42 | HealthCheck: 43 | HealthyThreshold: '5' 44 | Interval: '10' 45 | Target: 'HTTP:80/v2/api-docs/swagger.json' 46 | Timeout: '9' 47 | UnhealthyThreshold: '5' 48 | Listeners: 49 | - 50 | InstancePort: '80' 51 | InstanceProtocol: 'HTTP' 52 | LoadBalancerPort: '80' 53 | Protocol: 'HTTP' 54 | - 55 | InstancePort: '80' 56 | InstanceProtocol: 'HTTP' 57 | LoadBalancerPort: '443' 58 | Protocol: 'HTTPS' 59 | SSLCertificateId: !Ref sslcertificate 60 | SecurityGroups: 61 | - !GetAtt gnafelbsg.GroupId 62 | gnaflc: 63 | Type: 'AWS::AutoScaling::LaunchConfiguration' 64 | Properties: 65 | ImageId: 'ami-09332079312dc6085' 66 | InstanceType: 't2.medium' 67 | SecurityGroups: 68 | - !GetAtt gnafec2sg.GroupId 69 | KeyName: 'natmap-peter' 70 | IamInstanceProfile: !GetAtt gnafecraccessinstanceprofile.Arn #'arn:aws:iam::933940466036:instance-profile/gnafECRPullAccess' 71 | UserData: !Base64 | 72 | #cloud-config 73 | apt_upgrade: True 74 | apt_reboot_if_required: True 75 | packages: 76 | - nginx 77 | write_files: 78 | - path: /etc/nginx/sites-available/default 79 | content: | 80 | server { 81 | listen 80 default_server; 82 | location / { 83 | rewrite ^/$ https://github.com/data61/gnaf/ redirect; 84 | } 85 | location /v2/ { 86 | proxy_pass http://localhost:8080/; 87 | add_header Access-Control-Allow-Headers 'Content-Type'; 88 | add_header Access-Control-Allow-Origin '*'; 89 | } 90 | } 91 | runcmd: 92 | - "curl -fsSL get.docker.com | bash" 93 | - "curl --silent --show-error --retry 5 https://bootstrap.pypa.io/get-pip.py | python3" 94 | - "pip install awscli" 95 | - "$(aws ecr get-login --no-include-email --region ap-southeast-2)" 96 | - "nginx -s reload" 97 | - "docker run -p 8080:9040 --restart=always 933940466036.dkr.ecr.ap-southeast-2.amazonaws.com/gnaf:latest" 98 | gnafasg: 99 | UpdatePolicy: 100 | AutoScalingRollingUpdate: 101 | MinInstancesInService: '1' 102 | MaxBatchSize: '1' 103 | PauseTime: 'PT2M30S' 104 | Type: "AWS::AutoScaling::AutoScalingGroup" 105 | Properties: 106 | AvailabilityZones: 107 | Fn::GetAZs: 'ap-southeast-2' 108 | Cooldown: '300' 109 | DesiredCapacity: '2' 110 | HealthCheckGracePeriod: '300' 111 | HealthCheckType: 'ELB' 112 | LaunchConfigurationName: !Ref gnaflc 113 | LoadBalancerNames: 114 | - !Ref gnafelb 115 | MaxSize: 2 116 | MinSize: 1 117 | Tags: 118 | - Key: 'Name' 119 | Value: 'gnaf-search-instance' 120 | PropagateAtLaunch: True 121 | gnafecraccessinstanceprofile: 122 | Type: "AWS::IAM::InstanceProfile" 123 | Properties: 124 | Roles: 125 | - !Ref gnafecraccessrole 126 | InstanceProfileName: 127 | Fn::Join: 128 | - '' 129 | - - !Ref 'AWS::StackName' 130 | - '-GNAF-ECR-Access-instanceprofile' 131 | gnafecraccessrole: 132 | Type: "AWS::IAM::Role" 133 | Properties: 134 | AssumeRolePolicyDocument: 135 | Version: "2012-10-17" 136 | Statement: 137 | - 138 | Effect: "Allow" 139 | Principal: 140 | Service: "ec2.amazonaws.com" 141 | Action: "sts:AssumeRole" 142 | Policies: 143 | - 144 | PolicyDocument: { 145 | "Version": "2012-10-17", 146 | "Statement": [ 147 | { 148 | "Sid": "ecraccessforgnafec2instances", 149 | "Effect": "Allow", 150 | "Action": [ 151 | "ecr:BatchCheckLayerAvailability", 152 | "ecr:BatchGetImage", 153 | "ecr:GetAuthorizationToken", 154 | "ecr:GetDownloadUrlForLayer" 155 | ], 156 | "Resource": "*" 157 | } 158 | ] 159 | } 160 | PolicyName: 161 | 
Fn::Join: 162 | - '' 163 | - - !Ref 'AWS::StackName' 164 | - '-GNAF-ECR-access-policy' 165 | ManagedPolicyArns: 166 | - "arn:aws:iam::aws:policy/AmazonSSMManagedInstanceCore" 167 | -------------------------------------------------------------------------------- /gnaf-db-service/src/main/scala/au/csiro/data61/gnaf/db/service/DbService.scala: -------------------------------------------------------------------------------- 1 | package au.csiro.data61.gnaf.db.service 2 | 3 | import scala.concurrent.{ ExecutionContextExecutor, Future } 4 | import scala.math.BigDecimal 5 | 6 | import com.github.swagger.akka.{ HasActorSystem, SwaggerHttpService } 7 | import com.github.swagger.akka.model.Info 8 | import com.typesafe.config.{ Config, ConfigFactory } 9 | 10 | import akka.actor.ActorSystem 11 | import akka.event.{ Logging, LoggingAdapter } 12 | import akka.http.scaladsl.Http 13 | import akka.http.scaladsl.marshallers.sprayjson.SprayJsonSupport.sprayJsonMarshaller 14 | import akka.http.scaladsl.server.Directives.{ Segment, _enhanceRouteWithConcatenation, _segmentStringToPathMatcher, complete, get, logRequestResult, path, pathPrefix } 15 | import akka.stream.{ ActorMaterializer, Materializer } 16 | import au.csiro.data61.gnaf.db.GnafTables 17 | import au.csiro.data61.gnaf.util.Util 18 | import ch.megard.akka.http.cors.CorsDirectives.cors 19 | import io.swagger.annotations.{ Api, ApiOperation } 20 | import io.swagger.models.Swagger 21 | import javax.ws.rs.{ Path, PathParam } 22 | import spray.json.DefaultJsonProtocol 23 | 24 | // for latitude: BigDecimal swagger type is number, but for Option[BigDecimal] swagger type is complex internal representation of scala.math.BigDecimal, so we avoid using Option here 25 | case class Geocode(geocodeTypeCode: Option[String], geocodeTypeDescription: Option[String], reliabilityCode: Option[Int], isDefault: Boolean, latitude: BigDecimal, longitude: BigDecimal) 26 | 27 | case class AddressType(addressSitePid: String, addressType: Option[String]) 28 | case class AddressTypeOpt(addressType: Option[AddressType]) 29 | 30 | case class GeocodeType(code: String, description: String) 31 | case class GeocodeTypes(types: Seq[GeocodeType]) 32 | 33 | trait Protocols extends DefaultJsonProtocol { 34 | implicit val geocodeFormat = jsonFormat6(Geocode.apply) 35 | 36 | implicit val addressTypeFormat = jsonFormat2(AddressType.apply) 37 | implicit val addressTypeOptFormat = jsonFormat1(AddressTypeOpt.apply) 38 | 39 | implicit val geocodeTypeFormat = jsonFormat2(GeocodeType.apply) 40 | implicit val geocodeTypesFormat = jsonFormat1(GeocodeTypes.apply) 41 | } 42 | 43 | @Api(value = "gnaf", produces = "application/json") 44 | @Path("gnaf") 45 | class DbService(logger: LoggingAdapter, config: Config)(implicit system: ActorSystem, executor: ExecutionContextExecutor, materializer: Materializer) extends Protocols { 46 | 47 | object MyGnafTables extends { 48 | val profile = Util.getObject[slick.driver.JdbcProfile](config.getString("gnafDb.slickDriver")) // e.g. 
slick.driver.{H2Driver,PostgresDriver}
49 | } with GnafTables
50 | val gnafTables = MyGnafTables
51 | import gnafTables._
52 | import gnafTables.profile.api._
53 |
54 | implicit val db = Database.forConfig("gnafDb", config)
55 |
56 | // map code -> description
57 | lazy val geocodeTypesFuture: Future[Map[String, String]] = db.run(GeocodeTypeAut.result).map(_.map(t => t.code -> t.description.getOrElse(t.code)).toMap)
58 |
59 | @Path("geocodeType")
60 | @ApiOperation(value = "List geocode types", nickname = "geocodeType", httpMethod = "GET", response = classOf[GeocodeType], responseContainer = "List")
61 | def geocodeType = complete {
62 | geocodeTypesFuture.map { x =>
63 | GeocodeTypes(x.toSeq.map(GeocodeType.tupled))
64 | }
65 | }
66 |
67 | // left join because some addressDetailPid have no AddressSiteGeocode
68 | val qGeocodes = {
69 | def q(addressDetailPid: Rep[String]) = for {
70 | (ad, sg) <- AddressDetail joinLeft AddressSiteGeocode on (_.addressSitePid === _.addressSitePid) if ad.addressDetailPid === addressDetailPid
71 | dg <- AddressDefaultGeocode if dg.addressDetailPid === addressDetailPid
72 | } yield (dg, sg)
73 | Compiled(q _)
74 | }
75 |
76 | @Path("addressGeocode/{addressDetailPid}")
77 | @ApiOperation(value = "List geocodes for an addressDetailPid", nickname = "addressGeocode", httpMethod = "GET", response = classOf[Geocode], responseContainer = "List")
78 | def addressGeocode(@PathParam("addressDetailPid") addressDetailPid: String) = {
79 | val f = for {
80 | typ <- geocodeTypesFuture
81 | seq <- db.run(qGeocodes(addressDetailPid).result)
82 | } yield seq.map { case (dg, sg) =>
83 | // should have either one (dg, None) or one or more (dg, Some(addressSiteGeocode)); the latitude & longitude values should not be None
84 | sg.map { x => Geocode(x.geocodeTypeCode, x.geocodeTypeCode.map(typ), Some(x.reliabilityCode), Some(dg.geocodeTypeCode) == x.geocodeTypeCode && dg.latitude == x.latitude && dg.longitude == x.longitude, x.latitude.getOrElse(0), x.longitude.getOrElse(0)) }
85 | .getOrElse(Geocode(Some(dg.geocodeTypeCode), Some(typ(dg.geocodeTypeCode)), None, true, dg.latitude.getOrElse(0), dg.longitude.getOrElse(0))) // handle the (dg, None) no AddressSiteGeocode case
86 | }.sortBy(!_.isDefault)
87 |
88 | complete { f }
89 | }
90 |
91 | lazy val addressTypesFuture = db.run(AddressTypeAut.result).map(_.map(t => t.code -> t.description.getOrElse(t.code)).toMap)
92 |
93 | val qAddressSite = {
94 | def q(addressDetailPid: Rep[String]) = for {
95 | ad <- AddressDetail if ad.addressDetailPid === addressDetailPid
96 | as <- AddressSite if as.addressSitePid === ad.addressSitePid
97 | } yield as
98 | Compiled(q _)
99 | }
100 |
101 | @Path("addressType/{addressDetailPid}")
102 | @ApiOperation(value = "AddressType for an addressDetailPid", nickname = "addressType", httpMethod = "GET", response = classOf[AddressTypeOpt])
103 | def addressType(@PathParam("addressDetailPid") addressDetailPid: String) = {
104 | val f = for {
105 | typ <- addressTypesFuture
106 | asOpt <- db.run(qAddressSite(addressDetailPid).result.headOption)
107 | } yield AddressTypeOpt(asOpt.map(as => AddressType(as.addressSitePid, as.addressType.map(typ))))
108 |
109 | complete { f }
110 | }
111 |
112 | val routes = pathPrefix("gnaf") {
113 | pathPrefix("geocodeType") {
114 | get { geocodeType }
115 | } ~
116 | pathPrefix("addressGeocode") {
117 | (get & path(Segment)) { addressGeocode }
118 | } ~
119 | pathPrefix("addressType") {
120 | (get & path(Segment)) { addressType }
121 | }
122 | }
123 | }
124 |
125 | object DbService {
126 | implicit val sys = ActorSystem()
127 | implicit val exec = sys.dispatcher
128 | implicit val mat = ActorMaterializer()
129 |
130 | val logger = Logging(sys, getClass)
131 | val config = ConfigFactory.load
132 | val interface = config.getString("http.interface")
133 | val port = config.getInt("http.port")
134 |
135 | val service = new DbService(logger, config)
136 |
137 | // /api-docs/swagger.json
138 | val swagger = new SwaggerHttpService with HasActorSystem {
139 | import scala.reflect.runtime.{ universe => ru }
140 |
141 | override implicit val actorSystem = sys
142 | override implicit val materializer = mat
143 | override val apiTypes = Seq(ru.typeOf[DbService])
144 | override def swaggerConfig = new Swagger().basePath(prependSlashIfNecessary(basePath)) // don't specify protocol://host
145 | }
146 |
147 | def main(args: Array[String]): Unit = {
148 | val routes = cors() {
149 | logRequestResult("DbService") { service.routes } ~
150 | logRequestResult("Swagger") { swagger.routes }
151 | }
152 | Http().bindAndHandle(routes, interface, port)
153 | }
154 | }
155 |
-------------------------------------------------------------------------------- /gnaf-lucene/src/main/scala/au/csiro/data61/gnaf/lucene/GnafLucene.scala: --------------------------------------------------------------------------------
1 | package au.csiro.data61.gnaf.lucene
2 |
3 | import org.apache.lucene.analysis.Analyzer
4 | import org.apache.lucene.analysis.Analyzer.TokenStreamComponents
5 | import org.apache.lucene.analysis.LowerCaseFilter
6 | import org.apache.lucene.analysis.core.WhitespaceTokenizer
7 | import org.apache.lucene.analysis.shingle.ShingleFilter
8 | import org.apache.lucene.document.{ DoublePoint, FieldType }
9 | import org.apache.lucene.index.{ FieldInvertState, IndexOptions, IndexWriter, IndexWriterConfig, Term }
10 | import org.apache.lucene.search.{ BooleanClause, BooleanQuery, BoostQuery, FuzzyQuery, Query, TermQuery }
11 | import org.apache.lucene.search.similarities.ClassicSimilarity
12 | import org.apache.lucene.store.Directory
13 |
14 | import LuceneUtil.tokenIter
15 | import au.csiro.data61.gnaf.util.Util.getLogger
16 | import org.apache.lucene.search.MatchAllDocsQuery
17 | import org.apache.lucene.search.similarities.PerFieldSimilarityWrapper
18 |
19 | /**
20 | * GNAF specific field names, analyzers and scoring for Lucene.
21 | */
22 | object GnafLucene {
23 | val log = getLogger(getClass)
24 |
25 | /** GNAF Lucene field names */
26 | val F_JSON = "json"
27 | val F_LOCATION = "location"
28 | val F_ADDRESS = "address"
29 | val F_ADDRESS_NOALIAS = "addressNoAlias"
30 | val F_MISSING_DATA = "noData"
31 |
32 | val MISSING_DATA_TOKEN = "N" // store this token in F_MISSING_DATA once for each missing: site/building, flat, level, streetNum
33 |
34 | val BIGRAM_SEPARATOR = "~"
35 |
36 | /** count occurrences of x in s, x must be non-empty */
37 | def countOccurrences(s: String, x: String) = {
38 | assert(x.nonEmpty)
39 | var n = 0
40 | var i = 0
41 | while (i <= s.length - x.length) { // <= so that a match ending at the very end of s is counted
42 | i = s.indexOf(x, i)
43 | if (i == -1) i = s.length
44 | else {
45 | n += 1
46 | i += x.length
47 | }
48 | }
49 | n
50 | }
51 |
52 | /** get n-gram size n */
53 | def shingleSize(s: String) = countOccurrences(s, BIGRAM_SEPARATOR) + 1
54 |
55 | /**
56 | * gnaf-test shows tf-idf doesn't work well with addresses.
57 | * For F_ADDRESS disable tf, idf and length norm,
58 | * but for F_MISSING_DATA keep tf to favour multiple MISSING_DATA_TOKENs.
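 * The intended net effect: a document's score is driven by how many (boosted) query terms it matches, not by term or collection statistics.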
59 | */ 60 | class MissingDataSimilarity extends ClassicSimilarity { 61 | // default tf - boost repeated MISSING_DATA_TOKEN tokens 62 | override def lengthNorm(state: FieldInvertState) = state.getBoost // no length norm, don't penalize multiple MISSING_DATA_TOKENs or multiple aliases 63 | override def idf(docFreq: Long, docCount: Long): Float = 1.0f // don't penalize MISSING_DATA_TOKEN or SMITH STREET for being common 64 | } 65 | class AddressSimilarity extends MissingDataSimilarity { 66 | override def tf(freq: Float): Float = 1.0f // don't boost street and locality name being the same 67 | } 68 | val classicSimilarity = new ClassicSimilarity 69 | object GnafSimilarity extends PerFieldSimilarityWrapper(classicSimilarity) { 70 | val md = new MissingDataSimilarity 71 | val addr = new AddressSimilarity 72 | override def get(name: String) = if (name == F_ADDRESS) addr else if (name == F_MISSING_DATA) md else classicSimilarity 73 | } 74 | 75 | val storedNotIndexedFieldType = { 76 | val t = new FieldType 77 | // based on StringField 78 | t.setOmitNorms(true); 79 | t.setStored(true); 80 | t.setTokenized(false); 81 | t.setIndexOptions(IndexOptions.NONE); // StringField has DOCS 82 | t.freeze(); 83 | t 84 | } 85 | 86 | val addressFieldType = { 87 | val t = new FieldType 88 | // based on TextField 89 | t.setOmitNorms(true); 90 | t.setStored(true); 91 | t.setTokenized(true); 92 | t.setIndexOptions(IndexOptions.DOCS); // not using term freq, TextField has DOCS_AND_FREQS_AND_POSITIONS 93 | t.freeze(); 94 | t 95 | } 96 | 97 | val flatStreetNumFieldType = { 98 | val t = new FieldType 99 | t.setOmitNorms(true); 100 | t.setStored(false); 101 | t.setTokenized(false); 102 | t.setIndexOptions(IndexOptions.DOCS); 103 | t.freeze(); 104 | t 105 | } 106 | 107 | val missingDataFieldType = { 108 | val t = new FieldType 109 | t.setOmitNorms(true); 110 | t.setStored(false); 111 | t.setTokenized(false); 112 | t.setIndexOptions(IndexOptions.DOCS_AND_FREQS); // using term freq 113 | t.freeze(); 114 | t 115 | } 116 | 117 | val shingleWhiteLowerAnalyzer = new Analyzer { 118 | 119 | override protected def createComponents(fieldName: String) = { 120 | val source = new WhitespaceTokenizer() 121 | // ShingleFilter defaults are: 122 | // minShingleSize = 2 (error if set < 2), maxShingleSize = 2 123 | // outputUnigrams = true 124 | val result = new ShingleFilter(new LowerCaseFilter(source), 2, 2) 125 | result.setTokenSeparator(BIGRAM_SEPARATOR) // default is " ", changed so we can explicitly add a bigram by passing "a~b" through the tokenizer 126 | new TokenStreamComponents(source, result) 127 | } 128 | 129 | override def getPositionIncrementGap(fieldName: String): Int = 100 // stop shingles matching across boundaries 130 | } 131 | 132 | def mkIndexer(dir: Directory) = new IndexWriter( 133 | dir, 134 | new IndexWriterConfig(shingleWhiteLowerAnalyzer) 135 | .setOpenMode(IndexWriterConfig.OpenMode.CREATE) 136 | .setSimilarity(GnafSimilarity) 137 | ) 138 | 139 | case class FuzzyParam( 140 | /** max number of edits permitted for a match (0 for no fuzzy matching) */ 141 | maxEdits: Int, 142 | /** fuzzy matching only applied to terms of at least this length */ 143 | minLength: Int, 144 | /** the initial length that must match exactly before fuzzy matching is applied to the remainder */ 145 | prefixLength: Int 146 | ) 147 | 148 | case class BoundingBox(minLat: Double, minLon: Double, maxLat: Double, maxLon: Double) { 149 | def toQuery = DoublePoint.newRangeQuery(F_LOCATION, Array[Double](minLat, minLon), Array[Double](maxLat, maxLon)) 
150 | } 151 | 152 | case class QueryParam( 153 | /** address search terms - best results if ordered: site/building name, unit/flat, level, street, locality, state abbreviation, postcode */ 154 | addr: String, 155 | /** number of search results to return */ 156 | numHits: Int, 157 | /** optional fuzzy matching */ 158 | fuzzy: Option[FuzzyParam], 159 | /** optional filtering by a bounding box (addr may be blank) */ 160 | box: Option[BoundingBox] 161 | ) { 162 | def toQuery: Query = { 163 | val q = tokenIter(shingleWhiteLowerAnalyzer, F_ADDRESS, addr).foldLeft { 164 | val b = new BooleanQuery.Builder 165 | // small score increment for missing: build/site, flat, level, streetNo (smaller than for an actual match) 166 | b.add(new BooleanClause(new BoostQuery(new TermQuery(new Term(F_MISSING_DATA, MISSING_DATA_TOKEN)), 0.05f), BooleanClause.Occur.SHOULD)) 167 | box.foreach(x => b.add(new BooleanClause(x.toQuery, BooleanClause.Occur.FILTER))) 168 | if (addr.trim.isEmpty) 169 | // mobile use case: all addresses in box around me 170 | b.add(new BooleanClause(new MatchAllDocsQuery, BooleanClause.Occur.SHOULD)) 171 | else 172 | b.setMinimumNumberShouldMatch(2) // could be MISSING_DATA_TOKEN and 1 user term or 2 user terms 173 | b 174 | }{ (b, t) => 175 | val q = { 176 | val term = new Term(F_ADDRESS, t) 177 | val q = fuzzy 178 | .filter(f => f.maxEdits > 0 && t.length >= f.minLength) 179 | .map(f => new FuzzyQuery(term, f.maxEdits, f.prefixLength)) 180 | .getOrElse(new TermQuery(term)) 181 | val n = shingleSize(t) 182 | if (n < 2) q else new BoostQuery(q, Math.pow(3.0, n-1).toFloat) 183 | } 184 | b.add(new BooleanClause(q, BooleanClause.Occur.SHOULD)) 185 | }.build 186 | log.debug(s"mkQuery: bool query = ${q.toString(F_ADDRESS)}") 187 | q 188 | } 189 | } 190 | 191 | } -------------------------------------------------------------------------------- /gnaf-contrib/src/main/scala/au/csiro/data61/gnaf/contrib/service/ContribService.scala: -------------------------------------------------------------------------------- 1 | package au.csiro.data61.gnaf.contrib.service 2 | 3 | import scala.concurrent.ExecutionContextExecutor 4 | import scala.concurrent.duration.DurationInt 5 | import scala.math.BigDecimal 6 | 7 | import com.github.swagger.akka.{ HasActorSystem, SwaggerHttpService } 8 | import com.github.swagger.akka.model.Info 9 | import com.typesafe.config.{ Config, ConfigFactory } 10 | 11 | import akka.actor.ActorSystem 12 | import akka.event.{ Logging, LoggingAdapter } 13 | import akka.http.scaladsl.Http 14 | import akka.http.scaladsl.marshallers.sprayjson.SprayJsonSupport.{ sprayJsonMarshaller, sprayJsonUnmarshaller } 15 | import akka.http.scaladsl.marshalling.ToResponseMarshallable 16 | import akka.http.scaladsl.model.HttpMethods 17 | import akka.http.scaladsl.model.StatusCodes.BadRequest 18 | import akka.http.scaladsl.server.Directives._ 19 | import akka.stream.{ ActorMaterializer, Materializer } 20 | import au.csiro.data61.gnaf.util.Util 21 | import au.csiro.data61.gnaf.contrib.db.ContribTables 22 | import ch.megard.akka.http.cors.CorsDirectives.cors 23 | import ch.megard.akka.http.cors.CorsSettings.defaultSettings 24 | import io.swagger.annotations.{ Api, ApiParam, ApiImplicitParams, ApiImplicitParam, ApiOperation } 25 | import io.swagger.models.Swagger 26 | import javax.ws.rs.{ Path, PathParam, DefaultValue } 27 | import slick.dbio.DBIOAction 28 | import slick.jdbc.ResultSetAction 29 | import spray.json.DefaultJsonProtocol 30 | 31 | case class ContribGeocode(id: Option[Long], contribStatus: String, 
addressSiteGeocodePid: Option[String], dateCreated: Long, version: Int, addressSitePid: String, geocodeTypeCode: String, longitude: BigDecimal, latitude: BigDecimal) 32 | case class ContribGeocodeKey(id: Long, version: Int) 33 | 34 | object JsonProtocol extends DefaultJsonProtocol { 35 | implicit val contribGeocodeFormat = jsonFormat9(ContribGeocode.apply) 36 | implicit val contribGeocodeKeyFormat = jsonFormat2(ContribGeocodeKey.apply) 37 | } 38 | import JsonProtocol._ 39 | import io.swagger.models.Swagger 40 | 41 | @Api(value = "contrib", produces = "application/json") 42 | @Path("contrib") 43 | class ContribService(logger: LoggingAdapter, config: Config)(implicit system: ActorSystem, executor: ExecutionContextExecutor, materializer: Materializer) { 44 | object MyContribTables extends { 45 | val profile = Util.getObject[slick.driver.JdbcProfile](config.getString("gnafContribDb.slickDriver")) // e.g. slick.driver.{H2Driver,PostgresDriver} 46 | } with ContribTables 47 | import MyContribTables._ 48 | import MyContribTables.profile.api._ 49 | 50 | implicit val db = Database.forConfig("gnafContribDb", config) 51 | 52 | def createSchemaIfNotExists = { 53 | import scala.concurrent.Await 54 | import scala.concurrent.duration._ 55 | import slick.jdbc.GetResult._ 56 | import slick.jdbc.ResultSetAction 57 | 58 | val listTablesAction = ResultSetAction[(String, String, String, String)](_.conn.getMetaData.getTables("", "", null, null)).map(_.filter(_._4 == "TABLE").map(_._3)) 59 | val createIfNotExistsAction = listTablesAction.flatMap { tbls => 60 | if (tbls.isEmpty) schema.create.map(_ => "createSchemaIfNotExists: schema created") 61 | else DBIOAction.successful(s"createSchemaIfNotExists: pre-existing tables = $tbls") 62 | } 63 | logger.info(Await.result(db.run(createIfNotExistsAction), 15.seconds)) 64 | } 65 | 66 | val qList = { 67 | def q(addressSitePid: Rep[String]) = AddressSiteGeocode.filter(_.addressSitePid === addressSitePid) 68 | Compiled(q _) 69 | } 70 | 71 | def toContribGeocode(x: AddressSiteGeocodeRow) = ContribGeocode(x.id, x.contribStatus, x.addressSiteGeocodePid, x.dateCreated.getTime, x.version, x.addressSitePid, x.geocodeTypeCode, x.longitude, x.latitude) 72 | 73 | @Path("{addressSitePid}") 74 | @ApiOperation(value = "List contributed geocodes for an addressSitePid", nickname = "list", 75 | httpMethod = "GET", response = classOf[ContribGeocode], responseContainer = "List") 76 | def listRoute( 77 | @PathParam("addressSitePid") 78 | addressSitePid: String 79 | ) = { 80 | val f = db.run(qList(addressSitePid).result).map(_.map(toContribGeocode)) 81 | complete { f } 82 | } 83 | 84 | val contribGeocodeWithId = (AddressSiteGeocode returning AddressSiteGeocode.map(_.id) ) 85 | def toAddressSiteGeocodeRow(x: ContribGeocode) = AddressSiteGeocodeRow(x.id, x.contribStatus, x.addressSiteGeocodePid, new java.sql.Date(x.dateCreated), x.version, x.addressSitePid, x.geocodeTypeCode, x.longitude, x.latitude) 86 | 87 | @ApiOperation(value = "Add a new contributed geocode for an addressSitePid", nickname = "create", 88 | notes="""id, version and dateCreated input ignored & output set by system (however input values for version and dateCreated are still required). 
89 | 90 | Example input (included here as @ApiParam(defaultValue) and @DefaultValue aren't working so far): 91 | { 92 | "contribStatus":"Submitted", 93 | "addressSitePid":"712279621", 94 | "geocodeTypeCode":"EM", 95 | "longitude":149.1213974, 96 | "latitude":-35.280994199999995, 97 | "dateCreated":0, 98 | "version":0 99 | } 100 | """, 101 | httpMethod = "POST", response = classOf[ContribGeocode]) 102 | def createContribRoute( 103 | @ApiParam(value = "contribGeocode", required = true, defaultValue = "Fred") 104 | @DefaultValue("harry") 105 | c: ContribGeocode 106 | ) = { 107 | val c2 = (c.copy(dateCreated = System.currentTimeMillis, version = 1)) 108 | val f = db.run(contribGeocodeWithId += toAddressSiteGeocodeRow(c2)).map(id => c2.copy(id = Some(id))) 109 | complete { f } 110 | } 111 | 112 | def qGet = { 113 | def q(id: Rep[Long], version: Rep[Int]) = AddressSiteGeocode.filter(x => x.id === id && x.version === version) 114 | Compiled(q _) 115 | } 116 | 117 | @ApiOperation(value = "Delete a contributed geocode for an addressSitePid", nickname = "delete", 118 | notes="optimistic lock version must match to succeed", httpMethod = "DELETE", response = classOf[ContribGeocodeKey]) 119 | def deleteContribRoute( 120 | @ApiParam(value = "contribGeocodeKey", required = true) 121 | key: ContribGeocodeKey 122 | ) = { 123 | val f = db.run(qGet(key.id, key.version).delete) 124 | complete { 125 | f.map[ToResponseMarshallable] { cnt => 126 | if (cnt == 1) key 127 | else BadRequest -> s"key = $key not found" 128 | } 129 | } 130 | } 131 | 132 | @ApiOperation(value = "Update a contributed geocode for an addressSitePid", nickname = "update", 133 | notes = """optimistic lock version must match to succeed. 134 | 135 | dateCreated input ignored (but still required); version and dateCreated output set by system 136 | """, 137 | httpMethod = "PUT", response = classOf[ContribGeocode]) 138 | def updateContribRoute( 139 | @ApiParam(value = "contribGeocode", required = true) 140 | c: ContribGeocode 141 | ) = { 142 | val c2 = c.copy(version = c.version + 1, dateCreated = System.currentTimeMillis) 143 | val f = db.run(qGet(c.id.get, c.version).update(toAddressSiteGeocodeRow(c2))) 144 | complete { 145 | f.map[ToResponseMarshallable] { cnt => 146 | if (cnt == 1) c2 147 | else s"id = ${c.id}, version = ${c.version} not found" 148 | } 149 | } 150 | } 151 | 152 | val routes = pathPrefix("contrib") { 153 | (post & entity(as[ContribGeocode])) { createContribRoute } ~ 154 | (get & path(Segment)) { listRoute } ~ 155 | (delete & entity(as[ContribGeocodeKey])) { deleteContribRoute } ~ 156 | (put & entity(as[ContribGeocode])) { updateContribRoute } 157 | } 158 | } 159 | 160 | object ContribService { 161 | implicit val sys = ActorSystem() 162 | implicit val exec = sys.dispatcher 163 | implicit val mat = ActorMaterializer() 164 | 165 | val logger = Logging(sys, getClass) 166 | val config = ConfigFactory.load 167 | val interface = config.getString("http.interface") 168 | val port = config.getInt("http.port") 169 | 170 | val service = new ContribService(logger, config) 171 | 172 | // /api-docs/swagger.json 173 | val swagger = new SwaggerHttpService with HasActorSystem { 174 | import scala.reflect.runtime.{ universe => ru } 175 | 176 | override implicit val actorSystem = sys 177 | override implicit val materializer = mat 178 | override val apiTypes = Seq(ru.typeOf[ContribService]) 179 | override def swaggerConfig = new Swagger().basePath(prependSlashIfNecessary(basePath)) // don't specify protocol://host 180 | } 181 | 182 | def 
main(args: Array[String]): Unit = {
183 | service.createSchemaIfNotExists
184 |
185 | val routes = cors(defaultSettings.copy(allowedMethods = HttpMethods.DELETE +: defaultSettings.allowedMethods)) {
186 | logRequestResult("GnafContrib") { service.routes } ~
187 | logRequestResult("Swagger") { swagger.routes }
188 | }
189 | Http().bindAndHandle(routes, interface, port)
190 | }
191 | }
192 |
-------------------------------------------------------------------------------- /gnaf-search/src/main/scala/au/csiro/data61/gnaf/search/Search.scala: --------------------------------------------------------------------------------
1 | package au.csiro.data61.gnaf.search
2 |
3 | import java.io.File
4 |
5 | import scala.collection.mutable.ListBuffer
6 | import scala.concurrent.{ ExecutionContextExecutor, Future }
7 | import scala.io.Source
8 | import scala.reflect.runtime.universe
9 |
10 | import org.apache.lucene.document.Document
11 | import org.apache.lucene.search.{ ScoreDoc, Sort }
12 |
13 | import com.github.swagger.akka.{ HasActorSystem, SwaggerHttpService }
14 | import com.typesafe.config.ConfigFactory
15 |
16 | import akka.actor.ActorSystem
17 | import akka.http.scaladsl.Http
18 | import akka.http.scaladsl.marshallers.sprayjson.SprayJsonSupport.{ sprayJsonMarshaller, sprayJsonUnmarshaller }
19 | import akka.http.scaladsl.marshalling.ToResponseMarshallable.apply
20 | import akka.http.scaladsl.server.Directives._
21 | import akka.http.scaladsl.server.RouteResult.route2HandlerFlow
22 | import akka.http.scaladsl.server.directives.LoggingMagnet.forRequestResponseFromMarker
23 | import akka.stream.{ ActorMaterializer, Materializer }
24 | import au.csiro.data61.gnaf.lucene.GnafLucene._
25 | import au.csiro.data61.gnaf.lucene.LuceneUtil.{ Searcher, directory }
26 | import au.csiro.data61.gnaf.search.Search.Result
27 | import au.csiro.data61.gnaf.util.Util.getLogger
28 | import ch.megard.akka.http.cors.CorsDirectives.cors
29 | import io.swagger.annotations.{ Api, ApiOperation, ApiParam }
30 | import io.swagger.models.Swagger
31 | import javax.ws.rs.Path
32 | import spray.json.{ DefaultJsonProtocol, pimpString }
33 |
34 | object Search {
35 | val log = getLogger(getClass)
36 |
37 | case class CliOption(indexDir: File, bulk: Int, numHits: Int, fuzzyMinLength: Int, fuzzyMaxEdits: Int, fuzzyPrefixLength: Int, interface: String, port: Int)
38 | val defaultCliOption = {
39 | val c = ConfigFactory.load.getConfig("gnafSearch")
40 | def gs(n: String) = c.getString(n)
41 | def gi(n: String) = c.getInt(n)
42 | CliOption(new File(gs("indexDir")), gi("bulk"), gi("numHits"), gi("fuzzyMinLength"), gi("fuzzyMaxEdits"), gi("fuzzyPrefixLength"), gs("interface"), gi("port"))
43 | }
44 |
45 | def main(args: Array[String]): Unit = {
46 | val parser = new scopt.OptionParser[CliOption]("gnaf-search") {
47 | head("gnaf-lucene-service", "0.x")
48 | note("JSON web service for address searches")
49 | opt[File]('i', "indexDir") action { (x, c) =>
50 | c.copy(indexDir = x)
51 | } text (s"Lucene index directory, default ${defaultCliOption.indexDir}")
52 | opt[Int]('b', "bulk") action { (x, c) =>
53 | c.copy(bulk = x)
54 | } text (s"max addresses client may put in a bulk request, default ${defaultCliOption.bulk}")
55 | opt[Int]('h', "numHits") action { (x, c) =>
56 | c.copy(numHits = x)
57 | } text (s"max client may request for the number of search hits, default ${defaultCliOption.numHits}")
58 | opt[Int]('f', "minFuzzyLength") action { (x, c) =>
59 | c.copy(fuzzyMinLength = x)
60 | } text (s"min client may request for min query term
length for fuzzy match, default ${defaultCliOption.fuzzyMinLength}") 61 | opt[Int]('e', "fuzzyMaxEdits") action { (x, c) => 62 | c.copy(fuzzyMaxEdits = x) 63 | } text (s"max client may request for max edits for a fuzzy match, default ${defaultCliOption.fuzzyMaxEdits}") 64 | opt[Int]('p', "fuzzyPrefixLength") action { (x, c) => 65 | c.copy(fuzzyPrefixLength = x) 66 | } text (s"min client may request for min initial chars that must match exactly for a fuzzy match, default ${defaultCliOption.fuzzyPrefixLength}") 67 | opt[String]('n', "interface") action { (x, c) => 68 | c.copy(interface = x) 69 | } text (s"network interface (name or IP address) to attach to, default ${defaultCliOption.interface}") 70 | opt[Int]('r', "port") action { (x, c) => 71 | c.copy(port = x) 72 | } text (s"IP port to listen on, default ${defaultCliOption.port}") 73 | help("help") text ("prints this usage text") 74 | } 75 | parser.parse(args, defaultCliOption) foreach run 76 | } 77 | 78 | case class Hit(score: Float, json: String, d61Address: List[String], d61AddressNoAlias: String) 79 | def toHit(scoreDoc: ScoreDoc, doc: Document) = { 80 | Hit(scoreDoc.score, doc.get(F_JSON), doc.getValues(F_ADDRESS).toList, doc.get(F_ADDRESS_NOALIAS)) 81 | } 82 | 83 | case class Result(totalHits: Int, elapsedSecs: Float, hits: Seq[Hit], error: Option[String]) 84 | def toResult(totalHits: Int, elapsedSecs: Float, hits: Seq[Hit], error: Option[String]) 85 | = Result(totalHits, elapsedSecs, hits, error) 86 | 87 | def toSort(f: Option[String], asc: Boolean): Option[Sort] = None 88 | 89 | def validationBuf(c: CliOption, qp: QueryParam): ListBuffer[String] = { 90 | val b = new ListBuffer[String]() 91 | if (qp.numHits > c.numHits) b += s"numHits = ${qp.numHits} exceeds max of ${c.numHits}" 92 | qp.fuzzy.foreach { f => 93 | if (f.minLength < c.fuzzyMinLength) b += s"fuzzy minLength = ${f.minLength} less than min of ${c.fuzzyMinLength}" 94 | if (f.maxEdits > c.fuzzyMaxEdits) b += s"fuzzy maxEdits = ${f.maxEdits} exceeds max of ${c.fuzzyMaxEdits}" 95 | if (f.prefixLength < c.fuzzyPrefixLength) b += s"fuzzy prefixLength = ${f.prefixLength} less than min of ${c.fuzzyPrefixLength}" 96 | if (f.prefixLength >= f.minLength) b += s"fuzzy prefixLength = ${f.prefixLength} not less than minLength = ${f.minLength}" 97 | } 98 | b 99 | } 100 | 101 | /** validation error message or empty for no error */ 102 | def validationError(b: ListBuffer[String]) = b.mkString("\n") 103 | 104 | case class BulkQueryParam(addresses: Seq[String], numHits: Int, fuzzy: Option[FuzzyParam], box: Option[BoundingBox]) 105 | 106 | def validationBuf(c: CliOption, bqp: BulkQueryParam): ListBuffer[String] = { 107 | val b = validationBuf(c, QueryParam("", bqp.numHits, bqp.fuzzy, bqp.box)) 108 | if (bqp.addresses.size > c.bulk) b += s"addresses.size = ${bqp.addresses.size} exceeds max of ${c.bulk}" 109 | b 110 | } 111 | 112 | case class Version(`git-commit`: String, `sbt-version`: String, `gnaf-version`: String) 113 | 114 | object JsonProtocol extends DefaultJsonProtocol { 115 | implicit val hitFormat = jsonFormat4(Hit) 116 | implicit val resultFormat = jsonFormat4(Result) 117 | implicit val fuzzyParamFormat = jsonFormat3(FuzzyParam) 118 | implicit val boundingBoxFormat = jsonFormat4(BoundingBox) 119 | implicit val queryParamFormat = jsonFormat4(QueryParam) 120 | implicit val bulkQueryParamFormat = jsonFormat4(BulkQueryParam) 121 | implicit val versionFormat = jsonFormat3(Version) 122 | } 123 | import JsonProtocol._ 124 | 125 | 126 | def mkSearcher(c: CliOption) = { 127 | val s = new 
Searcher(directory(c.indexDir), toHit, toResult) 128 | s.searcher.setSimilarity(GnafSimilarity) 129 | s 130 | } 131 | 132 | def run(c: CliOption) = { 133 | 134 | val version = { 135 | Option(getClass.getResourceAsStream("/version.json")).map { s => 136 | Source.fromInputStream(s).getLines.mkString("\n").parseJson.convertTo[Version] 137 | }.getOrElse(Version("unknown git-commit", "unknown sbt-version", "unknown gnaf-version")) 138 | } 139 | 140 | implicit val sys = ActorSystem() 141 | implicit val exec = sys.dispatcher 142 | implicit val mat = ActorMaterializer() 143 | 144 | val luceneService = new LuceneService(c, mkSearcher(c), version) 145 | 146 | // /api-docs/swagger.json 147 | val swaggerService = new SwaggerHttpService() with HasActorSystem { 148 | override implicit val actorSystem = sys 149 | override implicit val materializer = mat 150 | override val apiTypes = Seq(scala.reflect.runtime.universe.typeOf[LuceneService]) 151 | override def swaggerConfig = new Swagger().basePath(prependSlashIfNecessary(basePath)) // don't specify protocol://host 152 | } 153 | 154 | val routes = cors() { 155 | logRequestResult("LuceneService") { luceneService.routes } ~ 156 | logRequestResult("Swagger") { swaggerService.routes } 157 | } 158 | log.info("starting service ...") 159 | Http().bindAndHandle(routes, c.interface, c.port) 160 | } 161 | } 162 | 163 | import Search._ 164 | import Search.JsonProtocol._ 165 | 166 | @Api(value = "search", produces = "application/json") 167 | @Path("") 168 | class LuceneService(c: CliOption, searcher: Searcher[Hit, Result], version: Version) 169 | (implicit system: ActorSystem, executor: ExecutionContextExecutor, materializer: Materializer) { 170 | 171 | @Path("version") 172 | @ApiOperation(value = "Version of software and data", nickname = "version", notes="""longer description""", httpMethod = "GET", response = classOf[Version]) 173 | def versionRoute = 174 | complete { // Future { 175 | version 176 | } //} 177 | 178 | @Path("search") 179 | @ApiOperation(value = "Search for an address", nickname = "search", notes="""longer description""", httpMethod = "POST", response = classOf[Result]) 180 | def searchRoute( 181 | @ApiParam(value = "queryParam", required = true) q: QueryParam 182 | ) = { 183 | val err = validationError(validationBuf(c, q)) 184 | validate(err.isEmpty, err) { complete { Future { 185 | searcher.search(q.toQuery, q.numHits) 186 | }}} 187 | } 188 | 189 | @Path("bulkSearch") 190 | @ApiOperation(value = "Search for many addresses", nickname = "bulkSearch", notes="""longer description""", httpMethod = "POST", response = classOf[Array[Result]]) 191 | def bulkSearchRoute( 192 | @ApiParam(value = "bulkQueryParam", required = true) q: BulkQueryParam 193 | ) = { 194 | val err = validationError(validationBuf(c, q)) 195 | validate(err.isEmpty, err) { complete { Future { 196 | def seqop(z: Seq[Result], addr: String) = z :+ searcher.search(QueryParam(addr, q.numHits, q.fuzzy, q.box).toQuery, q.numHits) 197 | q.addresses.par.aggregate(Seq.empty[Result])(seqop, _ ++ _) 198 | }}} 199 | } 200 | 201 | val routes = { 202 | pathPrefix("version") { get { versionRoute } } ~ 203 | pathPrefix("search") { (post & entity(as[QueryParam])) { searchRoute } } ~ 204 | pathPrefix("bulkSearch") { (post & entity(as[BulkQueryParam])) { bulkSearchRoute } } 205 | } 206 | 207 | } 208 | -------------------------------------------------------------------------------- /gnaf-contrib/3rd-party-licenses.html: -------------------------------------------------------------------------------- 1 | 
gnaf-contrib-licenses

2 | 3 | Category | License | Dependency | Notes
Apache Apache 2 ch.megard # akka-http-cors_2.11 # 0.1.2 
Apache Apache 2 io.spray # spray-json_2.11 # 1.3.2 
Apache Apache 2 joda-time # joda-time # 2.2 
Apache Apache 2.0 License com.typesafe.scala-logging # scala-logging_2.11 # 3.1.0 
Apache Apache License Version 2.0 org.yaml # snakeyaml # 1.12 
Apache Apache License, Version 2.0 com.typesafe # config # 1.3.0 
Apache Apache License, Version 2.0 com.typesafe # ssl-config-akka_2.11 # 0.2.1 
Apache Apache License, Version 2.0 com.typesafe # ssl-config-core_2.11 # 0.2.1 
Apache Apache License, Version 2.0 com.typesafe.akka # akka-actor_2.11 # 2.4.6 
Apache Apache License, Version 2.0 com.typesafe.akka # akka-http-core_2.11 # 2.4.6 
Apache Apache License, Version 2.0 com.typesafe.akka # akka-http-experimental_2.11 # 2.4.6 
Apache Apache License, Version 2.0 com.typesafe.akka # akka-http-spray-json-experimental_2.11 # 2.4.6 
Apache Apache License, Version 2.0 com.typesafe.akka # akka-http-testkit_2.11 # 2.4.3 
Apache Apache License, Version 2.0 com.typesafe.akka # akka-parsing_2.11 # 2.4.6 
Apache Apache License, Version 2.0 com.typesafe.akka # akka-stream-testkit_2.11 # 2.4.3 
Apache Apache License, Version 2.0 com.typesafe.akka # akka-stream_2.11 # 2.4.6 
Apache Apache License, Version 2.0 com.typesafe.akka # akka-testkit_2.11 # 2.4.3 
Apache The Apache Software License, Version 2.0 com.fasterxml.jackson.core # jackson-annotations # 2.4.5 
Apache The Apache Software License, Version 2.0 com.fasterxml.jackson.core # jackson-core # 2.4.5 
Apache The Apache Software License, Version 2.0 com.fasterxml.jackson.core # jackson-databind # 2.4.5 
Apache The Apache Software License, Version 2.0 com.fasterxml.jackson.dataformat # jackson-dataformat-xml # 2.4.5 
Apache The Apache Software License, Version 2.0 com.fasterxml.jackson.dataformat # jackson-dataformat-yaml # 2.4.5 
Apache The Apache Software License, Version 2.0 com.fasterxml.jackson.datatype # jackson-datatype-joda # 2.4.5 
Apache The Apache Software License, Version 2.0 com.fasterxml.jackson.jaxrs # jackson-jaxrs-base # 2.4.5 
Apache The Apache Software License, Version 2.0 com.fasterxml.jackson.jaxrs # jackson-jaxrs-json-provider # 2.4.5 
Apache The Apache Software License, Version 2.0 com.fasterxml.jackson.module # jackson-module-jaxb-annotations # 2.4.5 
Apache The Apache Software License, Version 2.0 com.fasterxml.jackson.module # jackson-module-scala_2.11 # 2.4.2 
Apache The Apache Software License, Version 2.0 com.github.swagger-akka-http # swagger-akka-http_2.11 # 0.7.0 
Apache The Apache Software License, Version 2.0 com.google.guava # guava # 18.0 
Apache The Apache Software License, Version 2.0 com.zaxxer # HikariCP-java6 # 2.3.7 
Apache The Apache Software License, Version 2.0 io.swagger # swagger-annotations # 1.5.9 
Apache The Apache Software License, Version 2.0 io.swagger # swagger-core # 1.5.9 
Apache The Apache Software License, Version 2.0 io.swagger # swagger-jaxrs # 1.5.9 
Apache The Apache Software License, Version 2.0 io.swagger # swagger-models # 1.5.9 
Apache The Apache Software License, Version 2.0 io.swagger # swagger-scala-module_2.11 # 1.0.2 
Apache The Apache Software License, Version 2.0 javax.validation # validation-api # 1.1.0.Final 
Apache The Apache Software License, Version 2.0 org.apache.commons # commons-lang3 # 3.2.1 
Apache The Apache Software License, Version 2.0 org.javassist # javassist # 3.19.0-GA 
Apache The Apache Software License, Version 2.0 org.json4s # json4s-ast_2.11 # 3.2.11 
Apache The Apache Software License, Version 2.0 org.json4s # json4s-core_2.11 # 3.2.11 
Apache The Apache Software License, Version 2.0 org.json4s # json4s-jackson_2.11 # 3.2.11 
Apache The Apache Software License, Version 2.0 org.json4s # json4s-native_2.11 # 3.2.11 
BSD BSD au.csiro.data61.gnaf # gnaf-util_2.11 # 0.8-SNAPSHOT 
BSD BSD 3-Clause org.scala-lang # scala-compiler # 2.11.8 
BSD BSD 3-Clause org.scala-lang # scala-library # 2.11.8 
BSD BSD 3-Clause org.scala-lang # scala-reflect # 2.11.8 
BSD BSD 3-Clause org.scala-lang # scalap # 2.11.8 
BSD BSD 3-clause org.scala-lang.modules # scala-java8-compat_2.11 # 0.7.0 
BSD BSD 3-clause org.scala-lang.modules # scala-parser-combinators_2.11 # 1.0.4 
BSD BSD 3-clause org.scala-lang.modules # scala-xml_2.11 # 1.0.4 
BSD BSD-Style com.thoughtworks.paranamer # paranamer # 2.6 
BSD The BSD License org.codehaus.woodstox # stax2-api # 3.1.4 
BSD The New BSD License org.reflections # reflections # 0.9.10 
BSD Two-clause BSD-style license com.typesafe.slick # slick-codegen_2.11 # 3.1.1 
BSD Two-clause BSD-style license com.typesafe.slick # slick-hikaricp_2.11 # 3.1.1 
BSD Two-clause BSD-style license com.typesafe.slick # slick_2.11 # 3.1.1 
CC0 CC0 org.reactivestreams # reactive-streams # 1.0.0 
GPL with Classpath Extension CDDL + GPLv2 with classpath exception javax.ws.rs # jsr311-api # 1.1.1 
LGPL EPL + GNU Lesser General Public License ch.qos.logback # logback-classic # 1.1.3 
LGPL EPL + GNU Lesser General Public License ch.qos.logback # logback-core # 1.1.3 
LGPL GNU Lesser General Public License com.google.code.findbugs # annotations # 2.0.1 
LGPL GNU Lesser General Public License com.google.code.findbugs # jsr305 # 2.0.1 
MIT MIT License org.slf4j # slf4j-api # 1.7.12 
Mozilla MPL 2.0 or EPL 1.0 com.h2database # h2 # 1.4.193 
-------------------------------------------------------------------------------- /gnaf-db-service/3rd-party-licenses.html: 1 | gnaf-db-service-licenses

2 | 3 | Category | License | Dependency | Notes
Apache Apache 2 ch.megard # akka-http-cors_2.11 # 0.1.2 
Apache Apache 2 io.spray # spray-json_2.11 # 1.3.2 
Apache Apache 2 joda-time # joda-time # 2.2 
Apache Apache 2.0 License com.typesafe.scala-logging # scala-logging_2.11 # 3.1.0 
Apache Apache License Version 2.0 org.yaml # snakeyaml # 1.12 
Apache Apache License, Version 2.0 com.typesafe # config # 1.3.0 
Apache Apache License, Version 2.0 com.typesafe # ssl-config-akka_2.11 # 0.2.1 
Apache Apache License, Version 2.0 com.typesafe # ssl-config-core_2.11 # 0.2.1 
Apache Apache License, Version 2.0 com.typesafe.akka # akka-actor_2.11 # 2.4.6 
Apache Apache License, Version 2.0 com.typesafe.akka # akka-http-core_2.11 # 2.4.6 
Apache Apache License, Version 2.0 com.typesafe.akka # akka-http-experimental_2.11 # 2.4.6 
Apache Apache License, Version 2.0 com.typesafe.akka # akka-http-spray-json-experimental_2.11 # 2.4.6 
Apache Apache License, Version 2.0 com.typesafe.akka # akka-http-testkit_2.11 # 2.4.3 
Apache Apache License, Version 2.0 com.typesafe.akka # akka-parsing_2.11 # 2.4.6 
Apache Apache License, Version 2.0 com.typesafe.akka # akka-stream-testkit_2.11 # 2.4.3 
Apache Apache License, Version 2.0 com.typesafe.akka # akka-stream_2.11 # 2.4.6 
Apache Apache License, Version 2.0 com.typesafe.akka # akka-testkit_2.11 # 2.4.3 
Apache The Apache Software License, Version 2.0 com.fasterxml.jackson.core # jackson-annotations # 2.4.5 
Apache The Apache Software License, Version 2.0 com.fasterxml.jackson.core # jackson-core # 2.4.5 
Apache The Apache Software License, Version 2.0 com.fasterxml.jackson.core # jackson-databind # 2.4.5 
Apache The Apache Software License, Version 2.0 com.fasterxml.jackson.dataformat # jackson-dataformat-xml # 2.4.5 
Apache The Apache Software License, Version 2.0 com.fasterxml.jackson.dataformat # jackson-dataformat-yaml # 2.4.5 
Apache The Apache Software License, Version 2.0 com.fasterxml.jackson.datatype # jackson-datatype-joda # 2.4.5 
Apache The Apache Software License, Version 2.0 com.fasterxml.jackson.jaxrs # jackson-jaxrs-base # 2.4.5 
Apache The Apache Software License, Version 2.0 com.fasterxml.jackson.jaxrs # jackson-jaxrs-json-provider # 2.4.5 
Apache The Apache Software License, Version 2.0 com.fasterxml.jackson.module # jackson-module-jaxb-annotations # 2.4.5 
Apache The Apache Software License, Version 2.0 com.fasterxml.jackson.module # jackson-module-scala_2.11 # 2.4.2 
Apache The Apache Software License, Version 2.0 com.github.swagger-akka-http # swagger-akka-http_2.11 # 0.7.0 
Apache The Apache Software License, Version 2.0 com.google.guava # guava # 18.0 
Apache The Apache Software License, Version 2.0 com.zaxxer # HikariCP-java6 # 2.3.7 
Apache The Apache Software License, Version 2.0 io.swagger # swagger-annotations # 1.5.9 
Apache The Apache Software License, Version 2.0 io.swagger # swagger-core # 1.5.9 
Apache The Apache Software License, Version 2.0 io.swagger # swagger-jaxrs # 1.5.9 
Apache The Apache Software License, Version 2.0 io.swagger # swagger-models # 1.5.9 
Apache The Apache Software License, Version 2.0 io.swagger # swagger-scala-module_2.11 # 1.0.2 
Apache The Apache Software License, Version 2.0 javax.validation # validation-api # 1.1.0.Final 
Apache The Apache Software License, Version 2.0 org.apache.commons # commons-lang3 # 3.2.1 
Apache The Apache Software License, Version 2.0 org.javassist # javassist # 3.19.0-GA 
Apache The Apache Software License, Version 2.0 org.json4s # json4s-ast_2.11 # 3.2.11 
Apache The Apache Software License, Version 2.0 org.json4s # json4s-core_2.11 # 3.2.11 
Apache The Apache Software License, Version 2.0 org.json4s # json4s-jackson_2.11 # 3.2.11 
Apache The Apache Software License, Version 2.0 org.json4s # json4s-native_2.11 # 3.2.11 
BSD BSD au.csiro.data61.gnaf # gnaf-db_2.11 # 0.8-SNAPSHOT 
BSD BSD au.csiro.data61.gnaf # gnaf-util_2.11 # 0.8-SNAPSHOT 
BSD BSD 3-Clause org.scala-lang # scala-compiler # 2.11.8 
BSD BSD 3-Clause org.scala-lang # scala-library # 2.11.8 
BSD BSD 3-Clause org.scala-lang # scala-reflect # 2.11.8 
BSD BSD 3-Clause org.scala-lang # scalap # 2.11.8 
BSD BSD 3-clause org.scala-lang.modules # scala-java8-compat_2.11 # 0.7.0 
BSD BSD 3-clause org.scala-lang.modules # scala-parser-combinators_2.11 # 1.0.4 
BSD BSD 3-clause org.scala-lang.modules # scala-xml_2.11 # 1.0.4 
BSD BSD-Style com.thoughtworks.paranamer # paranamer # 2.6 
BSD The BSD License org.codehaus.woodstox # stax2-api # 3.1.4 
BSD The New BSD License org.reflections # reflections # 0.9.10 
BSD Two-clause BSD-style license com.typesafe.slick # slick-codegen_2.11 # 3.1.1 
BSD Two-clause BSD-style license com.typesafe.slick # slick-hikaricp_2.11 # 3.1.1 
BSD Two-clause BSD-style license com.typesafe.slick # slick_2.11 # 3.1.1 
CC0 CC0 org.reactivestreams # reactive-streams # 1.0.0 
GPL with Classpath Extension CDDL + GPLv2 with classpath exception javax.ws.rs # jsr311-api # 1.1.1 
LGPL EPL + GNU Lesser General Public License ch.qos.logback # logback-classic # 1.1.3 
LGPL EPL + GNU Lesser General Public License ch.qos.logback # logback-core # 1.1.3 
LGPL GNU Lesser General Public License com.google.code.findbugs # annotations # 2.0.1 
LGPL GNU Lesser General Public License com.google.code.findbugs # jsr305 # 2.0.1 
MIT MIT License org.slf4j # slf4j-api # 1.7.12 
Mozilla MPL 2.0 or EPL 1.0 com.h2database # h2 # 1.4.193 
-------------------------------------------------------------------------------- /gnaf-extractor/src/main/scala/au/csiro/data61/gnaf/extractor/Extractor.scala: -------------------------------------------------------------------------------- 1 | package au.csiro.data61.gnaf.extractor 2 | 3 | import scala.concurrent.{ Await, ExecutionContext, Future }, ExecutionContext.Implicits.global 4 | import scala.concurrent.duration.DurationInt 5 | import scala.language.implicitConversions 6 | import scala.util.{ Failure, Success } 7 | 8 | import com.typesafe.config.{ ConfigFactory, ConfigValueFactory } 9 | 10 | import au.csiro.data61.gnaf.db.GnafTables 11 | import au.csiro.data61.gnaf.util.Gnaf._ 12 | import au.csiro.data61.gnaf.util.Gnaf.JsonProtocol._ 13 | import au.csiro.data61.gnaf.util.Util 14 | import resource.managed 15 | import slick.collection.heterogeneous.HNil 16 | import slick.collection.heterogeneous.syntax.:: 17 | import spray.json.pimpAny 18 | import scala.sys.SystemProperties 19 | 20 | // Organize Imports deletes this, so make it easy to restore ... 21 | // import slick.collection.heterogeneous.syntax.:: 22 | 23 | object Extractor { 24 | val log = Util.getLogger(getClass) 25 | 26 | val config = ConfigFactory.load 27 | 28 | object MyGnafTables extends { 29 | val profile = Util.getObject[slick.driver.JdbcProfile](config.getString("gnafDb.slickDriver")) // e.g. slick.driver.{H2Driver,PostgresDriver} 30 | } with GnafTables 31 | import MyGnafTables._ 32 | import MyGnafTables.profile.api._ 33 | 34 | /** result of command line option processing */ 35 | case class CliOption(dburl: String, localityTimeout: Int, allTimeout: Int) 36 | val defaultCliOption = CliOption(config.getString("gnafDb.url"), config.getInt("gnafDb.localityTimeout"), config.getInt("gnafDb.allTimeout")) 37 | 38 | def main(args: Array[String]): Unit = { 39 | val parser = new scopt.OptionParser[CliOption]("gnaf-extractor") { 40 | head("gnaf-extractor", "0.x") 41 | note("Creates JSON from gnaf database to load into a search engine.") 42 | opt[String]('u', "dburl") action { (x, c) => 43 | c.copy(dburl = x) 44 | } text (s"database URL, default ${defaultCliOption.dburl}") 45 | opt[Int]('l', "localityTimeout") action { (x, c) => 46 | c.copy(localityTimeout = x) 47 | } text (s"timeout in minutes for all queries for a locality, default ${defaultCliOption.localityTimeout}") 48 | opt[Int]('a', "allTimeout") action { (x, c) => 49 | c.copy(allTimeout = x) 50 | } text (s"timeout in minutes for all queries, default ${defaultCliOption.allTimeout}") 51 | help("help") text ("prints this usage text") 52 | } 53 | parser.parse(args, defaultCliOption) foreach run 54 | log.info("complete") 55 | } 56 | 57 | def run(c: CliOption) = { 58 | // configure global thread pool 59 | (new SystemProperties()) ++= Seq( 60 | ("scala.concurrent.context.minThreads", "4"), 61 | ("scala.concurrent.context.numThreads", "4"), 62 | ("scala.concurrent.context.maxThreads", "4") 63 | ) 64 | 65 | val conf = config.withValue("gnafDb.url", ConfigValueFactory.fromAnyRef(c.dburl)) // CliOption.dburl overrides gnafDb.url 66 | for (db <- managed(Database.forConfig("gnafDb", conf))) { 67 | doAll(c)(db) 68 | } 69 | } 70 | 71 | val qAddressDetail = { 72 | def q(localityPid: Rep[String]) = for { 73 | ((((ad, lta), as), sl), adg) <- AddressDetail joinLeft 74 | LevelTypeAut on (_.levelTypeCode === _.code) joinLeft // only 15 rows so keep in memory 75 | AddressSite on (_._1.addressSitePid === _.addressSitePid) joinLeft // ADDRESS_DETAIL.ADDRESS_SITE_PID is NON NULL, so no need for LEFT 
  def run(c: CliOption) = {
    // configure the global thread pool
    (new SystemProperties()) ++= Seq(
      ("scala.concurrent.context.minThreads", "4"),
      ("scala.concurrent.context.numThreads", "4"),
      ("scala.concurrent.context.maxThreads", "4")
    )

    val conf = config.withValue("gnafDb.url", ConfigValueFactory.fromAnyRef(c.dburl)) // CliOption.dburl overrides gnafDb.url
    for (db <- managed(Database.forConfig("gnafDb", conf))) {
      doAll(c)(db)
    }
  }

  val qAddressDetail = {
    def q(localityPid: Rep[String]) = for {
      ((((ad, lta), as), sl), adg) <- AddressDetail joinLeft
        LevelTypeAut on (_.levelTypeCode === _.code) joinLeft // only 15 rows so keep in memory
        AddressSite on (_._1.addressSitePid === _.addressSitePid) joinLeft // ADDRESS_DETAIL.ADDRESS_SITE_PID is NOT NULL, so no need for LEFT JOIN
        StreetLocality on (_._1._1.streetLocalityPid === _.streetLocalityPid) joinLeft
        AddressDefaultGeocode on (_._1._1._1.addressDetailPid === _.addressDetailPid)
      if (ad.localityPid === localityPid && ad.confidence > -1)
    } yield (
      ad,
      lta.map(_.name),
      as.map(_.addressSiteName),
      sl.map(sl => (sl.streetName, sl.streetTypeCode, sl.streetSuffixCode)),
      adg.map(adg => (adg.latitude, adg.longitude)))
    Compiled(q _)
  }

  val qLocalityAliasName = {
    def q(localityPid: Rep[String]) = for (la <- LocalityAlias if la.localityPid === localityPid) yield la.name
    Compiled(q _)
  }
  def localityVariant(localityPid: String)(implicit db: Database): Future[Seq[LocalityVariant]] =
    db.run(qLocalityAliasName(localityPid).result).map(_.map(name => LocalityVariant(name)))

  val qStreetLocalityAlias = {
    def q(streetLocalityPid: Rep[String]) = for (sla <- StreetLocalityAlias if sla.streetLocalityPid === streetLocalityPid) yield (sla.streetName, sla.streetTypeCode, sla.streetSuffixCode)
    Compiled(q _)
  }

  def streetLocalityAlias(streetLocalityPid: Option[String])(implicit db: Database): Future[Seq[(String, Option[String], Option[String])]] = {
    streetLocalityPid.map { pid =>
      db.run(qStreetLocalityAlias(pid).result)
    }.getOrElse(Future(Seq.empty))
  }

  type FutStrMap = Future[Map[String, String]]

  def doAll(c: CliOption)(implicit db: Database) = {
    // These code -> name mappings are all small enough to keep in memory
    val stateMap: Future[Map[String, (String, String)]] = db.run((for (s <- State) yield s.statePid -> (s.stateAbbreviation, s.stateName)).result).map(_.toMap)
    val flatTypeMap: FutStrMap = db.run((for (f <- FlatTypeAut) yield f.code -> f.name).result).map(_.toMap)
    val streetTypeMap: FutStrMap = db.run((for (s <- StreetTypeAut) yield s.code -> s.name).result).map(_.toMap)
    val streetSuffixMap: FutStrMap = db.run((for (s <- StreetSuffixAut) yield s.code -> s.name).result).map(_.toMap)

    val localities: Future[Seq[(String, String, String)]] = db.run((for (loc <- Locality if loc.localityClassCode === 'G') yield (loc.localityPid, loc.localityName, loc.statePid)).result)
    val done: Future[Unit] = localities.flatMap { seq =>
      log.info("got all localities")
      val seqFut: Seq[Future[Unit]] = seq.map {
        case (localityPid, localityName, statePid) =>
          val locDone = doLocality(localityPid, localityName, statePid, stateMap, flatTypeMap, streetTypeMap, streetSuffixMap)
          Await.result(locDone, c.localityTimeout.minute) // without this it runs out of memory before outputting anything!
          locDone
      }
      Future.fold(seqFut)(())((_, _) => ())
    }
    Await.result(done, c.allTimeout.minute)
    log info "all done"
  }

  /*
  When I try to stream all AddressDetail rows, I don't get any rows in a reasonable time (it seems to hang but the CPU is busy).

  http://stackoverflow.com/questions/24787119/how-to-set-h2-to-stream-resultset
  H2 currently does not support server-side cursors. However, it buffers large result sets to disk (as a separate file, or as a temporary table). The disadvantage is speed, but it should not be a memory usage problem.

  You can set the number of rows at which H2 will buffer to disk using set max_memory_rows. You can append that to the database URL: jdbc:h2:~/test;max_memory_rows=200000.

  A workaround is usually to use "keyset paging" as described in the presentation "Pagination Done the Right Way". That would mean running multiple queries instead of one.

  http://www.h2database.com/html/advanced.html
  Before the result is returned to the application, all rows are read by the database. Server side cursors are not supported currently.

  http://www.h2database.com/javadoc/org/h2/engine/SysProperties.html?highlight=max_memory_rows&search=max_memory_rows#h2.maxMemoryRows
  System property h2.maxMemoryRows (default: 40000 per GB of available RAM).

  So if we set -Xmx3G and partition by LOCALITY_PID we should be OK:
  there are 16398 LOCALITY rows and the max ADDRESS_DETAILs for a LOCALITY is 95004 (Feb 2016; 105960 in Nov 2017):
  SELECT LOCALITY_PID, count(*) cnt FROM ADDRESS_DETAIL GROUP BY LOCALITY_PID ORDER BY cnt DESC LIMIT 3;

  LOCALITY_PID  CNT Feb 2016  CNT Nov 2017
  VIC1634       95004         105960
  NSW3749       44656         45502
  QLD2772       34712         39162

  http://slick.typesafe.com/doc/3.1.1/dbio.html
  Slick's Database.stream produces a `Reactive Stream` that can be consumed with a foreach that takes a callback for each row.
  Since H2 is providing all the rows at once (see above):
  - the callback is called for multiple rows at once
  - concurrency is limited only by the number of threads
  - all the other callbacks are queued on the thread pool, preventing anything else from running on this pool.
  It's better to use Database.run to get all the rows at once, allow H2 to release any resources, and to have some control over the
  concurrency of processing the rows.
  */
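  // --- Illustration added in editing, not part of the original source. ---
  // A minimal sketch of the "keyset paging" alternative mentioned above, assuming the
  // sortable key ADDRESS_DETAIL_PID; `pageAfter` is a hypothetical helper, not used by
  // this extractor. Each query resumes after the last key already seen, so H2 never
  // needs to buffer more than `pageSize` rows per query:
  //
  //   def pageAfter(lastPid: String, pageSize: Int) =
  //     AddressDetail.filter(_.addressDetailPid > lastPid)
  //       .sortBy(_.addressDetailPid)
  //       .take(pageSize)
  //       .result // run repeatedly, feeding in the last pid of the previous page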
  def doLocality(
    localityPid: String, localityName: String, statePid: String,
    stateMap: Future[Map[String, (String, String)]], flatTypeMap: FutStrMap, streetTypeMap: FutStrMap, streetSuffixMap: FutStrMap
  )(
    implicit db: Database
  ): Future[Unit] = {
    val state = stateMap.map(_.apply(statePid))
    val locVariant = localityVariant(localityPid)

    log.info(s"starting locality $localityName")
    db.run(qAddressDetail(localityPid).result).flatMap { seq =>
      log.info(s"got all addresses for locality $localityName")

      val seqFut: Seq[Future[Address]] = seq.map {
        case (
          // copied from AddressDetail.*
          addressDetailPid :: dateCreated :: dateLastModified :: dateRetired :: buildingName :: lotNumberPrefix :: lotNumber :: lotNumberSuffix ::
            flatTypeCode :: flatNumberPrefix :: flatNumber :: flatNumberSuffix ::
            levelTypeCode :: levelNumberPrefix :: levelNumber :: levelNumberSuffix ::
            numberFirstPrefix :: numberFirst :: numberFirstSuffix ::
            numberLastPrefix :: numberLast :: numberLastSuffix ::
            streetLocalityPid :: locationDescription :: localityPid :: aliasPrincipal :: postcode :: privateStreet :: legalParcelId :: confidence ::
            addressSitePid :: levelGeocodedCode :: propertyPid :: gnafPropertyPid :: primarySecondary :: HNil,
          levelTypeName,
          addressSiteName,
          street,
          location
          ) =>

          val addr: Future[Address] = for {
            (stateAbbreviation, stateName) <- state
            ftm <- flatTypeMap
            stm <- streetTypeMap
            ssm <- streetSuffixMap
            locVar <- locVariant
            sla <- streetLocalityAlias(streetLocalityPid)
          } yield Address(
            addressDetailPid, addressSiteName.flatten, buildingName,
            flatTypeCode, flatTypeCode.map(ftm), PreNumSuf(flatNumberPrefix, flatNumber, flatNumberSuffix),
            levelTypeCode, levelTypeName, PreNumSuf(levelNumberPrefix, levelNumber, levelNumberSuffix),
            PreNumSuf(numberFirstPrefix, numberFirst, numberFirstSuffix),
            PreNumSuf(numberLastPrefix, numberLast, numberLastSuffix),
            street.map(s => Street(s._1, s._2, s._2.map(stm), s._3, s._3.map(ssm))),
            localityName, stateAbbreviation, stateName, postcode,
            aliasPrincipal, primarySecondary,
            location.flatMap {
              case (Some(lat), Some(lon)) => Some(Location(lat, lon))
              case _ => None
            },
            sla.map(s => Street(s._1, s._2, s._2.map(stm), s._3, s._3.map(ssm))),
            locVar)

          addr.onComplete {
            case Success(a) => println(a.toJson.compactPrint) // println appears to be synchronized
            case Failure(e) => log.error(s"future address for $addressDetailPid failed", e)
          }

          /*
           * Trying to use small bounded thread pools I got:
           * 12:50:59.843 [Pool-2-thread-2] ERROR au.com.data61.gnaf.indexer.Main. - future address for GAACT715082885 failed
           * java.util.concurrent.RejectedExecutionException: Task slick.backend.DatabaseComponent$DatabaseDef$$anon$2@1dbaddc0 rejected from
           * java.util.concurrent.ThreadPoolExecutor@2bc930eb[Running, pool size = 3, active threads = 3, queued tasks = 987, completed tasks = 10]
           *
           * The only pool with a queue size of 987 and 3 threads is the slick pool configured in application.conf.
           * I tried explicit flatMaps instead of for, with an explicit ExecutionContext, but it still used the slick pool!
           */
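          // --- Illustration added in editing, not part of the original source. ---
          // What "an explicit ExecutionContext" above refers to: each combinator can be
          // handed its own pool instead of the implicit one. A generic sketch (`myPool`
          // and `mkFutureB` are hypothetical); note that Slick still runs the database
          // action on its own AsyncExecutor pool, only the continuation moves to myPool:
          //
          //   val myPool: ExecutionContext = ExecutionContext.fromExecutor(
          //     java.util.concurrent.Executors.newFixedThreadPool(4))
          //   futureA.flatMap(a => mkFutureB(a))(myPool) // continuation scheduled on myPool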
          addr
      }

      val locDone = Future.fold(seqFut)(())((_, _) => ())
      locDone.onComplete {
        case Success(_) => log.info(s"completed locality $localityName")
        case Failure(e) => log.error(s"future locality $localityName failed", e)
      }
      locDone
    }
  }

}
--------------------------------------------------------------------------------
/gnaf-search/3rd-party-licenses.html:
--------------------------------------------------------------------------------

gnaf-search-licenses

Category | License | Dependency | Notes
Apache | Apache 2 | ch.megard # akka-http-cors_2.11 # 0.1.2 |
Apache | Apache 2 | io.spray # spray-json_2.11 # 1.3.2 |
Apache | Apache 2 | joda-time # joda-time # 2.2 |
Apache | Apache 2.0 License | com.typesafe.scala-logging # scala-logging_2.11 # 3.1.0 |
Apache | Apache License Version 2.0 | org.yaml # snakeyaml # 1.12 |
Apache | Apache License, Version 2.0 | com.typesafe # config # 1.3.0 |
Apache | Apache License, Version 2.0 | com.typesafe # ssl-config-akka_2.11 # 0.2.1 |
Apache | Apache License, Version 2.0 | com.typesafe # ssl-config-core_2.11 # 0.2.1 |
Apache | Apache License, Version 2.0 | com.typesafe.akka # akka-actor_2.11 # 2.4.6 |
Apache | Apache License, Version 2.0 | com.typesafe.akka # akka-http-core_2.11 # 2.4.6 |
Apache | Apache License, Version 2.0 | com.typesafe.akka # akka-http-experimental_2.11 # 2.4.6 |
Apache | Apache License, Version 2.0 | com.typesafe.akka # akka-http-spray-json-experimental_2.11 # 2.4.6 |
Apache | Apache License, Version 2.0 | com.typesafe.akka # akka-http-testkit_2.11 # 2.4.3 |
Apache | Apache License, Version 2.0 | com.typesafe.akka # akka-parsing_2.11 # 2.4.6 |
Apache | Apache License, Version 2.0 | com.typesafe.akka # akka-stream-testkit_2.11 # 2.4.3 |
Apache | Apache License, Version 2.0 | com.typesafe.akka # akka-stream_2.11 # 2.4.6 |
Apache | Apache License, Version 2.0 | com.typesafe.akka # akka-testkit_2.11 # 2.4.3 |
Apache | The Apache Software License, Version 2.0 | com.fasterxml.jackson.core # jackson-annotations # 2.4.5 |
Apache | The Apache Software License, Version 2.0 | com.fasterxml.jackson.core # jackson-core # 2.4.5 |
Apache | The Apache Software License, Version 2.0 | com.fasterxml.jackson.core # jackson-databind # 2.4.5 |
Apache | The Apache Software License, Version 2.0 | com.fasterxml.jackson.dataformat # jackson-dataformat-xml # 2.4.5 |
Apache | The Apache Software License, Version 2.0 | com.fasterxml.jackson.dataformat # jackson-dataformat-yaml # 2.4.5 |
Apache | The Apache Software License, Version 2.0 | com.fasterxml.jackson.datatype # jackson-datatype-joda # 2.4.5 |
Apache | The Apache Software License, Version 2.0 | com.fasterxml.jackson.jaxrs # jackson-jaxrs-base # 2.4.5 |
Apache | The Apache Software License, Version 2.0 | com.fasterxml.jackson.jaxrs # jackson-jaxrs-json-provider # 2.4.5 |
Apache | The Apache Software License, Version 2.0 | com.fasterxml.jackson.module # jackson-module-jaxb-annotations # 2.4.5 |
Apache | The Apache Software License, Version 2.0 | com.fasterxml.jackson.module # jackson-module-scala_2.11 # 2.4.2 |
Apache | The Apache Software License, Version 2.0 | com.github.swagger-akka-http # swagger-akka-http_2.11 # 0.7.0 |
Apache | The Apache Software License, Version 2.0 | com.google.guava # guava # 18.0 |
Apache | The Apache Software License, Version 2.0 | io.swagger # swagger-annotations # 1.5.9 |
Apache | The Apache Software License, Version 2.0 | io.swagger # swagger-core # 1.5.9 |
Apache | The Apache Software License, Version 2.0 | io.swagger # swagger-jaxrs # 1.5.9 |
Apache | The Apache Software License, Version 2.0 | io.swagger # swagger-models # 1.5.9 |
Apache | The Apache Software License, Version 2.0 | io.swagger # swagger-scala-module_2.11 # 1.0.2 |
Apache | The Apache Software License, Version 2.0 | javax.validation # validation-api # 1.1.0.Final |
Apache | The Apache Software License, Version 2.0 | org.apache.commons # commons-lang3 # 3.2.1 |
Apache | The Apache Software License, Version 2.0 | org.apache.lucene # lucene-analyzers-common # 6.2.1 |
Apache | The Apache Software License, Version 2.0 | org.apache.lucene # lucene-core # 6.2.1 |
Apache | The Apache Software License, Version 2.0 | org.javassist # javassist # 3.18.2-GA |
Apache | The Apache Software License, Version 2.0 | org.json4s # json4s-ast_2.11 # 3.2.11 |
Apache | The Apache Software License, Version 2.0 | org.json4s # json4s-core_2.11 # 3.2.11 |
Apache | The Apache Software License, Version 2.0 | org.json4s # json4s-jackson_2.11 # 3.2.11 |
Apache | The Apache Software License, Version 2.0 | org.json4s # json4s-native_2.11 # 3.2.11 |
Apache | the Apache License, ASL Version 2.0 | org.scalactic # scalactic_2.11 # 3.0.0 |
Apache | the Apache License, ASL Version 2.0 | org.scalatest # scalatest_2.11 # 3.0.0 |
BSD | BSD | au.csiro.data61.gnaf # gnaf-lucene_2.11 # 0.8-SNAPSHOT |
BSD | BSD | au.csiro.data61.gnaf # gnaf-util_2.11 # 0.8-SNAPSHOT |
BSD | BSD 3-Clause | org.scala-lang # scala-compiler # 2.11.8 |
BSD | BSD 3-Clause | org.scala-lang # scala-library # 2.11.8 |
BSD | BSD 3-Clause | org.scala-lang # scala-reflect # 2.11.8 |
BSD | BSD 3-Clause | org.scala-lang # scalap # 2.11.8 |
BSD | BSD 3-clause | org.scala-lang.modules # scala-java8-compat_2.11 # 0.7.0 |
BSD | BSD 3-clause | org.scala-lang.modules # scala-parser-combinators_2.11 # 1.0.4 |
BSD | BSD 3-clause | org.scala-lang.modules # scala-xml_2.11 # 1.0.4 |
BSD | BSD 3-clause | org.scala-lang.modules # scala-xml_2.11 # 1.0.5 |
BSD | BSD-Style | com.jsuereth # scala-arm_2.11 # 2.0.0-M1 |
BSD | BSD-Style | com.thoughtworks.paranamer # paranamer # 2.6 |
BSD | The BSD License | org.codehaus.woodstox # stax2-api # 3.1.4 |
BSD | The New BSD License | org.reflections # reflections # 0.9.10 |
CC0 | CC0 | org.reactivestreams # reactive-streams # 1.0.0 |
GPL with Classpath Extension | CDDL + GPLv2 with classpath exception | javax.ws.rs # jsr311-api # 1.1.1 |
LGPL | EPL + GNU Lesser General Public License | ch.qos.logback # logback-classic # 1.1.3 |
LGPL | EPL + GNU Lesser General Public License | ch.qos.logback # logback-core # 1.1.3 |
LGPL | GNU Lesser General Public License | com.google.code.findbugs # annotations # 2.0.1 |
LGPL | GNU Lesser General Public License | com.google.code.findbugs # jsr305 # 2.0.1 |
MIT | MIT License | com.github.scopt # scopt_2.11 # 3.3.0 |
MIT | MIT License | org.slf4j # slf4j-api # 1.7.12 |
--------------------------------------------------------------------------------