├── .gitignore
├── LICENSE.txt
├── README.md
├── build.sbt
├── build.sh
├── dataFusion-common
    ├── 3rd-party-licenses.md
    ├── README.md
    ├── build.sbt
    ├── doc
    │   └── dataFusion.zargo
    └── src
    │   ├── main
    │       └── scala
    │       │   └── au
    │       │       └── csiro
    │       │           └── data61
    │       │               └── dataFusion
    │       │                   └── common
    │       │                       ├── CSV.scala
    │       │                       ├── Data.scala
    │       │                       ├── EnglishScore.scala
    │       │                       ├── Parallel.scala
    │       │                       ├── Timer.scala
    │       │                       └── Util.scala
    │   └── test
    │       ├── resources
    │           └── logback-test.xml
    │       └── scala
    │           └── au
    │               └── csiro
    │                   └── data61
    │                       └── dataFusion
    │                           └── common
    │                               ├── EnglishScoreTest.scala
    │                               ├── JsonTest.scala
    │                               ├── ParallelTest.scala
    │                               └── UtilTest.scala
├── dataFusion-db-service
    ├── 3rd-party-licenses.md
    ├── README.md
    ├── build.sbt
    └── src
    │   ├── main
    │       ├── resources
    │       │   ├── application.conf
    │       │   └── logback.xml
    │       └── scala
    │       │   └── au
    │       │       └── csiro
    │       │           └── data61
    │       │               └── dataFusion
    │       │                   └── db
    │       │                       └── service
    │       │                           ├── DbService.scala
    │       │                           └── Main.scala
    │   └── test
    │       └── resources
    │           └── logback-test.xml
├── dataFusion-db
    ├── 3rd-party-licenses.md
    ├── README.md
    ├── build.sbt
    └── src
    │   ├── main
    │       ├── resources
    │       │   ├── application.conf
    │       │   └── logback.xml
    │       └── scala
    │       │   └── au
    │       │       └── csiro
    │       │           └── data61
    │       │               └── dataFusion
    │       │                   └── db
    │       │                       ├── Main.scala
    │       │                       └── Tables.scala
    │   └── test
    │       └── resources
    │           └── logback-test.xml
├── dataFusion-graph-service
    ├── 3rd-party-licenses.md
    ├── README.md
    ├── build.sbt
    └── src
    │   ├── main
    │       ├── resources
    │       │   ├── application.conf
    │       │   └── logback.xml
    │       └── scala
    │       │   └── au
    │       │       └── csiro
    │       │           └── data61
    │       │               └── dataFusion
    │       │                   └── graph
    │       │                       └── service
    │       │                           └── Main.scala
    │   └── test
    │       ├── resources
    │           ├── edge.json
    │           ├── logback-test.xml
    │           └── node.json
    │       └── scala
    │           └── au
    │               └── csiro
    │                   └── data61
    │                       └── dataFusion
    │                           └── graph
    │                               └── service
    │                                   └── MainTest.scala
├── dataFusion-ner-service
    ├── 3rd-party-licenses.md
    ├── README.md
    ├── build.sbt
    └── src
    │   └── main
    │       ├── resources
    │           ├── application.conf
    │           └── logback.xml
    │       └── scala
    │           └── au
    │               └── csiro
    │                   └── data61
    │                       └── dataFusion
    │                           └── ner
    │                               └── service
    │                                   └── Main.scala
├── dataFusion-ner
    ├── 3rd-party-licenses.md
    ├── MITIE-native
    │   ├── centos
    │   │   └── libjavamitie.so
    │   └── ubuntu
    │   │   └── libjavamitie.so
    ├── README.md
    ├── build-MITIE.sh
    ├── build.sbt
    ├── lib
    │   └── javamitie.jar
    └── src
    │   ├── main
    │       ├── resources
    │       │   ├── application.conf
    │       │   ├── logback.xml
    │       │   └── opennlp-models-1.5
    │       │   │   ├── en-ner-date.bin
    │       │   │   ├── en-ner-location.bin
    │       │   │   ├── en-ner-money.bin
    │       │   │   ├── en-ner-organization.bin
    │       │   │   ├── en-ner-percentage.bin
    │       │   │   ├── en-ner-person.bin
    │       │   │   ├── en-ner-time.bin
    │       │   │   ├── en-sent.bin
    │       │   │   └── en-token.bin
    │       └── scala
    │       │   └── au
    │       │       └── csiro
    │       │           └── data61
    │       │               └── dataFusion
    │       │                   └── ner
    │       │                       ├── CoreNLP.scala
    │       │                       ├── MITIE.scala
    │       │                       ├── Main.scala
    │       │                       ├── OpenNLP.scala
    │       │                       └── Split.scala
    │   └── test
    │       ├── resources
    │           └── logback-test.xml
    │       └── scala
    │           └── au
    │               └── csiro
    │                   └── data61
    │                       └── dataFusion
    │                           └── ner
    │                               ├── CoreNLPTest.scala
    │                               ├── MITIETest.scala
    │                               ├── OpenNLPTest.scala
    │                               └── SplitTest.scala
├── dataFusion-search-service
    ├── 3rd-party-licenses.md
    ├── README.md
    ├── build.sbt
    └── src
    │   ├── main
    │       ├── resources
    │       │   ├── application.conf
    │       │   └── logback.xml
    │       └── scala
    │       │   └── au
    │       │       └── csiro
    │       │           └── data61
    │       │               └── dataFusion
    │       │                   └── search
    │       │                       └── service
    │       │                           └── Main.scala
    │   └── test
    │       └── resources
    │           └── logback-test.xml
├── dataFusion-search
    ├── 3rd-party-licenses.md
    ├── README.md
    ├── build.sbt
    ├── src
    │   ├── main
    │   │   ├── resources
    │   │   │   ├── application.conf
    │   │   │   └── logback.xml
    │   │   └── scala
    │   │   │   └── au
    │   │   │       └── csiro
    │   │   │           └── data61
    │   │   │               └── dataFusion
    │   │   │                   └── search
    │   │   │                       ├── DataFusionLucene.scala
    │   │   │                       ├── DocFreq.scala
    │   │   │                       ├── Indexer.scala
    │   │   │                       ├── LuceneUtil.scala
    │   │   │                       ├── Main.scala
    │   │   │                       └── Search.scala
    │   └── test
    │   │   ├── resources
    │   │       └── logback-test.xml
    │   │   └── scala
    │   │       └── au
    │   │           └── csiro
    │   │               └── data61
    │   │                   └── dataFusion
    │   │                       └── search
    │   │                           ├── DataFusionLuceneTest.scala
    │   │                           ├── JsonTest.scala
    │   │                           └── SearchTest.scala
    └── synonyms.txt
├── dataFusion-tika-service
    ├── 3rd-party-licenses.md
    ├── README.md
    ├── build.sbt
    └── src
    │   ├── main
    │       ├── resources
    │       │   ├── application.conf
    │       │   └── logback.xml
    │       └── scala
    │       │   └── au
    │       │       └── csiro
    │       │           └── data61
    │       │               └── dataFusion
    │       │                   └── tika
    │       │                       └── service
    │       │                           └── Main.scala
    │   └── test
    │       └── resources
    │           └── logback-test.xml
├── dataFusion-tika
    ├── 3rd-party-licenses.md
    ├── README.md
    ├── build.sbt
    └── src
    │   ├── main
    │       ├── java
    │       │   └── org
    │       │   │   └── apache
    │       │   │       └── tika
    │       │   │           └── parser
    │       │   │               └── ocr
    │       │   │                   └── TesseractOCRParser.java
    │       ├── resources
    │       │   ├── META-INF
    │       │   │   └── services
    │       │   │   │   ├── javax.imageio.spi.ImageReaderSpi
    │       │   │   │   └── javax.imageio.spi.ImageWriterSpi
    │       │   ├── application.conf
    │       │   ├── logback.xml
    │       │   └── org
    │       │   │   └── apache
    │       │   │       └── tika
    │       │   │           └── parser
    │       │   │               ├── ocr
    │       │   │                   ├── TesseractOCRConfig.properties
    │       │   │                   └── rotation.py
    │       │   │               └── pdf
    │       │   │                   └── PDFParser.properties
    │       └── scala
    │       │   └── au
    │       │       └── csiro
    │       │           └── data61
    │       │               └── dataFusion
    │       │                   └── tika
    │       │                       ├── LangDetect.scala
    │       │                       ├── Main.scala
    │       │                       └── TikaUtil.scala
    │   └── test
    │       ├── resources
    │           ├── exampleData
    │           │   ├── AAA.pptx
    │           │   ├── Email001.msg
    │           │   ├── PDF001.pdf
    │           │   ├── PDF002.pdf
    │           │   ├── PDF003.pdf
    │           │   ├── PDF004.pdf
    │           │   ├── README.txt
    │           │   ├── TIF001.tif
    │           │   ├── TIF002.tif
    │           │   ├── TIF003.tif
    │           │   ├── Thumbs.db
    │           │   ├── data-prob-2-12.XLS
    │           │   ├── doc001.doc
    │           │   ├── doc002.doc
    │           │   ├── html001.html
    │           │   ├── image001.png
    │           │   ├── image002.gif
    │           │   ├── image003.jpeg
    │           │   ├── image004.png
    │           │   ├── rtf001.rtf
    │           │   └── xls001.xls
    │           └── logback-test.xml
    │       └── scala
    │           └── au
    │               └── csiro
    │                   └── data61
    │                       └── dataFusion
    │                           └── tika
    │                               └── TikaTest.scala
├── dataFusion-util
    ├── 3rd-party-licenses.md
    ├── README.md
    ├── build.sbt
    └── src
    │   ├── main
    │       ├── resources
    │       │   ├── application.conf
    │       │   └── logback.xml
    │       └── scala
    │       │   └── au
    │       │       └── csiro
    │       │           └── data61
    │       │               └── dataFusion
    │       │                   └── util
    │       │                       ├── Age.scala
    │       │                       ├── Email.scala
    │       │                       ├── Hits.scala
    │       │                       ├── Main.scala
    │       │                       ├── Proximity.scala
    │       │                       └── TmNer.scala
    │   └── test
    │       ├── resources
    │           └── logback-test.xml
    │       └── scala
    │           └── au
    │               └── csiro
    │                   └── data61
    │                       └── dataFusion
    │                           └── util
    │                               ├── AgeTest.scala
    │                               ├── EmailTest.scala
    │                               ├── HitsTest.scala
    │                               └── ProximityTest.scala
├── docker
    ├── .dockerignore
    ├── Dockerfile-centos
    └── Dockerfile-ubuntu
├── images
    ├── JSONFormatsUML.png
    ├── JSONFormatsUML.svg
    ├── dataFusion.zargo
    ├── datafusion.png
    ├── datafusion.svg
    └── network.png
├── project
    ├── build.properties
    └── plugins.sbt
├── sh
    ├── dfus
    ├── setenv.centos
    ├── setenv.ubuntu
    └── tesseract4.sh
├── ui
    ├── README.md
    ├── bubble
    │   ├── css
    │   │   └── index.css
    │   ├── data
    │   │   └── data.json
    │   ├── images
    │   │   ├── csiro-black.png
    │   │   └── data61-logo.png
    │   ├── index.html
    │   └── js
    │   │   ├── bubble.js
    │   │   ├── d3-selection-multi.v1.min.js
    │   │   ├── d3.v4.min.js
    │   │   ├── form.js
    │   │   ├── index.js
    │   │   └── network.js
    └── network
    │   ├── d3.v4.min.js
    │   ├── graph.css
    │   ├── graph.js
    │   └── index.html
└── version.sbt


/.gitignore:
--------------------------------------------------------------------------------
 1 | # Logs
 2 | logs
 3 | *.log
 4 | npm-debug.log*
 5 | 
 6 | # Runtime data
 7 | pids
 8 | *.pid
 9 | *.seed
10 | 
11 | # Directory for instrumented libs generated by jscoverage/JSCover
12 | lib-cov
13 | 
14 | # Coverage directory used by tools like istanbul
15 | coverage
16 | 
17 | # nyc test coverage
18 | .nyc_output
19 | 
20 | # Grunt intermediate storage (http://gruntjs.com/creating-plugins#storing-task-files)
21 | .grunt
22 | 
23 | # node-waf configuration
24 | .lock-wscript
25 | 
26 | # Compiled binary addons (http://nodejs.org/api/addons.html)
27 | build/Release
28 | 
29 | # Dependency directories
30 | node_modules
31 | jspm_packages
32 | 
33 | # Optional npm cache directory
34 | .npm
35 | 
36 | # Optional REPL history
37 | .node_repl_history
38 | 
39 | # sbt and eclipse
40 | .classpath
41 | .project
42 | .settings/
43 | bin/
44 | test-bin/
45 | .cache-main
46 | .cache-tests
47 | target/
48 | 
49 | # project generated CSV files, NER models, Lucene indices, H2 test database etc.
50 | dataFusion-ner/MITIE/
51 | dataFusion-ner/MITIE-models/
52 | *.csv
53 | dataFusion-search/*Index/
54 | dataFusion.mv.db
55 | 
56 | ui/swagger-ui-3.3.2/
57 | 


--------------------------------------------------------------------------------
/build.sh:
--------------------------------------------------------------------------------
 1 | #! /bin/sh
 2 | 
 3 | set -vex
 4 | 
 5 | # ubuntu has: ID=ubuntu but centos has: ID="centos"
 6 | OS=`sed --regexp-extended --quiet 's/^ID="?([a-z]+)"?$/\1/p' /etc/os-release`
 7 | 
 8 | # build MITIE (native code used by dataFusion-ner)
 9 | # do as little as necessary by default, add --clean option to do everything from scratch
10 | cd dataFusion-ner
11 | ./build-MITIE.sh # --clean
12 | cd ..
13 | 
14 | # set environment
15 | . ./sh/setenv.$OS
16 | 
17 | # run Scala build
18 | sbt one-jar                                                        # minimal, or
19 | # sbt -J-Xmx3G clean test publish-local one-jar dumpLicenseReport  # the works
20 | # move/rename the license reports
21 | # for i in */target/license-reports/*.md; do cp $i ${i%%/*}/3rd-party-licenses.md; done
22 | 
23 | 


--------------------------------------------------------------------------------
/dataFusion-common/3rd-party-licenses.md:
--------------------------------------------------------------------------------
 1 | # datafusion-common-licenses
 2 | 
 3 | Category | License | Dependency | Notes
 4 | --- | --- | --- | ---
 5 | Apache | [Apache 2](http://www.apache.org/licenses/LICENSE-2.0.txt) | io.spray # spray-json_2.12 # 1.3.3 | <notextile></notextile>
 6 | Apache | [Apache 2.0 License](http://www.apache.org/licenses/LICENSE-2.0.html) | com.typesafe.scala-logging # scala-logging_2.12 # 3.5.0 | <notextile></notextile>
 7 | Apache | [the Apache License, ASL Version 2.0](http://www.apache.org/licenses/LICENSE-2.0) | org.scalactic # scalactic_2.12 # 3.0.3 | <notextile></notextile>
 8 | Apache | [the Apache License, ASL Version 2.0](http://www.apache.org/licenses/LICENSE-2.0) | org.scalatest # scalatest_2.12 # 3.0.3 | <notextile></notextile>
 9 | BSD | [BSD 3-Clause](http://www.scala-lang.org/license.html) | org.scala-lang # scala-library # 2.12.2 | <notextile></notextile>
10 | BSD | [BSD 3-Clause](http://www.scala-lang.org/license.html) | org.scala-lang # scala-reflect # 2.12.2 | <notextile></notextile>
11 | BSD | [BSD 3-clause](http://opensource.org/licenses/BSD-3-Clause) | org.scala-lang.modules # scala-parser-combinators_2.12 # 1.0.4 | <notextile></notextile>
12 | BSD | [BSD 3-clause](http://opensource.org/licenses/BSD-3-Clause) | org.scala-lang.modules # scala-xml_2.12 # 1.0.5 | <notextile></notextile>
13 | LGPL | [EPL + GNU Lesser General Public License](http://logback.qos.ch/license.html) | ch.qos.logback # logback-classic # 1.2.3 | <notextile></notextile>
14 | LGPL | [EPL + GNU Lesser General Public License](http://logback.qos.ch/license.html) | ch.qos.logback # logback-core # 1.2.3 | <notextile></notextile>
15 | MIT | [MIT License](http://www.slf4j.org/license.html) | org.slf4j # slf4j-api # 1.7.25 | <notextile></notextile>
16 | 
17 | 


--------------------------------------------------------------------------------
/dataFusion-common/build.sbt:
--------------------------------------------------------------------------------
 1 | name := "dataFusion-common"
 2 | 
 3 | libraryDependencies ++= Seq(
 4 |   "io.spray" %% "spray-json" % "1.3.3",
 5 |   // "io.swagger" % "swagger-annotations" % "1.5.12",
 6 |   "com.typesafe.scala-logging" %% "scala-logging" % "3.5.0",
 7 |   "ch.qos.logback" % "logback-classic" % "1.2.3",
 8 |   "org.scalatest" %% "scalatest" % "3.0.3" % "test"
 9 | )
10 | 


--------------------------------------------------------------------------------
/dataFusion-common/doc/dataFusion.zargo:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/data61/dataFusion/4b766ae65254eed6c5a095f3bb8b6b42b9500a9c/dataFusion-common/doc/dataFusion.zargo


--------------------------------------------------------------------------------
/dataFusion-common/src/main/scala/au/csiro/data61/dataFusion/common/CSV.scala:
--------------------------------------------------------------------------------
 1 | package au.csiro.data61.dataFusion.common
 2 | 
 3 | object CSV {
 4 |   
 5 |   /**
 6 |    * return the indices of the fields for: id, organisation name and person's: family, first given and other given names.
 7 |    * @param csvHdr the header line from the CSV file
 8 |    */
 9 |   def csvHeaderToIndices(delim: Char, fields: Seq[String], hdr: String): Seq[Int] = {
10 |     val hdrs = hdr.toUpperCase.split(delim)
11 |     val fieldsUp = fields.map(_.toUpperCase)
12 |     val idx = fieldsUp map hdrs.indexOf
13 |     val missing = for ((f, i) <- fields zip idx if i == -1) yield f
14 |     if (!missing.isEmpty) throw new Exception(s"CSV header is missing fields: ${missing.mkString(",")}")
15 |     idx
16 |   }
17 |   
18 |   /**
19 |    * Process the header line from iter and return a function to map the remaining lines to a seq of string data in same order as fields.
20 |    * (Done this way to allow the function to be applied to different lines in parallel). 
21 |    */
22 |   def mkFieldData(delim: Char, fields: Seq[String], iter: Iterator[String]): String => Seq[String] = {
23 |     if (iter.hasNext) {
24 |       val idx = csvHeaderToIndices(delim, fields, iter.next)
25 |       val reqLen = idx.max + 1
26 |       line =>
27 |         val d = line.toUpperCase.split(delim).toIndexedSeq.padTo(reqLen, "")
28 |         idx.map(d(_).trim)
29 |     } else _ => Seq.empty
30 |   }
31 | }


--------------------------------------------------------------------------------
/dataFusion-common/src/main/scala/au/csiro/data61/dataFusion/common/Data.scala:
--------------------------------------------------------------------------------
 1 | package au.csiro.data61.dataFusion.common
 2 | 
 3 | import spray.json.DefaultJsonProtocol
 4 | 
 5 | object Data {
 6 |   val T_PERSON = "PERSON"
 7 |   val T_PERSON2 = "PERSON2"           // PHits.typ for a search on just family & first given names (not using other)
 8 |   val T_ORGANIZATION = "ORGANIZATION" // Z is consistent with NER implementations
 9 |   
10 |   val GAZ = "D61GAZ"     // Ner.impl for search hits
11 |   val EMAIL = "D61EMAIL" // Ner.impl for names parsed from email headers
12 |       
13 |   /** pos{Str,End} are token indices
14 |    *  off{Str,End} are character offsets
15 |    *  {pos,off}Str is included, {pos,off}End is excluded (first token/char not included)
16 |    */
17 |   case class ExtRef(name: String,  ids: List[Long])
18 |   case class Ner(posStr: Int, posEnd: Int, offStr: Int, offEnd: Int, score: Double, text: String, typ: String, impl: String, extRef: Option[ExtRef])
19 | 
20 |   /** metadata key for language code e.g. "en" or "es" */
21 |   val META_LANG_CODE = "language-code"
22 |   val META_LANG_PROB = "language-prob"
23 |   val META_EN_SCORE = "english-score"
24 |   
25 |   case class Embedded(content: Option[String], meta: Map[String, String], ner: List[Ner])
26 |   case class Doc(id: Long, content: Option[String], meta: Map[String, String], path: String, ner: List[Ner], embedded: List[Embedded])
27 | 
28 |   // collection -> (weight, count)
29 |   type WeightMap = Map[String, (Double, Int)]
30 |   
31 |   // sourceNodeId, targetNodeId -> Scores
32 |   case class Node(nodeId: Int, extRef: ExtRef, score: Double, typ: String)
33 |   case class Edge(source: Int, target: Int, weights: WeightMap, typ: String)  
34 |   case class NodeEdgeCount(nodeId: Int, numEdges: Int)
35 | 
36 |   val EMB_IDX_MAIN = -1 // a searchable value for embIdx to represent main content - not embedded
37 |   case class IdEmbIdx(id: Long, embIdx: Int)
38 | 
39 |   case class Stats(totalHits: Int, elapsedSecs: Float)
40 |   case class PosInfo(posStr: Int, posEnd: Int, offStr: Int, offEnd: Int)
41 |   case class LPosDoc(idEmbIdx: IdEmbIdx, posInfos: List[PosInfo])
42 |   case class PHits(stats: Stats, hits: List[LPosDoc], error: Option[String], extRef: ExtRef, score: Double, typ: String)
43 |   
44 |   case class LDoc(idEmbIdx: IdEmbIdx, content: String, path: String)
45 |   case class LMeta(idEmbIdx: IdEmbIdx, key: String, `val`: String)
46 |   case class LNer(idEmbIdx: IdEmbIdx, posStr: Int, posEnd: Int, offStr: Int, offEnd: Int, text: String, typ: String, impl: String) 
47 |   
48 |   case class Query(query: String, numHits: Int)
49 |   case class DHits(stats: Stats, hits: List[(Float, LDoc)], error: Option[String])
50 |   case class MHits(stats: Stats, hits: List[(Float, LMeta)], error: Option[String])
51 |   case class NHits(stats: Stats, hits: List[(Float, LNer)], error: Option[String])
52 | 
53 |   case class PosQuery(extRef: ExtRef, typ: String)
54 |   case class PosMultiQuery(queries: List[PosQuery])
55 |   case class PMultiHits(pHits: List[PHits])
56 |       
57 |   object JsonProtocol extends DefaultJsonProtocol {
58 |     implicit val extRefFormat = jsonFormat2(ExtRef)
59 |     implicit val nerFormat = jsonFormat9(Ner)
60 |     implicit val embeddedFormat = jsonFormat3(Embedded)
61 |     implicit val docFormat = jsonFormat6(Doc)
62 |     
63 |     implicit val nodeFormat = jsonFormat4(Node)
64 |     implicit val edgeFormat = jsonFormat4(Edge)
65 |     implicit val clientEdgeCountFormat = jsonFormat2(NodeEdgeCount)
66 |     implicit val idEmbIdxCodec = jsonFormat2(IdEmbIdx)
67 |     
68 |     implicit val statsCodec = jsonFormat2(Stats)
69 |     implicit val posInfoCodec = jsonFormat4(PosInfo)
70 |     implicit val lposDocCodec = jsonFormat2(LPosDoc)
71 |     implicit val pHitsCodec = jsonFormat6(PHits)
72 |     
73 |     implicit val ldocCodec = jsonFormat3(LDoc)
74 |     implicit val lmetaCodec = jsonFormat3(LMeta)
75 |     implicit val lnerCodec = jsonFormat8(LNer)  
76 | 
77 |     implicit val queryCodec = jsonFormat2(Query)
78 |     implicit val dHitsCodec = jsonFormat3(DHits)
79 |     implicit val mHitsCodec = jsonFormat3(MHits)
80 |     implicit val nHitsCodec = jsonFormat3(NHits)
81 |     
82 |     implicit val posQueryCodec = jsonFormat2(PosQuery)
83 |     implicit val posMultiQueryCodec = jsonFormat1(PosMultiQuery)
84 |     implicit val pMultiHitsCodec = jsonFormat1(PMultiHits)
85 |   }
86 | }


--------------------------------------------------------------------------------
/dataFusion-common/src/main/scala/au/csiro/data61/dataFusion/common/EnglishScore.scala:
--------------------------------------------------------------------------------
 1 | package au.csiro.data61.dataFusion.common
 2 | 
 3 | import com.typesafe.scalalogging.Logger
 4 | 
 5 | object EnglishScore {
 6 |   private val log = Logger(getClass)
 7 |   
 8 |   case class Feat(wordLike: Boolean, initCap: Boolean, endsDot: Boolean)
 9 | 
10 |   // A metric for English text quality.
11 |   // Near enough is good enough, no need to handle voweless works like "sky" or apostrophes.  
12 |   
13 |   val word = """\S+""".r
14 |   val vowels = "AEIOUaeiou".toSet
15 |   val upper = ('A' to 'Z').toSet
16 |   val letter = upper ++ upper.map(Character.toLowerCase)
17 |   val punct = ",;:'\"!@#$%^&*()-_+=/[]{}.".toSet
18 | 
19 |   def word2feat(w: String): Feat = {
20 |     val numVowel = w.count(vowels contains _)
21 |     val numLetter = w.count(letter contains _)
22 |     val numUpper = w.count(upper contains _)
23 |     val startsPunct = punct contains w.head 
24 |     val endsPunct = punct contains w.last
25 |     val endsDot = w.endsWith(".")
26 |     val expectedLetters = w.length - (if (startsPunct) 1 else 0) - (if (endsPunct) 1 else 0)
27 |     val initCap = numUpper == 1 && (startsPunct && w.length > 1 && Character.isUpperCase(w(1))|| Character.isUpperCase(w.head))
28 |     val wordLike = w.length < 30 && numLetter == expectedLetters && (numUpper == 0 || initCap) && numVowel > 0
29 |     // log.debug(s"word2feat: numVowel = $numVowel, numLetter = $numLetter, numUpper = $numUpper, startsPunct = $startsPunct, endsPunct = $endsPunct, endsDot = $endsDot, initCap = $initCap, length = ${w.length}, expectedLetters = $expectedLetters, wordLike = $wordLike")
30 |     Feat(wordLike, initCap, endsDot)
31 |   }
32 | 
33 |   def englishScore(text: String): Double = {
34 |     val feats = word.findAllIn(text).map(word2feat).toSeq
35 |     val numWords = feats.count(_.wordLike)
36 |     val wordScore = numWords.toDouble / feats.size // ratio
37 |     
38 |     // unit test with text from wikipedia is getting a very low sentenceScore, so disabled for now
39 |     val numSentence = feats.sliding(2).count {
40 |       case Seq(a, b) => a.wordLike && a.endsDot && b.wordLike && b.initCap
41 |       case _ => false
42 |     }
43 |     val x = numWords.toDouble / numSentence // avgSentenceLength
44 |     // See http://hearle.nahoo.net/Academic/Maths/Sentence.html
45 |     // try piece-wise linear score
46 |     val sentenceScore = if (x < 10.0) 0.6 + 0.4 * x/10.0
47 |       else if (x < 30.0) 1.0
48 |       else if (x < 100.0) 1.0 - 0.8 * (x - 30.0)/70.0 
49 |       else 0.2
50 |     
51 |     log.debug(s"englishScore: numSentence = $numSentence, numWords = $numWords, wordScore = $wordScore, sentenceScore = $sentenceScore")
52 |     wordScore * sentenceScore
53 |   }
54 | }


--------------------------------------------------------------------------------
/dataFusion-common/src/main/scala/au/csiro/data61/dataFusion/common/Parallel.scala:
--------------------------------------------------------------------------------
 1 | package au.csiro.data61.dataFusion.common
 2 | 
 3 | import java.util.concurrent.ArrayBlockingQueue
 4 | 
 5 | import scala.util.{ Failure, Success, Try }
 6 | 
 7 | import com.typesafe.scalalogging.Logger
 8 | 
 9 | object Parallel {
10 |   private val log = Logger(getClass)
11 | 
12 |   /**
13 |    * One thread does `in`,
14 |    * One thread does `out`,
15 |    * `numWorkers` threads do `work`.
16 |    */
17 |   def doParallel[I, O](in: Iterator[I], work: I => O, out: O => Unit, inDone: I, outDone: O, numWorkers: Int) = {
18 |     val qFactor = 10
19 |     val qSize = numWorkers * qFactor
20 |     val iq = new ArrayBlockingQueue[I](qSize)
21 |     val oq = new ArrayBlockingQueue[Try[O]](qSize)
22 |     
23 |     val iThread = new Thread {
24 |       override def run = {
25 |         in.foreach(i => iq.put(i))
26 |         iq.put(inDone)
27 |       }
28 |     }
29 |     iThread.start
30 |     
31 |     val oThread = new Thread {
32 |       override def run = {
33 |         Iterator.continually(oq.take) takeWhile(_ != Success(outDone)) foreach { _ match {
34 |           case Success(o) => out(o)
35 |           case Failure(e) => log.error("worker exception", e)
36 |         } }
37 |       }
38 |     }
39 |     oThread.start
40 |     
41 |     val workers = (0 until numWorkers).map { i => new Thread {
42 |       override def run = {
43 |         Iterator.continually(iq.take) takeWhile(_ != inDone) foreach { i => oq.put(Try{ work(i) }) }
44 |         iq.put(inDone) // tell another worker
45 |       }        
46 |     } }
47 |     workers.foreach(_.start)
48 |     
49 |     iThread.join
50 |     log.debug("iThread done")
51 |     workers.foreach(_.join)
52 |     log.debug("workers done")
53 |     oq.put(Success(outDone))
54 |     oThread.join
55 |     log.debug("oThread done")
56 |   }
57 | 
58 | }


--------------------------------------------------------------------------------
/dataFusion-common/src/main/scala/au/csiro/data61/dataFusion/common/Timer.scala:
--------------------------------------------------------------------------------
 1 | package au.csiro.data61.dataFusion.common
 2 | 
 3 | import com.typesafe.scalalogging.Logger
 4 | 
 5 | /** Accumulate time since constructed or reset.
 6 |  *  
 7 |  *  Usage:
 8 |  *  {{{
 9 |  *  val t = Timer()
10 |  *  ...
11 |  *  t.stop
12 |  *  log.info(s"... took ${t.elapsedSecs} secs")
13 |  *  }}}
14 |  */
15 | class Timer {
16 |   private var t0 = 0L      // start of currently measured time period
17 |   private var elapsed = 0L // sum of previous time periods ended by stop/elapsedSecs
18 | 
19 |   reset
20 | 
21 |   def reset = {
22 |     elapsed = 0L
23 |     start
24 |   }
25 | 
26 |   /** `start` need not be used - used to discard (not accumulate) the time between `stop` and `start`. */
27 |   def start = t0 = System.currentTimeMillis
28 | 
29 |   def stop = elapsed += (System.currentTimeMillis - t0)
30 | 
31 |   /** Get accumulated seconds up to `stop` */
32 |   def elapsedSecs: Float = elapsed * 1e-3f
33 | }
34 | 
35 | object Timer {
36 |   
37 |   private lazy val log = Logger(getClass)
38 |   
39 |   def apply() = new Timer()
40 |   
41 |   /** Log elapsed time as info.
42 |    *  
43 |    *  Usage:
44 |    *  {{{
45 |    *  val a: A = timed("it took {} secs") {
46 |    *     ...
47 |    *     new A()
48 |    *  }
49 |    *  }}}
50 |    *  
51 |    *  @param msg contains "{}" which is replaced by the elapsed time in secs
52 |    *  @param action thunk to execute and time
53 |    */
54 |   def timed[T](msg: String)(action: => T) = {
55 |     val t = Timer()
56 |     val x = action
57 |     t.stop
58 |     log.info(msg, t.elapsedSecs.toString)
59 |     x
60 |   }
61 | }


--------------------------------------------------------------------------------
/dataFusion-common/src/main/scala/au/csiro/data61/dataFusion/common/Util.scala:
--------------------------------------------------------------------------------
 1 | package au.csiro.data61.dataFusion.common
 2 | 
 3 | import java.io.{ BufferedWriter, File, FileOutputStream, OutputStreamWriter }
 4 | import com.typesafe.scalalogging.Logger
 5 | import scala.collection.mutable.ListBuffer
 6 | 
 7 | object Util {
 8 |   private val log = Logger(getClass)
 9 |   
10 |   /** @return a BufferedWriter using UTF-8 encoding */
11 |   def bufWriter(f: File) = new BufferedWriter(new OutputStreamWriter(new FileOutputStream(f), "UTF-8"))
12 |   
13 |   /** Get a Scala singleton Object.
14 |     * @param fqn object's fully qualified name
15 |     * @return object as type T
16 |     */
17 |   def getObject[T](fqn: String): T = {
18 |     val m = scala.reflect.runtime.universe.runtimeMirror(getClass.getClassLoader)
19 |     m.reflectModule(m.staticModule(fqn)).instance.asInstanceOf[T]
20 |   }
21 |   
22 |   /**
23 |    * Modified from: https://stackoverflow.com/questions/5674741/simplest-way-to-get-the-top-n-elements-of-a-scala-iterable
24 |    * Well the simplest is sort.take(n), but for a large collection where n << the collection size, this is much more efficient!
25 |    */
26 |   def extremeN[T](n: Int, it: Iterator[T])(comp1: ((T, T) => Boolean), comp2: ((T, T) => Boolean)): List[T] = {
27 | 
28 |     def sortedIns (el: T, list: List[T]): List[T] = 
29 |       if (list.isEmpty) List (el) else 
30 |       if (comp2 (el, list.head)) el :: list else 
31 |         list.head :: sortedIns (el, list.tail)
32 |   
33 |     def updateSofar (sofar: List [T], el: T) : List [T] =
34 |       if (comp1 (el, sofar.head)) 
35 |         sortedIns (el, sofar.tail)
36 |       else sofar
37 | 
38 |     val initN = {
39 |       val buf = new ListBuffer[T]
40 |       for (_ <- 0 until n if it.hasNext) buf += it.next
41 |       buf.toList
42 |     }
43 |     if (initN.size > 1) (initN.sortWith(comp2) /: it) { updateSofar } else initN
44 |   }
45 |   
46 |   /** @return smallest n elements in descending order */
47 |   def bottom[T](n: Int, it: Iterator[T])(implicit ord: Ordering[T]): List[T] = extremeN(n, it)(ord.lt, ord.gt)
48 | 
49 |   /** @return largest n elements in ascending order */
50 |   def top[T](n: Int, it: Iterator[T])(implicit ord: Ordering[T]): List[T] = extremeN(n, it)(ord.gt, ord.lt)
51 | }


--------------------------------------------------------------------------------
/dataFusion-common/src/test/resources/logback-test.xml:
--------------------------------------------------------------------------------
 1 | <configuration>
 2 | 
 3 |   <appender name="CONS" class="ch.qos.logback.core.ConsoleAppender">
 4 |     <encoder>
 5 |       <pattern>%d{yyyy-MM-dd HH:mm:ss.SSS} [%thread] %-5level %logger{36} - %msg%n</pattern>
 6 |     </encoder>
 7 |   </appender>
 8 |   
 9 | <!--   <appender name="FILE" class="ch.qos.logback.core.FileAppender"> -->
10 | <!--     <file>common.log</file> -->
11 | <!--     <encoder> -->
12 | <!--       <pattern>%d{yyyy-MM-dd HH:mm:ss.SSS} [%thread] %-5level %logger{36} - %msg%n</pattern> -->
13 | <!--     </encoder> -->
14 | <!--   </appender> -->
15 | 
16 |   <logger name="au.csiro.data61" level="DEBUG" />
17 |   
18 |   <root level="INFO">
19 |     <appender-ref ref="CONS" />
20 |   </root>
21 | </configuration>
22 | 


--------------------------------------------------------------------------------
/dataFusion-common/src/test/scala/au/csiro/data61/dataFusion/common/ParallelTest.scala:
--------------------------------------------------------------------------------
 1 | package au.csiro.data61.dataFusion.common
 2 | 
 3 | import scala.collection.mutable.ListBuffer
 4 | 
 5 | import org.scalatest.{ FlatSpec, Matchers }
 6 | 
 7 | import com.typesafe.scalalogging.Logger
 8 | import scala.util.Success
 9 | 
10 | class ParallelTest extends FlatSpec with Matchers {
11 |   val log = Logger(getClass)
12 | 
13 |   "Threads" should "do stuff in parallel" in {
14 |     val l = ListBuffer[String]()
15 |     Parallel.doParallel(Iterator.range(0, 1000).map(_.toString), (s: String) => s, (s: String) => l += s, "done", "done", 4)
16 |     l.size should be(1000)
17 |     for {
18 |       (a, b) <- l.map(_.toInt).sortBy(identity).zipWithIndex
19 |     } a should be(b)
20 |   }
21 | }


--------------------------------------------------------------------------------
/dataFusion-common/src/test/scala/au/csiro/data61/dataFusion/common/UtilTest.scala:
--------------------------------------------------------------------------------
 1 | package au.csiro.data61.dataFusion.common
 2 | 
 3 | import com.typesafe.scalalogging.Logger
 4 | 
 5 | import org.scalatest.FlatSpec
 6 | import org.scalatest.Matchers
 7 | 
 8 | import com.typesafe.scalalogging.Logger
 9 | import Util._
10 | import scala.util.Random
11 | import Timer.timed
12 | 
13 | class UtilTest extends FlatSpec with Matchers {
14 |   val log = Logger(getClass)
15 |   
16 |   val data = Random.shuffle(1 to 1000000)
17 |   val n = 10
18 |   
19 |   "top" should "get top members quicker than sorting" in {
20 |     val t1 = Timer()
21 |     val topn = top(n, data.iterator)
22 |     t1.stop
23 |     log.debug(s"topn in ${t1.elapsedSecs} = $topn")
24 |     
25 |     val t2 = Timer()
26 |     val expected = data.sortBy(x => -x).take(n).toList
27 |     t2.stop
28 |     log.debug(s"sortBy.take(n) in ${t2.elapsedSecs}")
29 |     topn.reverse should be(expected)
30 |     assert(t1.elapsedSecs < t2.elapsedSecs)
31 |   }
32 |   
33 |   "bottom" should "get bottom members quicker than sorting" in {
34 |     val t1 = Timer()
35 |     val bottomn = bottom(n, data.iterator)
36 |     t1.stop
37 |     log.debug(s"bottomn in ${t1.elapsedSecs} = $bottomn")
38 |     
39 |     val t2 = Timer()
40 |     val expected = data.sorted.take(n).toList
41 |     t2.stop
42 |     log.debug(s"sort.take(n) in ${t2.elapsedSecs}")
43 |     bottomn.reverse should be(expected)
44 |     assert(t1.elapsedSecs < t2.elapsedSecs)
45 |   }
46 | }


--------------------------------------------------------------------------------
/dataFusion-db-service/README.md:
--------------------------------------------------------------------------------
 1 | # dataFusion-db-service
 2 | 
 3 | ## Introduction
 4 | 
 5 | This project provides [RESTful](https://en.wikipedia.org/wiki/Representational_state_transfer) web services based on [dataFusion-db](../dataFusion-db).
 6 | 
 7 | ## Build, Configuration, Running and Swagger Support
 8 | 
 9 | See the top level [README](../README.md).
10 | 
11 | 


--------------------------------------------------------------------------------
/dataFusion-db-service/build.sbt:
--------------------------------------------------------------------------------
 1 | name := "dataFusion-db-service"
 2 | 
 3 | libraryDependencies ++= Seq(
 4 |   "com.github.swagger-akka-http" %% "swagger-akka-http" % "0.9.1",
 5 |   "com.typesafe.akka" %% "akka-http-spray-json" % "10.0.7",
 6 |   "ch.megard" %% "akka-http-cors" % "0.2.1",
 7 |   "com.github.scopt" %% "scopt" % "3.5.0"
 8 | )
 9 | 
10 | com.github.retronym.SbtOneJar.oneJarSettings
11 | 
12 | mainClass in Compile := Some("au.csiro.data61.dataFusion.db.service.Main")
13 | 


--------------------------------------------------------------------------------
/dataFusion-db-service/src/main/resources/application.conf:
--------------------------------------------------------------------------------
1 | http = {
2 |   host = "0.0.0.0"
3 |   port = 8088
4 | 
5 |   host = ${?DB_HTTP_HOST}
6 |   port = ${?DB_HTTP_PORT}
7 | }


--------------------------------------------------------------------------------
/dataFusion-db-service/src/main/resources/logback.xml:
--------------------------------------------------------------------------------
 1 | <configuration>
 2 | 
 3 | <!--   <appender name="CONS" class="ch.qos.logback.core.ConsoleAppender"> -->
 4 | <!--     <encoder> -->
 5 | <!--       <pattern>%d{yyyy-MM-dd HH:mm:ss.SSS} [%thread] %-5level %logger{36} - %msg%n</pattern> -->
 6 | <!--     </encoder> -->
 7 | <!--   </appender> -->
 8 |   
 9 |   <appender name="FILE" class="ch.qos.logback.core.FileAppender">
10 |     <file>db-service.log</file>
11 |     <encoder>
12 |       <pattern>%d{yyyy-MM-dd HH:mm:ss.SSS} [%thread] %-5level %logger{36} - %msg%n</pattern>
13 |     </encoder>
14 |   </appender>
15 | 
16 | <!--   <logger name="au.csiro.data61" level="DEBUG" /> -->
17 |   
18 |   <root level="INFO">
19 |     <appender-ref ref="FILE" />
20 |   </root>
21 | </configuration>
22 | 


--------------------------------------------------------------------------------
/dataFusion-db-service/src/main/scala/au/csiro/data61/dataFusion/db/service/DbService.scala:
--------------------------------------------------------------------------------
  1 | package au.csiro.data61.dataFusion.db.service
  2 | 
  3 | import scala.concurrent.{ ExecutionContext, Future }
  4 | import scala.language.postfixOps
  5 | 
  6 | import com.typesafe.config.Config
  7 | import com.typesafe.scalalogging.Logger
  8 | 
  9 | import akka.http.scaladsl.marshallers.sprayjson.SprayJsonSupport._
 10 | import akka.http.scaladsl.marshalling.{ ToResponseMarshallable, ToResponseMarshaller }
 11 | import akka.http.scaladsl.model.{ StatusCode, StatusCodes }
 12 | import akka.http.scaladsl.server.Directives._
 13 | import au.csiro.data61.dataFusion.common.Data, Data.JsonProtocol._
 14 | import au.csiro.data61.dataFusion.common.Util
 15 | import au.csiro.data61.dataFusion.db.Tables, Tables._
 16 | import io.swagger.annotations.{ Api, ApiOperation, ApiResponse, ApiResponses }
 17 | import javax.ws.rs.{ Path, PathParam }
 18 | 
 19 | // deleted by Eclipse > Source > Organize Imports
 20 | // import io.swagger.annotations.ApiResponse
 21 | 
 22 | object DbService {
 23 |   
 24 |     // teach spray.json how to en/decode java.sql.Date
 25 | //    val longFormat = implicitly[JsonFormat[Long]]
 26 | //    implicit val sqlDateFormat = new JsonFormat[java.sql.Date] {
 27 | //      override def read(json: JsValue): java.sql.Date = new java.sql.Date(longFormat.read(json))
 28 | //      override def write(obj: java.sql.Date): JsValue = longFormat.write(obj.getTime)
 29 | //    }
 30 |     
 31 |   implicit val docRowCodec = jsonFormat4(DocRow)
 32 |   implicit val metaRowCodec = jsonFormat4(MetaRow)
 33 |   implicit val nerRowCodec = jsonFormat11(NerRow)
 34 | }
 35 | import DbService._
 36 | 
 37 | @Api(value = "db", description = "read-only access to dataFusion database", produces = "application/json")
 38 | @Path("")
 39 | class DbService(conf: Config)(implicit val executionContext: ExecutionContext) {
 40 |   private val log = Logger(getClass)
 41 |   
 42 |   val myTables = new Tables {
 43 |     val profile = Util.getObject[slick.jdbc.JdbcProfile](conf.getString("db.profile")) // e.g. slick.jdbc.H2Profile or slick.jdbc.PostgresProfile
 44 |   }
 45 |   import myTables._
 46 |   import myTables.profile.api._
 47 |   
 48 |   val db = Database.forConfig("db", conf)
 49 | 
 50 | 
 51 |   
 52 |   val qDocById = {
 53 |     def q(id: Rep[Long]) = Doc.filter(_.docId === id)
 54 |     Compiled(q _)
 55 |   }
 56 |   
 57 |   @Path("doc/{id}")
 58 |   @ApiOperation(httpMethod = "GET", response = classOf[Array[DocRow]], responseContainer = "List", value = "Main Doc and embedded Docs")
 59 |   def docById(@PathParam("id") id: Long): Future[Seq[DocRow]] = 
 60 |     db.run(qDocById(id).result)
 61 |       
 62 |   def docByIdRoute =
 63 |     get { path("doc" / LongNumber) { id => complete {
 64 |       docById(id)
 65 |     }}}
 66 | 
 67 |   
 68 |   
 69 |   val qMetaById = {
 70 |     def q(id: Rep[Long]) = Meta.filter(_.docId === id)
 71 |     Compiled(q _)
 72 |   }
 73 |   
 74 |   @Path("meta/{id}")
 75 |   @ApiOperation(httpMethod = "GET", response = classOf[Array[MetaRow]], responseContainer = "List", value = "Metadata for main Doc and embedded Docs")
 76 |   def metaById(@PathParam("id") id: Long): Future[Seq[MetaRow]] = 
 77 |     db.run(qMetaById(id).result)
 78 |       
 79 |   def metaByIdRoute =
 80 |     get { path("meta" / LongNumber) { id => complete {
 81 |       metaById(id)
 82 |     }}}
 83 | 
 84 |   
 85 |   
 86 |   val qNerById = {
 87 |     def q(id: Rep[Long]) = Ner.filter(_.docId === id)
 88 |     Compiled(q _)
 89 |   }
 90 |   
 91 |   @Path("ner/{id}")
 92 |   @ApiOperation(httpMethod = "GET", response = classOf[Array[NerRow]], responseContainer = "List", value = "Named Entities for main Doc and embedded Docs")
 93 |   def nerById(@PathParam("id") id: Long): Future[Seq[NerRow]] = 
 94 |     db.run(qNerById(id).result)
 95 |       
 96 |   def nerByIdRoute =
 97 |     get { path("ner" / LongNumber) { id => complete {
 98 |       nerById(id)
 99 |     }}}
100 | 
101 |   
102 |   
103 |   val qExtNameById = {
104 |     def q(id: Rep[Long]) = ExtName.filter(_.extNameId === id).map(_.name)
105 |     Compiled(q _)
106 |   }
107 |   val qExtNameLinkById = {
108 |     def q(id: Rep[Long]) = ExtNameLink.filter(_.extNameId === id).map(_.extRefId)
109 |     Compiled(q _)
110 |   }
111 |   
112 |   @Path("extRef/{extNameId}")
113 |   @ApiOperation(httpMethod = "GET", response = classOf[Data.ExtRef], value = "name and ids (from external system) associated with a Named Entity")
114 |   def extRefById(@PathParam("extNameId") id: Long) = {
115 |     val oRef = for {
116 |       onam <- db.run(qExtNameById(id).result.headOption)
117 |       ids <- db.run(qExtNameLinkById(id).result)
118 |     } yield onam.map(Data.ExtRef(_, ids.toList))
119 |     optOrElse(oRef, (StatusCodes.NotFound, ""))
120 |   }
121 | 
122 |   def extRefByIdRoute =
123 |     get { path("extRef" / LongNumber) { id => complete {
124 |       extRefById(id)
125 |   }}}
126 | 
127 |   
128 |   
129 |   /** if Some(a) marshall the a, else marshall the orElse */
130 |   def optOrElse[A](x: Future[Option[A]], orElse: => (StatusCode, String))(implicit m: ToResponseMarshaller[A]): ToResponseMarshallable =
131 |     x.map(_.map { s => ToResponseMarshallable(s) }.getOrElse(ToResponseMarshallable(orElse)) )
132 |         
133 |   val routes = docByIdRoute ~ metaByIdRoute ~ nerByIdRoute ~ extRefByIdRoute
134 | }
135 | 


--------------------------------------------------------------------------------
/dataFusion-db-service/src/main/scala/au/csiro/data61/dataFusion/db/service/Main.scala:
--------------------------------------------------------------------------------
 1 | package au.csiro.data61.dataFusion.db.service
 2 | 
 3 | import scala.language.postfixOps
 4 | import scala.reflect.runtime.universe.typeOf
 5 | 
 6 | import com.github.swagger.akka.{ HasActorSystem, SwaggerHttpService }
 7 | import com.typesafe.config.ConfigFactory
 8 | import com.typesafe.scalalogging.Logger
 9 | 
10 | import akka.actor.ActorSystem
11 | import akka.http.scaladsl.Http
12 | import akka.http.scaladsl.server.Directives._enhanceRouteWithConcatenation
13 | import akka.stream.ActorMaterializer
14 | import ch.megard.akka.http.cors.scaladsl.CorsDirectives.cors
15 | import com.typesafe.config.ConfigValueFactory
16 | 
17 | object Main {
18 |   private val log = Logger(getClass)
19 |   
20 |   val conf = ConfigFactory.load
21 |   def cg(k: String) = conf.getString(k)
22 |   def cgi(k: String) = conf.getInt(k)
23 |   
24 |   case class CliOption(host: String, port: Int, dburl: String, profile: String, driver: String, user: String, password: String)
25 |   val defaultCliOption = CliOption(cg("http.host"), cgi("http.port"), cg("db.url"), cg("db.profile"), cg("db.driver"), cg("db.properties.user"), cg("db.properties.password"))
26 | 
27 |   implicit val system = ActorSystem("dbActorSystem")
28 |   implicit val exec = system.dispatcher
29 |   implicit val materializer = ActorMaterializer()
30 |   
31 |   def main(args: Array[String]): Unit = {
32 |      val parser = new scopt.OptionParser[CliOption]("db-service") {
33 |       head("db-service", "0.x")
34 |       note("web services for read-only access to datafusion database")
35 |       opt[String]("host") action { (v, c) =>
36 |         c.copy(dburl = v)
37 |       } text (s"web service network interface/host/IP address, default ${defaultCliOption.host}")
38 |       opt[Int]("port") action { (v, c) =>
39 |         c.copy(port = v)
40 |       } text (s"web service TCP port, default ${defaultCliOption.port}")
41 |       opt[String]("dburl") action { (v, c) =>
42 |         c.copy(dburl = v)
43 |       } text (s"JDBC database URL, default ${defaultCliOption.dburl}")
44 |       opt[String]("profile") action { (v, c) =>
45 |         c.copy(profile = v)
46 |       } text (s"full class name of Slick profile, default ${defaultCliOption.profile}")
47 |       opt[String]("driver") action { (v, c) =>
48 |         c.copy(driver = v)
49 |       } text (s"full class name of JDBC driver, default ${defaultCliOption.driver}")
50 |       opt[String]("user") action { (v, c) =>
51 |         c.copy(user = v)
52 |       } text (s"database username, default ${defaultCliOption.user}")
53 |       opt[String]("password") action { (v, c) =>
54 |         c.copy(password = v)
55 |       } text (s"database user password, default ${defaultCliOption.password}")
56 |       help("help") text ("prints this usage text")
57 |     }
58 |     for (c <- parser.parse(args, defaultCliOption)) {
59 |       log.info(s"CliOption: $c}")
60 |       val conf2 = conf.withValue("db.url", ConfigValueFactory.fromAnyRef(c.dburl)) // CliOption overrides
61 |         .withValue("db.driver", ConfigValueFactory.fromAnyRef(c.driver))
62 |         .withValue("db.properties.user", ConfigValueFactory.fromAnyRef(c.user))
63 |         .withValue("db.properties.password", ConfigValueFactory.fromAnyRef(c.password))
64 |       val dbService = new DbService(conf2)
65 |       val routes = cors() {
66 |         dbService.routes ~ 
67 |         swaggerService.routes
68 |       }
69 |       Http().bindAndHandle(routes, c.host, c.port)
70 |       log.info(s"""starting server at: http://${c.host}:${c.port}
71 | Test with:
72 |   curl --header 'Content-Type: application/json' http://${c.host}:${c.port}/api-docs/swagger.json
73 |   curl --header 'Content-Type: application/json' http://${c.host}:${c.port}/tikaMain/1
74 | """)
75 |     }
76 |   }
77 |   
78 |   def swaggerService(implicit s: ActorSystem, m: ActorMaterializer) = new SwaggerHttpService with HasActorSystem {
79 |     override implicit val actorSystem = s
80 |     override implicit val materializer = m
81 |     override val apiTypes = Seq(typeOf[DbService])
82 |     override def swaggerConfig = new io.swagger.models.Swagger().basePath(prependSlashIfNecessary(basePath)) // don't specify protocol://host basePath
83 | //    override val host = s"${hst}:${prt}" // the url of your api, not swagger's json endpoint
84 | //    override val basePath = "/"          // the basePath for the API you are exposing
85 |     override val info = new io.swagger.models.Info()                    // provides license and other description details
86 |     override val apiDocsPath = "api-docs"   // http://host:port/api-docs/swagger.json
87 |   }
88 |   
89 | }


--------------------------------------------------------------------------------
/dataFusion-db-service/src/test/resources/logback-test.xml:
--------------------------------------------------------------------------------
 1 | <configuration>
 2 | 
 3 |   <appender name="CONS" class="ch.qos.logback.core.ConsoleAppender">
 4 |     <encoder>
 5 |       <pattern>%d{yyyy-MM-dd HH:mm:ss.SSS} [%thread] %-5level %logger{36} - %msg%n</pattern>
 6 |     </encoder>
 7 |   </appender>
 8 |   
 9 | <!--   <appender name="FILE" class="ch.qos.logback.core.FileAppender"> -->
10 | <!--     <file>db-service-test.log</file> -->
11 | <!--     <encoder> -->
12 | <!--       <pattern>%d{yyyy-MM-dd HH:mm:ss.SSS} [%thread] %-5level %logger{36} - %msg%n</pattern> -->
13 | <!--     </encoder> -->
14 | <!--   </appender> -->
15 | 
16 |   <logger name="au.csiro.data61" level="DEBUG" />
17 |   
18 |   <root level="INFO">
19 |     <appender-ref ref="CONS" />
20 |   </root>
21 | </configuration>
22 | 


--------------------------------------------------------------------------------
/dataFusion-db/3rd-party-licenses.md:
--------------------------------------------------------------------------------
 1 | # datafusion-db-licenses
 2 | 
 3 | Category | License | Dependency | Notes
 4 | --- | --- | --- | ---
 5 | Apache | [Apache 2](http://www.apache.org/licenses/LICENSE-2.0.txt) | io.spray # spray-json_2.12 # 1.3.3 | <notextile></notextile>
 6 | Apache | [Apache 2.0 License](http://www.apache.org/licenses/LICENSE-2.0.html) | com.typesafe.scala-logging # scala-logging_2.12 # 3.5.0 | <notextile></notextile>
 7 | Apache | [Apache License, Version 2.0](http://www.apache.org/licenses/LICENSE-2.0) | com.typesafe # config # 1.3.1 | <notextile></notextile>
 8 | Apache | [The Apache Software License, Version 2.0](http://www.apache.org/licenses/LICENSE-2.0.txt) | com.zaxxer # HikariCP # 2.5.1 | <notextile></notextile>
 9 | BSD | [BSD 3-Clause](http://www.scala-lang.org/license.html) | org.scala-lang # scala-library # 2.12.2 | <notextile></notextile>
10 | BSD | [BSD 3-Clause](http://www.scala-lang.org/license.html) | org.scala-lang # scala-reflect # 2.12.2 | <notextile></notextile>
11 | BSD | [BSD-2-Clause](https://jdbc.postgresql.org/about/license.html) | org.postgresql # postgresql # 42.1.4 | <notextile></notextile>
12 | BSD | [BSD-Style](http://www.opensource.org/licenses/bsd-license.php) | com.jsuereth # scala-arm_2.12 # 2.0 | <notextile></notextile>
13 | BSD | [Two-clause BSD-style license](http://github.com/slick/slick/blob/master/LICENSE.txt) | com.typesafe.slick # slick-codegen_2.12 # 3.2.1 | <notextile></notextile>
14 | BSD | [Two-clause BSD-style license](http://github.com/slick/slick/blob/master/LICENSE.txt) | com.typesafe.slick # slick-hikaricp_2.12 # 3.2.1 | <notextile></notextile>
15 | BSD | [Two-clause BSD-style license](http://github.com/slick/slick/blob/master/LICENSE.txt) | com.typesafe.slick # slick_2.12 # 3.2.1 | <notextile></notextile>
16 | CC0 | [CC0](http://creativecommons.org/publicdomain/zero/1.0/) | org.reactivestreams # reactive-streams # 1.0.0 | <notextile></notextile>
17 | GPL | [GPL](https://www.gnu.org/licenses/gpl-3.0.en.html) | au.csiro.data61 # datafusion-common_2.12 # 1.1-SNAPSHOT | <notextile></notextile>
18 | LGPL | [EPL + GNU Lesser General Public License](http://logback.qos.ch/license.html) | ch.qos.logback # logback-classic # 1.2.3 | <notextile></notextile>
19 | LGPL | [EPL + GNU Lesser General Public License](http://logback.qos.ch/license.html) | ch.qos.logback # logback-core # 1.2.3 | <notextile></notextile>
20 | MIT | [MIT License](http://www.opensource.org/licenses/mit-license.php) | com.github.scopt # scopt_2.12 # 3.7.0 | <notextile></notextile>
21 | MIT | [MIT License](http://www.slf4j.org/license.html) | org.slf4j # slf4j-api # 1.7.25 | <notextile></notextile>
22 | Mozilla | [MPL 2.0 or EPL 1.0](http://h2database.com/html/license.html) | com.h2database # h2 # 1.4.196 | <notextile></notextile>
23 | 
24 | 


--------------------------------------------------------------------------------
/dataFusion-db/README.md:
--------------------------------------------------------------------------------
 1 | # dataFusion-db
 2 | 
 3 | ## Introduction
 4 | This project provides [Slick](http://slick.lightbend.com/) bindings for a database schema to store the same data as the [Document JSON format](../dataFusion-common#document-json-format). It will work with any Slick supported database including H2, Postgres and Oracle.
 5 | Functions provided are:
 6 | - Create Slick bindings (Tables.scala) from an existing database schema. This is recommended for major schema changes.
 7 | - Drop the schema.
 8 | - Create the schema.
 9 | - Populate tables from a [Document JSON format](../dataFusion-common#document-json-format) file.
10 | 
11 | Querying is provided by the [dataFusion-db-service](../dataFusion-db-service) web service.
12 | 
13 | ## Build, Configuration and Running
14 | 
15 | See the top level [README](../README.md).
16 | 
17 | 


--------------------------------------------------------------------------------
/dataFusion-db/build.sbt:
--------------------------------------------------------------------------------
 1 | name := "dataFusion-db"
 2 | 
 3 | libraryDependencies ++= Seq(
 4 |   "slick",
 5 |   "slick-hikaricp",
 6 |   "slick-codegen"
 7 | ).map("com.typesafe.slick" %% _ % "3.2.1")
 8 | 
 9 | libraryDependencies ++= Seq(
10 |   "org.postgresql" % "postgresql" % "42.1.4",
11 |   "com.h2database" % "h2" % "1.4.196",
12 |   "com.typesafe" % "config" % "1.3.1",
13 |   "com.github.scopt" %% "scopt" % "3.7.0",
14 |   "com.jsuereth" %% "scala-arm" % "2.0"
15 |   
16 | )
17 | 
18 | com.github.retronym.SbtOneJar.oneJarSettings
19 | 
20 | mainClass in Compile := Some("au.csiro.data61.dataFusion.db.Main")
21 | 


--------------------------------------------------------------------------------
/dataFusion-db/src/main/resources/application.conf:
--------------------------------------------------------------------------------
 1 | db = {
 2 |   // H2
 3 |   url = "jdbc:h2:./datafusion"
 4 |   profile = "slick.jdbc.H2Profile"
 5 |   driver = "org.h2.Driver"        
 6 | 
 7 |   // Postgres
 8 |   // url = "jdbc:postgresql:datafusion"
 9 |   // profile = "slick.jdbc.PostgresProfile"
10 |   // driver = "org.postgresql.Driver"
11 | 
12 |   // Environment variables
13 |   url = ${?DB_URL}
14 |   profile = ${?DB_PROFILE}
15 |   driver = ${?DB_DRIVER}
16 | 
17 |   connectionPool = "HikariCP"
18 |   queueSize = 100
19 |   keepAliveConnection = true
20 |     
21 |   properties = {
22 |     user = "dfus"
23 |     password = "dfus"
24 |     
25 |     user = ${?DB_USER}
26 |     password = ${?DB_PASS}
27 |   }
28 | }
29 | 


--------------------------------------------------------------------------------
/dataFusion-db/src/main/resources/logback.xml:
--------------------------------------------------------------------------------
 1 | <configuration>
 2 | 
 3 |   <appender name="CONS" class="ch.qos.logback.core.ConsoleAppender">
 4 |     <encoder>
 5 |       <pattern>%d{yyyy-MM-dd HH:mm:ss.SSS} [%thread] %-5level %logger{36} - %msg%n</pattern>
 6 |     </encoder>
 7 |   </appender>
 8 |   
 9 |   <appender name="FILE" class="ch.qos.logback.core.FileAppender">
10 |     <file>db.log</file>
11 |     <encoder>
12 |       <pattern>%d{yyyy-MM-dd HH:mm:ss.SSS} [%thread] %-5level %logger{36} - %msg%n</pattern>
13 |     </encoder>
14 |   </appender>
15 | 
16 | <!--   <logger name="au.csiro.data61" level="DEBUG" /> -->
17 |   
18 |   <root level="INFO">
19 |     <appender-ref ref="FILE" />
20 |   </root>
21 | </configuration>
22 | 


--------------------------------------------------------------------------------
/dataFusion-db/src/test/resources/logback-test.xml:
--------------------------------------------------------------------------------
 1 | <configuration>
 2 | 
 3 |   <appender name="CONS" class="ch.qos.logback.core.ConsoleAppender">
 4 |     <encoder>
 5 |       <pattern>%d{yyyy-MM-dd HH:mm:ss.SSS} [%thread] %-5level %logger{36} - %msg%n</pattern>
 6 |     </encoder>
 7 |   </appender>
 8 |   
 9 | <!--   <appender name="FILE" class="ch.qos.logback.core.FileAppender"> -->
10 | <!--     <file>db-test.log</file> -->
11 | <!--     <encoder> -->
12 | <!--       <pattern>%d{yyyy-MM-dd HH:mm:ss.SSS} [%thread] %-5level %logger{36} - %msg%n</pattern> -->
13 | <!--     </encoder> -->
14 | <!--   </appender> -->
15 | 
16 |   <logger name="au.csiro.data61" level="DEBUG" />
17 |   
18 |   <root level="INFO">
19 |     <appender-ref ref="CONS" />
20 |   </root>
21 | </configuration>
22 | 


--------------------------------------------------------------------------------
/dataFusion-graph-service/README.md:
--------------------------------------------------------------------------------
 1 | # dataFusion-graph-service
 2 | 
 3 | ## Introduction
 4 | 
 5 | [RESTful](https://en.wikipedia.org/wiki/Representational_state_transfer) web services providing access to a network graph of related entities.
 6 | The global network is loaded from files specified in [configuration](../README.md#configuration) on startup.
 7 | 
 8 | ## Build, Configuration, Running and Swagger Support
 9 | 
10 | See the top level [README](../README.md).
11 | 
12 | 


--------------------------------------------------------------------------------
/dataFusion-graph-service/build.sbt:
--------------------------------------------------------------------------------
 1 | name := "dataFusion-graph-service"
 2 | 
 3 | libraryDependencies ++= Seq(
 4 |   "com.github.swagger-akka-http" %% "swagger-akka-http" % "0.9.1",
 5 |   "com.typesafe.akka" %% "akka-http-spray-json" % "10.0.7",
 6 |   "com.google.guava" % "guava" % "23.4-jre",
 7 |   "ch.megard" %% "akka-http-cors" % "0.2.1",
 8 |   "com.github.scopt" %% "scopt" % "3.5.0",
 9 |   // "com.jsuereth" %% "scala-arm" % "2.0",
10 |   "org.scalatest" %% "scalatest" % "3.0.0" % "test"
11 | )
12 |   
13 | com.github.retronym.SbtOneJar.oneJarSettings
14 | 
15 | mainClass in Compile := Some("au.csiro.data61.dataFusion.graph.service.Main")
16 | 


--------------------------------------------------------------------------------
/dataFusion-graph-service/src/main/resources/application.conf:
--------------------------------------------------------------------------------
 1 | graph = {
 2 |   cacheSize = 20
 3 |   nodePath = "proximity-node.json"
 4 |   edgePath = "proximity-edge.json"
 5 | 
 6 |   cacheSize = ${?GRAPH_CACHE_SIZE}
 7 |   nodePath = ${?GRAPH_NODE_PATH}
 8 |   edgePath = ${?GRAPH_EDGE_PATH}
 9 | }
10 | 
11 | http = {
12 |   host = "0.0.0.0"
13 |   port = 8089
14 | 
15 |   host = ${?GRAPH_HTTP_HOST}
16 |   port = ${?GRAPH_HTTP_PORT}
17 | }


--------------------------------------------------------------------------------
/dataFusion-graph-service/src/main/resources/logback.xml:
--------------------------------------------------------------------------------
 1 | <configuration>
 2 | 
 3 | <!--   <appender name="CONS" class="ch.qos.logback.core.ConsoleAppender"> -->
 4 | <!--     <encoder> -->
 5 | <!--       <pattern>%d{yyyy-MM-dd HH:mm:ss.SSS} [%thread] %-5level %logger{36} - %msg%n</pattern> -->
 6 | <!--     </encoder> -->
 7 | <!--   </appender> -->
 8 |   
 9 |   <appender name="FILE" class="ch.qos.logback.core.FileAppender">
10 |     <file>graph-service.log</file>
11 |     <encoder>
12 |       <pattern>%d{yyyy-MM-dd HH:mm:ss.SSS} [%thread] %-5level %logger{36} - %msg%n</pattern>
13 |     </encoder>
14 |   </appender>
15 | 
16 | <!--   <logger name="au.csiro.data61" level="DEBUG" /> -->
17 |   
18 |   <root level="INFO">
19 |     <appender-ref ref="FILE" />
20 |   </root>
21 | </configuration>
22 | 


--------------------------------------------------------------------------------
/dataFusion-graph-service/src/test/resources/edge.json:
--------------------------------------------------------------------------------
 1 | {"source":1,"target":2,"typ":"D61GAZ","weights":{"collectionA":[1.0,1]}}
 2 | {"source":1,"target":3,"typ":"D61GAZ","weights":{"collectionA":[1.0,1]}}
 3 | {"source":2,"target":3,"typ":"D61GAZ","weights":{"collectionA":[1.0,1]}}
 4 | {"source":3,"target":4,"typ":"D61GAZ","weights":{"collectionA":[1.0,1]}}
 5 | {"source":2,"target":5,"typ":"D61GAZ","weights":{"collectionB":[1.0,1]}}
 6 | {"source":3,"target":5,"typ":"D61GAZ","weights":{"collectionB":[1.0,1]}}
 7 | {"source":1,"target":6,"typ":"D61GAZ","weights":{"collectionB":[1.0,1]}}
 8 | {"source":2,"target":6,"typ":"D61GAZ","weights":{"collectionB":[1.0,1]}}
 9 | {"source":4,"target":6,"typ":"D61GAZ","weights":{"collectionA":[1.0,1]}}
10 | {"source":5,"target":6,"typ":"D61GAZ","weights":{"collectionB":[1.0,1]}}
11 | 


--------------------------------------------------------------------------------
/dataFusion-graph-service/src/test/resources/logback-test.xml:
--------------------------------------------------------------------------------
 1 | <configuration>
 2 | 
 3 |   <appender name="CONS" class="ch.qos.logback.core.ConsoleAppender">
 4 |     <encoder>
 5 |       <pattern>%d{yyyy-MM-dd HH:mm:ss.SSS} [%thread] %-5level %logger{36} - %msg%n</pattern>
 6 |     </encoder>
 7 |   </appender>
 8 |   
 9 | <!--   <appender name="FILE" class="ch.qos.logback.core.FileAppender"> -->
10 | <!--     <file>graph-service-test.log</file> -->
11 | <!--     <encoder> -->
12 | <!--       <pattern>%d{yyyy-MM-dd HH:mm:ss.SSS} [%thread] %-5level %logger{36} - %msg%n</pattern> -->
13 | <!--     </encoder> -->
14 | <!--   </appender> -->
15 | 
16 |   <logger name="au.csiro.data61" level="DEBUG" />
17 |   
18 |   <root level="INFO">
19 |     <appender-ref ref="CONS" />
20 |   </root>
21 | </configuration>
22 | 


--------------------------------------------------------------------------------
/dataFusion-graph-service/src/test/resources/node.json:
--------------------------------------------------------------------------------
1 | {"nodeId":1,"extRef":{"ids":[211,212],"name":"fred"},"typ":"PERSON","score":1}
2 | {"nodeId":2,"extRef":{"ids":[213,214],"name":"fred"},"typ":"ORGANIZATION","score":1}
3 | {"nodeId":3,"extRef":{"ids":[223,224],"name":"fred"},"typ":"PERSON","score":1}
4 | {"nodeId":4,"extRef":{"ids":[234,235],"name":"fred"},"typ":"PERSON","score":1}
5 | {"nodeId":5,"extRef":{"ids":[211,212],"name":"fred"},"typ":"PERSON2","score":1}
6 | {"nodeId":6,"extRef":{"ids":[223,224],"name":"fred"},"typ":"TO","score":1}
7 | 


--------------------------------------------------------------------------------
/dataFusion-graph-service/src/test/scala/au/csiro/data61/dataFusion/graph/service/MainTest.scala:
--------------------------------------------------------------------------------
 1 | package au.csiro.data61.dataFusion.graph.service
 2 | 
 3 | import org.scalatest.{ FlatSpec, Matchers }
 4 | 
 5 | import com.typesafe.scalalogging.Logger
 6 | 
 7 | import Main._
 8 | import scala.io.Source
 9 | import scala.io.Codec
10 | 
11 | class MainTest extends FlatSpec with Matchers {
12 |   val log = Logger(getClass)
13 |   implicit val codec = Codec.UTF8
14 |   
15 |   def getSource(resourcePath: String) = Source.fromInputStream(getClass.getClassLoader.getResourceAsStream(resourcePath))
16 |   val gs = new GraphService(0, getSource("node.json"), getSource("edge.json"))
17 | 
18 |   "graph" should "provide local network" in {
19 |     val g = gs.graph(GraphQuery(true, 0.0, None, None, Some(224), 2, 20))
20 |     log.debug(s"g = $g")
21 |     g.nodes.map(_.nodeId).toSet should be(Set(1, 2, 3, 4, 5, 6))
22 |     g.edges.map(e => (e.source, e.target)).toSet should be(Set((2,5), (3,4), (1,6), (3,5), (4,6), (2,6), (1,3), (2,3), (1,2), (5,6)))
23 |   }
24 |   
25 |   it should "filter PERSON2|EMAIL nodes" in {
26 |     val g = gs.graph(GraphQuery(false, 0.0, None, None, Some(224), 2, 20))
27 |     log.debug(s"g = $g")
28 |     g.nodes.map(_.nodeId).toSet should be(Set(1, 2, 3, 4))
29 |     g.edges.map(e => (e.source, e.target)).toSet should be(Set((2,3), (3,4), (1,2), (1,3)))
30 |   }
31 |   
32 | }


--------------------------------------------------------------------------------
/dataFusion-ner-service/README.md:
--------------------------------------------------------------------------------
 1 | # dataFusion-ner-service
 2 | 
 3 | ## Introduction
 4 | 
 5 | This project provides [RESTful](https://en.wikipedia.org/wiki/Representational_state_transfer) web services based on dataFusion-ner.
 6 | 
 7 | ## Build, Configuration and Running
 8 | 
 9 | This is mostly covered by the top level [README](../README.md), however note the particular requirements of [dataFusion-ner](../dataFusion-ner).
10 | 


--------------------------------------------------------------------------------
/dataFusion-ner-service/build.sbt:
--------------------------------------------------------------------------------
 1 | name := "dataFusion-ner-service"
 2 | 
 3 | libraryDependencies ++= Seq(
 4 |   "com.github.swagger-akka-http" %% "swagger-akka-http" % "0.9.1",
 5 |   "com.typesafe.akka" %% "akka-http-spray-json" % "10.0.7",
 6 |   "ch.megard" %% "akka-http-cors" % "0.2.1",
 7 |   
 8 |   "com.github.scopt" %% "scopt" % "3.7.0",
 9 |   "com.jsuereth" %% "scala-arm" % "2.0",
10 |   "org.scalatest" %% "scalatest" % "3.0.4" % "test"
11 | )
12 | 
13 | com.github.retronym.SbtOneJar.oneJarSettings
14 | 
15 | mainClass in Compile := Some("au.csiro.data61.dataFusion.ner.service.Main")
16 | 


--------------------------------------------------------------------------------
/dataFusion-ner-service/src/main/resources/application.conf:
--------------------------------------------------------------------------------
1 | http = {
2 |   host = "0.0.0.0"
3 |   port = 8086
4 | 
5 |   host = ${?NER_HTTP_HOST}
6 |   port = ${?NER_HTTP_PORT}
7 | }


--------------------------------------------------------------------------------
/dataFusion-ner-service/src/main/resources/logback.xml:
--------------------------------------------------------------------------------
 1 | <configuration>
 2 | 
 3 |   <appender name="CONS" class="ch.qos.logback.core.ConsoleAppender">
 4 |     <encoder>
 5 |       <pattern>%d{yyyy-MM-dd HH:mm:ss.SSS} [%thread] %-5level %logger{36} - %msg%n</pattern>
 6 |     </encoder>
 7 |   </appender>
 8 |   
 9 |   <appender name="FILE" class="ch.qos.logback.core.FileAppender">
10 |     <file>ner-server.log</file>
11 |     <encoder>
12 |       <pattern>%d{yyyy-MM-dd HH:mm:ss.SSS} [%thread] %-5level %logger{36} - %msg%n</pattern>
13 |     </encoder>
14 |   </appender>
15 | 
16 | <!--   <logger name="au.csiro.data61" level="DEBUG" /> -->
17 |   
18 |   <root level="INFO">
19 |     <appender-ref ref="FILE" />
20 |   </root>
21 | </configuration>
22 | 


--------------------------------------------------------------------------------
/dataFusion-ner-service/src/main/scala/au/csiro/data61/dataFusion/ner/service/Main.scala:
--------------------------------------------------------------------------------
  1 | package au.csiro.data61.dataFusion.ner.service
  2 | 
  3 | import scala.language.postfixOps
  4 | import scala.reflect.runtime.universe.typeOf
  5 | 
  6 | import com.github.swagger.akka.{ HasActorSystem, SwaggerHttpService }
  7 | import com.typesafe.config.ConfigFactory
  8 | import com.typesafe.scalalogging.Logger
  9 | 
 10 | import akka.actor.ActorSystem
 11 | import akka.http.scaladsl.Http
 12 | import akka.http.scaladsl.marshallers.sprayjson.SprayJsonSupport._
 13 | import akka.http.scaladsl.server.Directives._
 14 | import akka.stream.ActorMaterializer
 15 | import au.csiro.data61.dataFusion.common.Data.Doc
 16 | import au.csiro.data61.dataFusion.common.Data.JsonProtocol.docFormat
 17 | import au.csiro.data61.dataFusion.ner.Main.{ CliOption, Impl, defaultCliOption }
 18 | import ch.megard.akka.http.cors.scaladsl.CorsDirectives.cors
 19 | import io.swagger.annotations.{ Api, ApiOperation }
 20 | import javax.ws.rs.{ Consumes, Path }
 21 | import javax.ws.rs.core.MediaType
 22 | import spray.json.DefaultJsonProtocol._
 23 | 
 24 | object Main {
 25 |   val log = Logger(getClass)
 26 |   
 27 |   case class Docs(docs: List[Doc])
 28 |   
 29 |   object JsonProtocol {
 30 |     implicit val docCodec = jsonFormat1(Docs)
 31 |   }
 32 |   import JsonProtocol._
 33 |     
 34 |   
 35 |   @Api(value = "ner", description = "ner service", produces = "application/json")
 36 |   @Path("")
 37 |   class NerService(impl: Impl)  {
 38 | 
 39 |     @Path("ner")
 40 |     @ApiOperation(httpMethod = "POST", response = classOf[Doc], value = "input augmented with Named Entities")
 41 |     @Consumes(Array(MediaType.APPLICATION_JSON))
 42 |     def ner(d: Doc): Doc = impl.langNer(d)
 43 |         
 44 |     def nerRoute =
 45 |       post { path("langNer") { entity(as[Doc]) { in => complete {
 46 |         ner(in)
 47 |       }}}}
 48 |     
 49 |     // ----------------------------------------------------------
 50 |   
 51 |     @Path("nerMulti")
 52 |     @ApiOperation(httpMethod = "POST", response = classOf[Docs], value = "input augmented with Named Entities")
 53 |     @Consumes(Array(MediaType.APPLICATION_JSON))
 54 |     def nerMulti(d: Docs) = Docs(d.docs.map(impl.langNer))
 55 |         
 56 |     def nerMultiRoute =
 57 |       post { path("langNerMulti") { entity(as[Docs]) { in => complete {
 58 |         nerMulti(in)
 59 |       }}}}
 60 |   
 61 |     // ----------------------------------------------------------
 62 |   
 63 |     val routes = nerRoute ~ nerMultiRoute
 64 |   }
 65 |   
 66 |   def swaggerService(hst: String, prt: Int)(implicit s: ActorSystem, m: ActorMaterializer) = new SwaggerHttpService with HasActorSystem {
 67 |     override implicit val actorSystem = s
 68 |     override implicit val materializer = m
 69 |     override val apiTypes = Seq(typeOf[NerService])
 70 |     override def swaggerConfig = new io.swagger.models.Swagger().basePath(prependSlashIfNecessary(basePath)) // don't specify protocol://host basePath
 71 | //    override val host = s"${hst}:${prt}" // the url of your api, not swagger's json endpoint
 72 | //    override val basePath = "/"          // the basePath for the API you are exposing
 73 |     override val info = new com.github.swagger.akka.model.Info()                    // provides license and other description details
 74 |     override val apiDocsPath = "api-docs"   // http://host:port/api-docs/swagger.json
 75 |   }
 76 |   
 77 |   def start(impl: Impl) = {
 78 |     val conf = ConfigFactory.load
 79 |     val host = conf.getString("http.host")
 80 |     val port = conf.getInt("http.port")
 81 |     
 82 |     implicit val system = ActorSystem("nerActorSystem")
 83 |     implicit val exec = system.dispatcher
 84 |     implicit val materializer = ActorMaterializer()
 85 |     
 86 |     val routes = cors() {
 87 |       new NerService(impl).routes ~ 
 88 |       swaggerService(host, port).routes
 89 |     }
 90 |     
 91 |     Http().bindAndHandle(routes, host, port)
 92 |     log.info(s"""starting server at: http://${host}:${port}
 93 | Test with:
 94 |   curl --header 'Content-Type: application/json' http://${host}:${port}/api-docs/swagger.json
 95 | """)
 96 |   }
 97 | 
 98 |   def main(args: Array[String]): Unit = {
 99 |     val parser = new scopt.OptionParser[CliOption]("dataFusion-ner-service") {
100 |       head("dataFusion-ner-service", "0.x")
101 |       note("Named Entity Recognition web service.")
102 |       opt[Boolean]('c', "corenlp") action { (v, c) =>
103 |         c.copy(corenlp = v)
104 |       } text (s"Use CoreNLP (default ${defaultCliOption.corenlp})")
105 |       opt[Boolean]('o', "opennlp") action { (v, c) =>
106 |         c.copy(opennlp = v)
107 |       } text (s"Use OpenNLP (default ${defaultCliOption.opennlp})")
108 |       opt[Boolean]('m', "mitie") action { (v, c) =>
109 |         c.copy(mitie = v)
110 |       } text (s"Use MITIE (default ${defaultCliOption.mitie})")
111 |       help("help") text ("prints this usage text")
112 |     }
113 |     
114 |     for (c <- parser.parse(args, defaultCliOption)) {
115 |       import scala.concurrent.ExecutionContext.Implicits.global // for Impl parallel initialization
116 |       
117 |       val impl = new Impl(c)
118 |       start(impl)
119 |     }
120 |   }
121 |   
122 | }


--------------------------------------------------------------------------------
/dataFusion-ner/3rd-party-licenses.md:
--------------------------------------------------------------------------------
 1 | # datafusion-ner-licenses
 2 | 
 3 | Category | License | Dependency | Notes
 4 | --- | --- | --- | ---
 5 | Apache | [Apache 2](http://www.apache.org/licenses/LICENSE-2.0.txt) | de.jollyday # jollyday # 0.4.9 | <notextile></notextile>
 6 | Apache | [Apache 2](http://www.apache.org/licenses/LICENSE-2.0.txt) | io.spray # spray-json_2.12 # 1.3.3 | <notextile></notextile>
 7 | Apache | [Apache 2](http://www.apache.org/licenses/LICENSE-2.0.txt) | joda-time # joda-time # 2.9.4 | <notextile></notextile>
 8 | Apache | [Apache 2.0 License](http://www.apache.org/licenses/LICENSE-2.0.html) | com.typesafe.scala-logging # scala-logging_2.12 # 3.5.0 | <notextile></notextile>
 9 | Apache | [Apache License, Version 2.0](http://www.apache.org/licenses/LICENSE-2.0) | com.typesafe # config # 1.3.1 | <notextile></notextile>
10 | Apache | [The Apache Software License, Version 2.0](http://www.apache.org/licenses/LICENSE-2.0.txt) | org.apache.commons # commons-lang3 # 3.3.1 | <notextile></notextile>
11 | Apache | [The Apache Software License, Version 2.0](http://www.apache.org/licenses/LICENSE-2.0.txt) | org.apache.lucene # lucene-analyzers-common # 4.10.3 | <notextile></notextile>
12 | Apache | [The Apache Software License, Version 2.0](http://www.apache.org/licenses/LICENSE-2.0.txt) | org.apache.lucene # lucene-core # 4.10.3 | <notextile></notextile>
13 | Apache | [The Apache Software License, Version 2.0](http://www.apache.org/licenses/LICENSE-2.0.txt) | org.apache.lucene # lucene-queries # 4.10.3 | <notextile></notextile>
14 | Apache | [The Apache Software License, Version 2.0](http://www.apache.org/licenses/LICENSE-2.0.txt) | org.apache.lucene # lucene-queryparser # 4.10.3 | <notextile></notextile>
15 | Apache | [The Apache Software License, Version 2.0](http://www.apache.org/licenses/LICENSE-2.0.txt) | org.apache.lucene # lucene-sandbox # 4.10.3 | <notextile></notextile>
16 | Apache | [The Apache Software License, Version 2.0](http://www.apache.org/licenses/LICENSE-2.0.txt) | org.apache.opennlp # opennlp-tools # 1.8.1 | <notextile></notextile>
17 | Apache | [The Apache Software License, Version 2.0](http://www.apache.org/licenses/LICENSE-2.0.txt) | xalan # xalan # 2.7.0 | <notextile></notextile>
18 | Apache | [The Apache Software License, Version 2.0](http://www.apache.org/licenses/LICENSE-2.0.txt) | xerces # xercesImpl # 2.8.0 | <notextile></notextile>
19 | Apache | [The Apache Software License, Version 2.0](http://www.apache.org/licenses/LICENSE-2.0.txt) | xml-apis # xml-apis # 1.3.03 | <notextile></notextile>
20 | Apache | [the Apache License, ASL Version 2.0](http://www.apache.org/licenses/LICENSE-2.0) | org.scalactic # scalactic_2.12 # 3.0.4 | <notextile></notextile>
21 | Apache | [the Apache License, ASL Version 2.0](http://www.apache.org/licenses/LICENSE-2.0) | org.scalatest # scalatest_2.12 # 3.0.4 | <notextile></notextile>
22 | BSD | [BSD 3-Clause](http://www.scala-lang.org/license.html) | org.scala-lang # scala-library # 2.12.2 | <notextile></notextile>
23 | BSD | [BSD 3-Clause](http://www.scala-lang.org/license.html) | org.scala-lang # scala-reflect # 2.12.2 | <notextile></notextile>
24 | BSD | [BSD 3-clause](http://opensource.org/licenses/BSD-3-Clause) | org.scala-lang.modules # scala-parser-combinators_2.12 # 1.0.4 | <notextile></notextile>
25 | BSD | [BSD 3-clause](http://opensource.org/licenses/BSD-3-Clause) | org.scala-lang.modules # scala-xml_2.12 # 1.0.5 | <notextile></notextile>
26 | BSD | [BSD-Style](http://www.opensource.org/licenses/bsd-license.php) | com.jsuereth # scala-arm_2.12 # 2.0 | <notextile></notextile>
27 | BSD | [New BSD license](http://www.opensource.org/licenses/bsd-license.php) | com.google.protobuf # protobuf-java # 3.2.0 | <notextile></notextile>
28 | GPL | [GNU General Public License Version 3](http://www.gnu.org/licenses/gpl-3.0.txt) | edu.stanford.nlp # stanford-corenlp # 3.9.1 | <notextile></notextile>
29 | GPL | [GPL](https://www.gnu.org/licenses/gpl-3.0.en.html) | au.csiro.data61 # datafusion-common_2.12 # 1.1-SNAPSHOT | <notextile></notextile>
30 | GPL | [GPL2 w/ CPE](https://glassfish.java.net/public/CDDL+GPL_1_1.html) | javax.xml.bind # jaxb-api # 2.2.7 | <notextile></notextile>
31 | GPL with Classpath Extension | [CDDL + GPLv2 with classpath exception](https://glassfish.dev.java.net/nonav/public/CDDL+GPL.html) | javax.servlet # javax.servlet-api # 3.0.1 | <notextile></notextile>
32 | GPL with Classpath Extension | [CDDL + GPLv2 with classpath exception](https://glassfish.dev.java.net/nonav/public/CDDL+GPL.html) | org.glassfish # javax.json # 1.0.4 | <notextile></notextile>
33 | LGPL | [EPL + GNU Lesser General Public License](http://logback.qos.ch/license.html) | ch.qos.logback # logback-classic # 1.2.3 | <notextile></notextile>
34 | LGPL | [EPL + GNU Lesser General Public License](http://logback.qos.ch/license.html) | ch.qos.logback # logback-core # 1.2.3 | <notextile></notextile>
35 | LGPL | [LGPL](http://www.gnu.org/copyleft/lesser.html) | com.googlecode.efficient-java-matrix-library # ejml # 0.23 | <notextile></notextile>
36 | LGPL | [The GNU Lesser General Public License, Version 2.1](http://www.gnu.org/licenses/lgpl-2.1.html) | com.io7m.xom # xom # 1.2.10 | <notextile></notextile>
37 | MIT | [MIT License](http://www.opensource.org/licenses/mit-license.php) | com.github.scopt # scopt_2.12 # 3.7.0 | <notextile></notextile>
38 | MIT | [MIT License](http://www.slf4j.org/license.html) | org.slf4j # slf4j-api # 1.7.25 | <notextile></notextile>
39 | 
40 | 


--------------------------------------------------------------------------------
/dataFusion-ner/MITIE-native/centos/libjavamitie.so:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/data61/dataFusion/4b766ae65254eed6c5a095f3bb8b6b42b9500a9c/dataFusion-ner/MITIE-native/centos/libjavamitie.so


--------------------------------------------------------------------------------
/dataFusion-ner/MITIE-native/ubuntu/libjavamitie.so:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/data61/dataFusion/4b766ae65254eed6c5a095f3bb8b6b42b9500a9c/dataFusion-ner/MITIE-native/ubuntu/libjavamitie.so


--------------------------------------------------------------------------------
/dataFusion-ner/README.md:
--------------------------------------------------------------------------------
 1 | # dataFusion-ner
 2 | 
 3 | ## Introduction
 4 | 
 5 | This project provides a library and multi-threaded CLI (command line interface) for bulk processing.
 6 | It performs [Named Entity Recognition](https://en.wikipedia.org/wiki/Named-entity_recognition) using [CoreNLP](http://stanfordnlp.github.io/CoreNLP/), [OpenNLP](http://opennlp.apache.org/) and [MITIE](https://github.com/mit-nlp/MITIE).
 7 | 
 8 | ## Build, Configuration and Running
 9 | 
10 | This is mostly covered by the top level [README](../README.md), however MITIE is C++ code and has some particular requirements satisfied by the script `build-MITIE.sh`:
11 | 
12 | 1. a platform independent java library `lib/javamitie.jar`
13 | 2. a platform dependent shared library `MITIE-native/{platform}/libjavamitie.so`
14 | 3. language dependent models e.g. `MITIE-models/english/ner_model.dat`
15 | 4. environment variables to access the above
16 | 
17 | 1 and 2 (for both ubuntu and centos) are checked into the code repository (so if you use one of these hopefully you won't need to build MITIE except to use a newer version), however 3 (language models) are large, not in the code repository, and you will need to run the script to get them and to create the script `sh/setenv.{platform}` (for 4).
18 | 
19 | Run `build-MITIE.sh` with no args to do as little as necessary, or `build-MITIE.sh --clean` to start from scratch and build the lastest MITIE.
20 | 
21 | Configuration to run in Eclipse:
22 | 
23 | Select `Build Path` > `Configure Build Path` > `Source` > `dataFusion-ner/src/main/scala` > `Native library location`
24 | and add the `MITIE-native/{platform}` directory.
25 | 
26 | 


--------------------------------------------------------------------------------
/dataFusion-ner/build-MITIE.sh:
--------------------------------------------------------------------------------
 1 | #! /bin/sh
 2 | 
 3 | set -vex
 4 | 
 5 | # run in dataFusion-ner dir
 6 | DFUS_NER=$PWD
 7 | 
 8 | # ubuntu has: ID=ubuntu but centos has: ID="centos"
 9 | OS=`sed --regexp-extended --quiet 's/^ID="?([a-z]+)"?$/\1/p' /etc/os-release`
10 | 
11 | NER=ner_model.dat
12 | EN=MITIE-models/english
13 | ES=MITIE-models/spanish
14 | 
15 | # when these files are missing the script makes a fresh start, else it does as little as necessary
16 | [ "$1" = "--clean" ] && rm -rf MITIE MITIE-native/$OS/libjavamitie.so "$EN/$NER" ../sh/setenv.$OS # "$ES/$NER"
17 | 
18 | # Build MITIE java jar and native shared library
19 | [ -d MITIE ] || git clone https://github.com/mit-nlp/MITIE
20 | [ -f MITIE-native/$OS/libjavamitie.so ] || {
21 |   BUILD=MITIE/mitielib/java/build-$OS
22 |   rm -rf $BUILD
23 |   mkdir -p $BUILD
24 |   cd $BUILD
25 |   cmake ..
26 |   cmake --build . --config Release --target install
27 | 
28 |   # Install MITIE libraries where this project expects them
29 |   mkdir -p $DFUS_NER/lib $DFUS_NER/MITIE-native/$OS
30 |   cp lib/javamitie.jar $DFUS_NER/lib
31 |   cp lib/libjavamitie.so $DFUS_NER/MITIE-native/$OS
32 | 
33 |   cd $DFUS_NER
34 | }
35 | 
36 | # Install English NER model
37 | [ -r "$EN/$NER" ] || {
38 |   echo "Downloading English models ..."
39 |   EN_BZ2=MITIE-models-v0.2.tar.bz2
40 |   curl --location https://github.com/mit-nlp/MITIE/releases/download/v0.4/$EN_BZ2 > $EN_BZ2
41 |   tar xvfj $EN_BZ2 $EN/$NER  # only extract EN NER model
42 |   rm $EN_BZ2
43 | }
44 | 
45 | # Install Spanish NER model
46 | if false; then
47 | [ -r "$ES/$NER" ] || {
48 |   echo "Downloading Spanish models ..."
49 |   ES_ZIP=MITIE-models-v0.2-Spanish.zip
50 |   curl --location https://github.com/mit-nlp/MITIE/releases/download/v0.4/$ES_ZIP > $ES_ZIP
51 |   unzip $ES_ZIP $ES/$NER  # only extract ES NER model
52 |   rm $ES_ZIP
53 | }
54 | fi
55 | 
56 | # create a file that can be sourced to set required environment variables
57 | [ -r "../sh/setenv.$OS" ] || {
58 | cat > ../sh/setenv.$OS <<EoF1
59 | #! /not/to/be/execed
60 |  
61 | # used by sh/dfus
62 | export DFUS_DIR=\${PWD}
63 | export SCALA_VER=2.12
64 | export DFUS_VER=`sed --regexp-extended --quiet 's/^.*"(.+)"$/\1/p' ../version.sbt`
65 | 
66 | # needed by dataFusion-ner (including sbt tests)
67 | export LD_LIBRARY_PATH=\${DFUS_DIR}/dataFusion-ner/MITIE-native/$OS   # directory containing libjavamitie.so
68 | export NER_MITIE_ENGLISH_MODEL=\${DFUS_DIR}/dataFusion-ner/$EN/$NER
69 | # export NER_MITIE_SPANISH_MODEL=\${DFUS_DIR}/dataFusion-ner/$ES/$NER
70 | 
71 | SEARCH_DIR=\${DFUS_DIR}/dataFusion-search
72 | export SEARCH_SYNONYMS=\${SEARCH_DIR}/synonyms.txt
73 | export SEARCH_DOC_INDEX=\${SEARCH_DIR}/docIndex
74 | export SEARCH_META_INDEX=\${SEARCH_DIR}/metaIndex
75 | export SEARCH_NER_INDEX=\${SEARCH_DIR}/nerIndex
76 | 
77 | export PATH=\${DFUS_DIR}/sh:/usr/sbin:/usr/bin:/sbin:/bin
78 | 
79 | cat <<EoF2
80 | Source this file with the dataFusion source tree top level dir as the current directory.
81 | It sets env vars for the project and sets a restricted PATH with dataFusion/sh as the highest priority item.
82 | 
83 | Try "dfus -h" to get started.
84 | EoF2
85 | 
86 | EoF1
87 | }
88 | 
89 | 
90 | 


--------------------------------------------------------------------------------
/dataFusion-ner/build.sbt:
--------------------------------------------------------------------------------
 1 | name := "dataFusion-ner"
 2 | 
 3 | libraryDependencies ++= Seq(
 4 |   "edu.stanford.nlp" % "stanford-corenlp" % "3.9.1",
 5 |   "edu.stanford.nlp" % "stanford-corenlp" % "3.9.1" classifier "models", //  classifier "models-spanish",
 6 |   
 7 |   "org.apache.opennlp" % "opennlp-tools" % "1.9.0",
 8 |   "com.typesafe" % "config" % "1.3.1",
 9 |   "com.github.scopt" %% "scopt" % "3.7.0",
10 |   "com.jsuereth" %% "scala-arm" % "2.0",
11 |   "org.scalatest" %% "scalatest" % "3.0.4" % "test"
12 | )
13 | 
14 | com.github.retronym.SbtOneJar.oneJarSettings
15 | 
16 | mainClass in Compile := Some("au.csiro.data61.dataFusion.ner.Main")
17 | 


--------------------------------------------------------------------------------
/dataFusion-ner/lib/javamitie.jar:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/data61/dataFusion/4b766ae65254eed6c5a095f3bb8b6b42b9500a9c/dataFusion-ner/lib/javamitie.jar


--------------------------------------------------------------------------------
/dataFusion-ner/src/main/resources/application.conf:
--------------------------------------------------------------------------------
1 | mitie = {
2 |   englishNerModel = MITIE-models/english/ner_model.dat
3 |   spanishNerModel = MITIE-models/spanish/ner_model.dat
4 | 
5 |   englishNerModel = ${?NER_MITIE_ENGLISH_MODEL}
6 |   spanishNerModel = ${?NER_MITIE_SPANISH_MODEL}
7 | }
8 | 


--------------------------------------------------------------------------------
/dataFusion-ner/src/main/resources/logback.xml:
--------------------------------------------------------------------------------
 1 | <configuration>
 2 | 
 3 |   <appender name="CONS" class="ch.qos.logback.core.ConsoleAppender">
 4 |     <encoder>
 5 |       <pattern>%d{yyyy-MM-dd HH:mm:ss.SSS} [%thread] %-5level %logger{36} - %msg%n</pattern>
 6 |     </encoder>
 7 |   </appender>
 8 |   
 9 |   <appender name="FILE" class="ch.qos.logback.core.FileAppender">
10 |     <file>ner.log</file>
11 |     <encoder>
12 |       <pattern>%d{yyyy-MM-dd HH:mm:ss.SSS} [%thread] %-5level %logger{36} - %msg%n</pattern>
13 |     </encoder>
14 |   </appender>
15 | 
16 | <!--   <logger name="au.csiro.data61" level="DEBUG" /> -->
17 |   
18 |   <root level="INFO">
19 |     <appender-ref ref="FILE" />
20 |   </root>
21 | </configuration>
22 | 


--------------------------------------------------------------------------------
/dataFusion-ner/src/main/resources/opennlp-models-1.5/en-ner-date.bin:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/data61/dataFusion/4b766ae65254eed6c5a095f3bb8b6b42b9500a9c/dataFusion-ner/src/main/resources/opennlp-models-1.5/en-ner-date.bin


--------------------------------------------------------------------------------
/dataFusion-ner/src/main/resources/opennlp-models-1.5/en-ner-location.bin:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/data61/dataFusion/4b766ae65254eed6c5a095f3bb8b6b42b9500a9c/dataFusion-ner/src/main/resources/opennlp-models-1.5/en-ner-location.bin


--------------------------------------------------------------------------------
/dataFusion-ner/src/main/resources/opennlp-models-1.5/en-ner-money.bin:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/data61/dataFusion/4b766ae65254eed6c5a095f3bb8b6b42b9500a9c/dataFusion-ner/src/main/resources/opennlp-models-1.5/en-ner-money.bin


--------------------------------------------------------------------------------
/dataFusion-ner/src/main/resources/opennlp-models-1.5/en-ner-organization.bin:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/data61/dataFusion/4b766ae65254eed6c5a095f3bb8b6b42b9500a9c/dataFusion-ner/src/main/resources/opennlp-models-1.5/en-ner-organization.bin


--------------------------------------------------------------------------------
/dataFusion-ner/src/main/resources/opennlp-models-1.5/en-ner-percentage.bin:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/data61/dataFusion/4b766ae65254eed6c5a095f3bb8b6b42b9500a9c/dataFusion-ner/src/main/resources/opennlp-models-1.5/en-ner-percentage.bin


--------------------------------------------------------------------------------
/dataFusion-ner/src/main/resources/opennlp-models-1.5/en-ner-person.bin:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/data61/dataFusion/4b766ae65254eed6c5a095f3bb8b6b42b9500a9c/dataFusion-ner/src/main/resources/opennlp-models-1.5/en-ner-person.bin


--------------------------------------------------------------------------------
/dataFusion-ner/src/main/resources/opennlp-models-1.5/en-ner-time.bin:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/data61/dataFusion/4b766ae65254eed6c5a095f3bb8b6b42b9500a9c/dataFusion-ner/src/main/resources/opennlp-models-1.5/en-ner-time.bin


--------------------------------------------------------------------------------
/dataFusion-ner/src/main/resources/opennlp-models-1.5/en-sent.bin:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/data61/dataFusion/4b766ae65254eed6c5a095f3bb8b6b42b9500a9c/dataFusion-ner/src/main/resources/opennlp-models-1.5/en-sent.bin


--------------------------------------------------------------------------------
/dataFusion-ner/src/main/resources/opennlp-models-1.5/en-token.bin:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/data61/dataFusion/4b766ae65254eed6c5a095f3bb8b6b42b9500a9c/dataFusion-ner/src/main/resources/opennlp-models-1.5/en-token.bin


--------------------------------------------------------------------------------
/dataFusion-ner/src/main/scala/au/csiro/data61/dataFusion/ner/MITIE.scala:
--------------------------------------------------------------------------------
  1 | package au.csiro.data61.dataFusion.ner
  2 | 
  3 | import java.io.{ File, FileNotFoundException }
  4 | import java.nio.charset.Charset
  5 | 
  6 | import scala.language.{ implicitConversions, reflectiveCalls }
  7 | 
  8 | import com.typesafe.config.ConfigFactory
  9 | import com.typesafe.scalalogging.Logger
 10 | 
 11 | import edu.mit.ll.mitie.{ NamedEntityExtractor, global }
 12 | 
 13 | import au.csiro.data61.dataFusion.common.Data.Ner
 14 | import java.nio.charset.Charset
 15 | import au.csiro.data61.dataFusion.common.Timer
 16 | 
 17 | /**
 18 |  * MITIE https://github.com/mit-nlp/MITIE
 19 |  * is C++ code that can be compiled to use optimised BLAS implementations (or use its own).
 20 |  * The Java wrapper is not available in maven repos so this entails sbt unmanaged jar in lib
 21 |  * and loading a native shared library.
 22 |  */
 23 | object MITIE {
 24 |   val log = Logger(getClass)
 25 |   val conf = ConfigFactory.load
 26 |   val utf8 = Charset.forName("UTF-8")
 27 |   
 28 |   log.info(s"loading MITIE native library")
 29 |   System.loadLibrary("javamitie")
 30 |   
 31 |   // the mitie.*Vector classes declare no common interface, so we resort to a "structural type" (i.e. duck type)
 32 |   implicit def toIter[T](v: { def get(i: Int): T; def size(): Long }) = new Iterator[T] {
 33 |     var i = 0
 34 |     override def hasNext = i < v.size
 35 |     override def next = {
 36 |       val n = v.get(i)
 37 |       i += 1
 38 |       n
 39 |     }
 40 |   }
 41 |   
 42 | //  // http://stackoverflow.com/questions/15038616/how-to-convert-between-character-and-byte-position-in-objective-c-c-c
 43 | //  // map a UTF-8 byte to it's width in UTF-16 ints:
 44 | //  // - leading byte of 1-3 byte UTF-8 chars -> 1 (these chars map to 1 UTF-16 int)
 45 | //  // - leading byte of 4 byte UTF-8 chars -> 2 (these chars map to 2 UTF-16 ints)
 46 | //  // - extension bytes -> 0
 47 | //  val utf16width = (0 until 256 map {
 48 | //    case i if Seq(0 to 0x7f, 0xc0 to 0xdf, 0xe0 to 0xef).exists(_.contains(i)) => 1
 49 | //    case i if (0xf0 to 0xf7).contains(i) => 2
 50 | //    case _ => 0
 51 | //  }).toArray
 52 | //  def javaOffset2(utf8Str: Array[Byte], from: Int, until: Int) = {
 53 | //    var x = 0
 54 | //    for (i <- from until until) {
 55 | //      x += utf16width(utf8Str(i) & 0xff) // prevent byte to int sign extension
 56 | //    }
 57 | //    x
 58 | //  }
 59 | 
 60 |   // Timing of this impl compared to above:
 61 |   // short strings: javaOffset 5.2 secs; javaOffset2 7.3 secs
 62 |   // long strings: javaOffset 2.891 secs; javaOffset2 2.892 secs
 63 |   // so there is no justification for the complexity of javaOffset2
 64 |   def javaOffset(utf8Str: Array[Byte], from: Int, until: Int) = new String(utf8Str, from, until - from, utf8).length
 65 |   
 66 |   class Nlp(path: String) {
 67 |     if (!new File(path).canRead) throw new FileNotFoundException(s"Can't read $path")
 68 |     log.info(s"loading MITIE model $path")
 69 |     
 70 |     // multi-threading test shows that NamedEntityExtractor is not thread-safe
 71 |     // MITIE comes with a generic non-thread-safe warning in: https://github.com/mit-nlp/MITIE/blob/master/mitielib/include/mitie.h
 72 |     val neExtractor = new ThreadLocal[NamedEntityExtractor] {
 73 |       override protected def initialValue = new NamedEntityExtractor(path)
 74 |     }
 75 |     val neTypes = neExtractor.get.getPossibleNerTags.toIndexedSeq
 76 |     log.debug(s"Nlp: posible ner types from $path are $neTypes")
 77 |    
 78 |     // val t = Timer()
 79 |     
 80 |     def ner(in: String) = {
 81 |       
 82 |       val inUtf8 = in.getBytes(utf8) // MITIE's offsets are relative to this
 83 |       // we get the NERs in order of increasing offset, so we can calculate the offsets incrementally
 84 |       var o8 = 0
 85 |       var o16 = 0
 86 |       def toJavaOffset(utf8Off: Int) = {
 87 |         if (utf8Off < o8) {
 88 |           log.debug(s"resetting o8, o16: utf8Off = $utf8Off, o8 = $o8, o16 = $o16")
 89 |           o8 = 0
 90 |           o16 = 0
 91 |         }
 92 |         o16 += javaOffset(inUtf8, o8, utf8Off)
 93 |         o8 = utf8Off
 94 |         o16
 95 |       }
 96 |       
 97 |       val words = global.tokenizeWithOffsets(in) // multi-threading test appears to show that this is thread-safe 
 98 |       neExtractor.get.extractEntities(words).map { e =>
 99 |         val offStrUtf8 = words.get(e.getStart).getIndex.toInt
100 |         val offStr = toJavaOffset(offStrUtf8)
101 |         val end = words.get(e.getEnd - 1)
102 |         val offEnd = toJavaOffset(end.getIndex.toInt) + end.getToken.length
103 |         Ner(
104 |           e.getStart, e.getEnd, offStr, offEnd,
105 |           e.getScore, in.substring(offStr, offEnd),
106 |           neTypes(e.getTag), "MITIE", None
107 |         )
108 |       }.toList
109 |     }
110 |   }
111 |     
112 |   object English {
113 |     val nlp = new Nlp(conf.getString("mitie.englishNerModel"))
114 |   }
115 |   
116 | //  object Spanish {
117 | //    val nlp = new Nlp(conf.getString("mitie.spanishNerModel"))
118 | //  }
119 |       
120 | //  def ner(lang: String, in: String): List[Ner] =
121 | //    lang match {
122 | //      case "es" => {
123 | //        // log.debug("Spanish")
124 | //        Spanish.nlp.ner(in)
125 | //      }
126 | //      case _ => {
127 | //        // log.debug("English")
128 | //        English.nlp.ner(in)
129 | //      }
130 | //    }
131 |   def ner(lang: String, in: String): List[Ner] = English.nlp.ner(in)
132 | }
133 | 


--------------------------------------------------------------------------------
/dataFusion-ner/src/main/scala/au/csiro/data61/dataFusion/ner/OpenNLP.scala:
--------------------------------------------------------------------------------
  1 | package au.csiro.data61.dataFusion.ner
  2 | 
  3 | import java.io.InputStream
  4 | import java.util.Properties
  5 | 
  6 | import scala.collection.JavaConverters._
  7 | 
  8 | import com.typesafe.scalalogging.Logger
  9 | 
 10 | import edu.stanford.nlp.ling.CoreAnnotations.{ SentencesAnnotation, TokensAnnotation }
 11 | import edu.stanford.nlp.pipeline.Annotator.{ STANFORD_SSPLIT, STANFORD_TOKENIZE }
 12 | import edu.stanford.nlp.pipeline.StanfordCoreNLP
 13 | import opennlp.tools.namefind.{ NameFinderME, TokenNameFinderModel }
 14 | import opennlp.tools.sentdetect.{ SentenceDetectorME, SentenceModel }
 15 | import opennlp.tools.tokenize.{ TokenizerME, TokenizerModel }
 16 | import resource.managed
 17 | 
 18 | import au.csiro.data61.dataFusion.common.Data.Ner
 19 | 
 20 | object OpenNLP {
 21 |    val log = Logger(getClass)
 22 |  
 23 |   def loadModel[M](path: String, ctor: InputStream => M) = {
 24 |     log.info(s"loading OpenNLP model $path")
 25 |     managed(getClass.getResourceAsStream(path)).map(ctor).tried.get
 26 |   }
 27 |     
 28 |   // *Model's are thread-safe
 29 |   object English {
 30 |     val sentence = loadModel("/opennlp-models-1.5/en-sent.bin", in => new SentenceModel(in))
 31 |     val tokenizer = loadModel("/opennlp-models-1.5/en-token.bin", in => new TokenizerModel(in))
 32 |     val ners = Seq("date", "location", "money", "organization", "percentage", "person", "time").map { typ =>
 33 |       loadModel(s"/opennlp-models-1.5/en-ner-${typ}.bin", in => new TokenNameFinderModel(in))
 34 |     }
 35 |   }
 36 |   
 37 | //  object Spanish {
 38 | //    /** Spanish sentence & tokenizer models used in training Spanish NameFinder models are not available,
 39 | //      * so use CoreNLP (just for this) and hope it is not too different!
 40 | //      */
 41 | //    val coreNLP = managed(getClass.getResourceAsStream("/StanfordCoreNLP-spanish.properties")).map { in =>
 42 | //        val p = new Properties
 43 | //        p.load(in)
 44 | //        p.setProperty("annotators", Seq(STANFORD_TOKENIZE, STANFORD_SSPLIT).mkString(", "))
 45 | //        CoreNLP.synchronized { new StanfordCoreNLP(p, true) } // synchronized else multi-threaded sbt test fails
 46 | //      }.tried.get
 47 | //      
 48 | //    val ners = Seq("location", "organization", "person", "misc").map { typ =>
 49 | //      loadModel(s"/opennlp-models-1.5/es-ner-${typ}.bin", in => new TokenNameFinderModel(in))
 50 | //    }
 51 | //  }
 52 |   
 53 |   /**
 54 |    * Not thread-safe 
 55 |    */
 56 |   class EnOpenNLP {
 57 |     // because *ME's are not thread-safe
 58 |     val sent = new SentenceDetectorME(English.sentence)
 59 |     val tok = new TokenizerME(English.tokenizer)
 60 |     val ners = English.ners.map(new NameFinderME(_))
 61 |     
 62 |     def ner(in: String): List[Ner] = {
 63 |       var tokenIdx = 0;
 64 |       val r = for {
 65 |         sentencePos <- sent.sentPosDetect(in)
 66 |         sentence = in.substring(sentencePos.getStart, sentencePos.getEnd)
 67 |         pos = tok.tokenizePos(sentence)
 68 |         tIdx = tokenIdx
 69 |         _ = tokenIdx += pos.size // start of next sentence
 70 |         tokens = pos.map(s => sentence.substring(s.getStart, s.getEnd))
 71 |         ner <- ners
 72 |         s <- ner.find(tokens)
 73 |         start = sentencePos.getStart + pos(s.getStart).getStart
 74 |         end = sentencePos.getStart + pos(s.getEnd - 1).getEnd
 75 |       } yield Ner(tIdx + s.getStart, tIdx + s.getEnd, start, end, s.getProb, in.substring(start, end), s.getType.toUpperCase, "OpenNLP", None)
 76 |       
 77 |       ners.foreach(_.clearAdaptiveData)
 78 |       r.toList
 79 |     }
 80 |   }
 81 |   val enOpenNLP = new ThreadLocal[EnOpenNLP] {
 82 |     override protected def initialValue = new EnOpenNLP
 83 |   }
 84 |   
 85 |       
 86 |   /**
 87 |    * Not thread-safe 
 88 |    */
 89 | //  class EsOpenNLP {
 90 | //    // because *ME are not thread-safe
 91 | //    val ners = Spanish.ners.map(new NameFinderME(_))
 92 | //    
 93 | //    def ner(in: String): List[Ner] = {
 94 | //      var tokenIdx = 0;
 95 | //      val r = for {
 96 | //        sentence <- Spanish.coreNLP.process(in).get(classOf[SentencesAnnotation]).asScala
 97 | //        tokens = sentence.get(classOf[TokensAnnotation]).asScala.toArray
 98 | //        tIdx = tokenIdx
 99 | //        _ = tokenIdx += tokens.size // token index of start of next sentence
100 | //        ner <- ners
101 | //        s <- ner.find(tokens.map(_.originalText))
102 | //        start = tokens(s.getStart).beginPosition
103 | //        end = tokens(s.getEnd - 1).endPosition
104 | //      } yield Ner(tIdx + s.getStart, tIdx + s.getEnd, start, end, s.getProb, in.substring(start, end), s.getType.toUpperCase, "OpenNLP")
105 | //      
106 | //      ners.foreach(_.clearAdaptiveData)
107 | //      r.toList
108 | //    }
109 | //  }
110 | //  val esOpenNLP = new ThreadLocal[EsOpenNLP] {
111 | //    override protected def initialValue = new EsOpenNLP
112 | //  }
113 |   
114 |   /** thread-safe */
115 | //  def ner(lang: String, in: String): List[Ner] =
116 | //    lang match {
117 | //      case "es" => {
118 | //        // log.debug("Spanish")
119 | //        esOpenNLP.get.ner(in)
120 | //      }
121 | //      case _ => {
122 | //        // log.debug("English")
123 | //        enOpenNLP.get.ner(in)
124 | //      }
125 | //    }
126 |   def ner(lang: String, in: String): List[Ner] = enOpenNLP.get.ner(in)
127 | }
128 | 


--------------------------------------------------------------------------------
/dataFusion-ner/src/main/scala/au/csiro/data61/dataFusion/ner/Split.scala:
--------------------------------------------------------------------------------
 1 | package au.csiro.data61.dataFusion.ner
 2 | 
 3 | import com.typesafe.scalalogging.Logger
 4 | 
 5 | object Split {
 6 |   private val log = Logger(getClass)
 7 | 
 8 |   val allAlpha = """\p{Alpha}{3,}""".r
 9 |   val hasVowel = """(?i)[aeiou]""".r.unanchored
10 |   val hasConsonant = """(?i)[a-z&&[^aeiouy]]""".r.unanchored
11 |   
12 |   def wordLike(word: String) =
13 |     (for {
14 |       a <- allAlpha.unapplySeq(word)
15 |       b <- hasVowel.unapplySeq(word)
16 |       c <- hasConsonant.unapplySeq(word)
17 |     } yield (a, b, c)).isDefined
18 | 
19 |   def containsWordLike(line: String) = line.split(" +") exists wordLike
20 |     
21 |   /**
22 |    * CoreNLP doesn't terminate on long input, so split on lines where containsWordLike is false
23 |    * (that includes blank lines so it splits on paragraphs),
24 |    * but don't split again for the next splitmin lines.
25 |    * Paragraphs longer than splitmax are split in segments of splitmax lines, without considering sentence breaks.
26 |    * This is a bit drastic, but less so than non-terminating processing.
27 |    * This handles spreadsheet data with potentially very long "paragraphs".
28 |    */
29 |   def splitParagraphs(lines: IndexedSeq[String], splitmin: Int, splitmax: Int): Iterator[(Int, Int, String)] = {
30 |     val splits = (for ((l, i) <- lines.zipWithIndex if !containsWordLike(l)) yield i).toList :+ lines.size
31 |     log.debug(s"main: splits = $splits")
32 |     val splitsFiltered = (splits.foldLeft((splitmin, 0, List(0))){ case ((maxi, prev, result), x) => 
33 |       log.debug(s"splitParagraphs: x = $x, maxi = $maxi")
34 |       if (x <= maxi) {
35 |         (maxi, x, result) // drop values within range of splitmin
36 |       } else {
37 |         val z = ((prev + splitmax) until x by splitmax).toList :+ x // if bigger than splitmax then split every splitmax
38 |         (x + splitmin, x, z.reverse ++ result)
39 |       }
40 |     })._3.reverse
41 |     log.debug(s"splitParagraphs: splitsFiltered = $splitsFiltered")
42 |     if (splitsFiltered.size < 2) {
43 |       Iterator.single((0, lines.size, lines.mkString("", "\n", "\n")))
44 |     } else {
45 |       splitsFiltered.sliding(2).flatMap {
46 |         case a :: b :: Nil if a < b => List((a, b, lines.slice(a, b).mkString("", "\n", "\n"))) // include trailing \n so concatenating splits gives original String 
47 |         case _ => List.empty
48 |       }
49 |     }
50 |   }
51 |     
52 | //  def main(args: Array[String]): Unit = {
53 | //    implicit val utf8 = io.Codec.UTF8
54 | //    val lines = io.Source.fromFile("/home/bac003/sw/submissions/data/submissions/sub-043-part-1.txt").getLines.take(100).toIndexedSeq
55 | //    val splits = split(lines, 5).toList
56 | //    log.debug(s"main: splits = $splits")
57 | //  }
58 | }


--------------------------------------------------------------------------------
/dataFusion-ner/src/test/resources/logback-test.xml:
--------------------------------------------------------------------------------
 1 | <configuration>
 2 | 
 3 |   <appender name="CONS" class="ch.qos.logback.core.ConsoleAppender">
 4 |     <encoder>
 5 |       <pattern>%d{yyyy-MM-dd HH:mm:ss.SSS} [%thread] %-5level %logger{36} - %msg%n</pattern>
 6 |     </encoder>
 7 |   </appender>
 8 |   
 9 |   <appender name="FILE" class="ch.qos.logback.core.FileAppender">
10 |     <file>ner-test.log</file>
11 |     <encoder>
12 |       <pattern>%d{yyyy-MM-dd HH:mm:ss.SSS} [%thread] %-5level %logger{36} - %msg%n</pattern>
13 |     </encoder>
14 |   </appender>
15 | 
16 |   <logger name="au.csiro.data61" level="DEBUG" />
17 | 
18 |   <root level="INFO">
19 |     <appender-ref ref="CONS" />
20 |   </root>
21 | </configuration>
22 | 


--------------------------------------------------------------------------------
/dataFusion-ner/src/test/scala/au/csiro/data61/dataFusion/ner/CoreNLPTest.scala:
--------------------------------------------------------------------------------
 1 | package au.csiro.data61.dataFusion.ner
 2 | 
 3 | import org.scalatest.{ Finders, FlatSpec, Matchers }
 4 | 
 5 | import com.typesafe.scalalogging.Logger
 6 | 
 7 | import au.csiro.data61.dataFusion.common.Data.Ner
 8 | import CoreNLP._
 9 | 
10 | class CoreNLPTest extends FlatSpec with Matchers {
11 |   val log = Logger(getClass)
12 | 
13 |   val en ="en"
14 |   val enTxt = """The Clinton Engineer Works was the site of the Manhattan Project's World War II production facilities that provided the enriched uranium used in the bombing of Hiroshima in August 1945.
15 | 
16 | Its X-10 Graphite Reactor produced the first samples of plutonium from a reactor.
17 | 
18 | Located just south of the town of Clinton, Tennessee, it included the production facilities of the K-25, Y-12 and S-50 projects, various utilities, and the township of Oak Ridge.
19 | 
20 | The Manhattan District Engineer, Kenneth Nichols, moved the Manhattan District headquarters there from Manhattan in August 1943.
21 | """
22 | 
23 | //  val es = "es"
24 | //  val esTxt = """Cristóbal Colón, Cristoforo Colombo en italiano o Christophorus Columbus en latín (Génova,n. 1 1 2 c. 1436-14513 -Valladolid, 20 de mayo de 1506), fue un navegante, cartógrafo, almirante, virrey y gobernador general de las Indias Occidentales al servicio de la Corona de Castilla.
25 | //
26 | //Es famoso por haber realizado el descubrimiento de América, el 12 de octubre de 1492, al llegar a la isla de Guanahani, actualmente en las Bahamas.
27 | //
28 | //Efectuó cuatro viajes a las Indias —denominación del continente americano hasta la publicación del Planisferio de Martín Waldseemüller en 1507— y aunque posiblemente no fue el primer explorador europeo de América, se le considera el descubridor de un nuevo continente —por eso llamado el Nuevo Mundo— para Europa, al ser el primero que trazó una ruta de ida y vuelta a través del océano Atlántico y dio a conocer la noticia.
29 | //
30 | //Este hecho impulsó decisivamente la expansión mundial de la civilización europea, y la conquista y colonización por varias de sus potencias del continente americano.
31 | //"""
32 |   
33 |   "CoreNLP NER" should "get English entities" in {
34 |     val ners = nerSplitParagraphs(en, enTxt, 1, 1) // split into small chunks
35 |     log.debug(s"ners = ${ners}")
36 |     // 3.9.1 has many new tags over 3.8.0, e.g. CITY, STATE_OR_PROVINCE (rather than just LOCATION), TITLE for job title, CAUSE_OF_DEATH (for "war"), CRIMINAL_CHARGE for "bombing"
37 |     assert(ners.contains(Ner(12, 15, 67, 79, 1.0, "World War II", "MISC", "CoreNLP", None)))
38 |     assert(ners.contains(Ner(98, 100, 566, 577, 1.0, "August 1943", "DATE", "CoreNLP", None)))
39 |   }
40 |   
41 |   // CoreNLP 3.8.0, 3.9.1 fail this test
42 |   // For ATO project we built corenlp from latest in github on 2017-09-22 (while 3.8.0 was the current release) and this passed.
43 | //  it should "handle no space between digits and mutiplier" in {
44 | //    for (mult <- Seq("hundred", "thousand", "million", "billion", "trillion")) {
45 | //      val text = "Henry bought Sally a new car for $3.75" + mult + " for her birthday."
46 | //      val ners = ner("en", text)
47 | //      log.debug(s"text = $text, ners = ${ners}")
48 | //      assert(ners.exists(_.typ == "MONEY"))
49 | //    }
50 | //  }
51 |   
52 | //  it should "get Spanish entities" in {
53 | //    val ners = nerSplit(es, esTxt, 1) // split into small chunks
54 | //    log.debug(s"ners = ${ners}")
55 | //    assert(ners.contains(Ner(0, 2, 0, 15, 1.0, "Cristóbal Colón", "PERSON", "CoreNLP")))
56 | //    assert(ners.contains(Ner(141, 142, 737, 743, 1.0, "Europa", "LOCATION", "CoreNLP")))
57 | //  }
58 | //  
59 | //  it should "get Spanish entities in mutiple threads" in {
60 | //    val expected = nerSplit(es, esTxt, 1) // split into small chunks
61 | //    
62 | //    val r = new Runnable {
63 | //      override def run = {
64 | //        val ners = ner(es, esTxt) // would split to 100 lines, but text is smaller than that, so no split
65 | //        ners should be(expected)
66 | //      }
67 | //    }
68 | //    val threads = Iterator.range(0, 8).map { _ => 
69 | //      val t = new Thread(r)
70 | //      t.start
71 | //      t
72 | //    }.toList
73 | //    threads.foreach(_.join)
74 | //  }
75 | }


--------------------------------------------------------------------------------
/dataFusion-ner/src/test/scala/au/csiro/data61/dataFusion/ner/OpenNLPTest.scala:
--------------------------------------------------------------------------------
 1 | package au.csiro.data61.dataFusion.ner
 2 | 
 3 | import org.scalatest.{ FlatSpec, Matchers }
 4 | 
 5 | import com.typesafe.scalalogging.Logger
 6 | 
 7 | import au.csiro.data61.dataFusion.common.Data.Ner
 8 | import OpenNLP.ner
 9 | 
10 | class OpennlpNerTest extends FlatSpec with Matchers {
11 |   val log = Logger(getClass)
12 | 
13 |   val en = "en"
14 |   val enTxt = """The Clinton Engineer Works was the site of the Manhattan Project's World War II production facilities that provided the enriched uranium used in the bombing of Hiroshima in August 1945. Its X-10 Graphite Reactor produced the first samples of plutonium from a reactor. Located just south of the town of Clinton, Tennessee, it included the production facilities of the K-25, Y-12 and S-50 projects, various utilities, and the township of Oak Ridge. The Manhattan District Engineer, Kenneth Nichols, moved the Manhattan District headquarters there from Manhattan in August 1943. """
15 |   
16 |   val es = "es"
17 |   val esTxt = """Cristóbal Colón, Cristoforo Colombo en italiano o Christophorus Columbus en latín (Génova,n. 1 1 2 c. 1436-14513 -Valladolid, 20 de mayo de 1506), fue un navegante, cartógrafo, almirante, virrey y gobernador general de las Indias Occidentales al servicio de la Corona de Castilla. Es famoso por haber realizado el descubrimiento de América, el 12 de octubre de 1492, al llegar a la isla de Guanahani, actualmente en las Bahamas.
18 | Efectuó cuatro viajes a las Indias —denominación del continente americano hasta la publicación del Planisferio de Martín Waldseemüller en 1507— y aunque posiblemente no fue el primer explorador europeo de América, se le considera el descubridor de un nuevo continente —por eso llamado el Nuevo Mundo— para Europa, al ser el primero que trazó una ruta de ida y vuelta a través del océano Atlántico y dio a conocer la noticia. Este hecho impulsó decisivamente la expansión mundial de la civilización europea, y la conquista y colonización por varias de sus potencias del continente americano."""
19 |   
20 |   "OpenNLP NER" should "get English entities" in {
21 |     val ners = ner(en, enTxt)
22 |     log.debug(s"ners = $ners")
23 |     assert(ners.map(_.copy(score = 1.0)).contains(Ner(78, 80, 436, 445, 1.0, "Oak Ridge", "LOCATION", "OpenNLP", None)))
24 |   }
25 |   
26 | //  it should "get Spanish entities" in {
27 | //    val ners = ner(es, esTxt)
28 | //    log.debug(s"ners = ${ners}")
29 | //    assert(ners.map(_.copy(score = 1.0)).contains(Ner(80, 81, 390, 399, 1.0, "Guanahani", "LOCATION", "OpenNLP")))
30 | //  }
31 | //  
32 | //  it should "get Spanish entities in mutiple threads" in {
33 | //    val expected = ner(es, esTxt)
34 | //    
35 | //    val r = new Runnable {
36 | //      override def run = {
37 | //        val ners = ner(es, esTxt)
38 | //        ners should be(expected)
39 | //      }
40 | //    }
41 | //    val threads = Iterator.range(0, 8).map { _ => 
42 | //      val t = new Thread(r)
43 | //      t.start
44 | //      t
45 | //    }.toList
46 | //    threads.foreach(_.join)
47 | //  }
48 | }


--------------------------------------------------------------------------------
/dataFusion-ner/src/test/scala/au/csiro/data61/dataFusion/ner/SplitTest.scala:
--------------------------------------------------------------------------------
 1 | package au.csiro.data61.dataFusion.ner
 2 | 
 3 | import org.scalatest.{ Finders, FlatSpec, Matchers }
 4 | 
 5 | import com.typesafe.scalalogging.Logger
 6 | import Split._
 7 | 
 8 | class SplitTest extends FlatSpec with Matchers {
 9 |   val log = Logger(getClass)
10 |   val wrdLike = Seq("word", "Word", "zzza")
11 |   val nonWrdLike = Seq("", "an", "zebra.", "sky", "aeiou")
12 |   
13 |   "wordLike" should "detect tokens looking like words" in {
14 |     for (w <- wrdLike) assert(wordLike(w))
15 |   }
16 |   
17 |   it should "reject words < 3 chars, with non alpha chars, with no vowels or no consonants" in {
18 |     for (w <- nonWrdLike) assert(!wordLike(w))
19 |   }
20 |   
21 |   "containsWordLike" should "detect lines containing a wordLike" in {
22 |     for {
23 |       i <- 0 to nonWrdLike.size // index where we'll insert the wrdLike
24 |       w <- wrdLike
25 |     } {
26 |       val line = (nonWrdLike.take(i) ++ Seq(w) ++ nonWrdLike.drop(i)).mkString(" ")
27 |       assert(containsWordLike(line))
28 |     }
29 |   }
30 |   
31 |   it should  "reject lines with no wordLike" in {
32 |     for (line <- Seq("", nonWrdLike.mkString(" "))) assert(!containsWordLike(line))
33 |   }
34 |   
35 |   val longText = """
36 | Fiction House apparently made the decision to launch Planet Stories
37 | so quickly that there was little time for Reiss to obtain new stories,
38 | so he worked with Julius Schwartz and other authors' agents to fill the
39 | first issue. The results were unremarkable, but Reiss was energetic, and
40 | was able to improve the quality of fiction in succeeding issues, though
41 | he occasionally apologized to the readers for printing weak material.
42 | The magazine was exclusively focused on interplanetary adventures,
43 | often taking place in primitive societies that would now be regarded as
44 | "sword and sorcery" settings, and was aimed at a young readership; the
45 | result was a mixture of what became known as space opera and planetary
46 | romances—melodramatic tales of action and adventure on alien planets
47 | and in interplanetary space. Planet relied on a few authors to
48 | provide the bulk of its fiction in the early years, with Nelson Bond
49 | providing eight lead stories, some of them novels. Fourteen more were
50 | written by Ray Cummings and Ross Rocklynne; and Leigh Brackett was also
51 | a regular contributor, with seventeen stories in total published over
52 | the lifetime of the magazine.
53 | 
54 | The letter column in Planet was titled "The Vizigraph"; it was very
55 | active, with long letters from an engaged readership. It often printed
56 | letters from established writers, and from fans who would go on to become
57 | well known professionally: Damon Knight's letters are described by sf
58 | historian Mike Ashley as "legendary"; and Robert Silverberg commented
59 | in a letter in the Summer 1950 issue that Ray Bradbury "certainly gets
60 | some original ideas, if not good ones". The editors put a good
61 | deal of effort into keeping the letter column friendly and lively;
62 | contemporary writer and editor Robert Lowndes recalls that "Reiss was
63 | sincere and urbane; Wilbur [Peacock] enjoyed taking his coat off and
64 | being one of the crowd"."""
65 |   
66 |   "splitParagraphs" should "split paragraphs" in {
67 |     val paras = splitParagraphs(longText.split("\n"), 3, 200).toList
68 |     // log.debug(s"paras = $paras")
69 |     paras.map(x => (x._1, x._2)) should be(List((0, 18), (18, 30)))
70 |   }
71 | 
72 |   it should "further split long paragraphs" in {
73 |     val paras = splitParagraphs(longText.split("\n"), 3, 8).toList
74 |     // log.debug(s"paras = $paras")
75 |     paras.map(x => (x._1, x._2)) should be(List((0, 8), (8, 16), (16, 18), (18, 26), (26, 30)))
76 |   }
77 | }


--------------------------------------------------------------------------------
/dataFusion-search-service/README.md:
--------------------------------------------------------------------------------
 1 | # dataFusion-search-service
 2 | 
 3 | ## Introduction
 4 | 
 5 | This project provides [RESTful](https://en.wikipedia.org/wiki/Representational_state_transfer) web services based on [dataFusion-search](../dataFusion-search).
 6 | 
 7 | ## Build, Configuration, Running and Swagger Support
 8 | 
 9 | See the top level [README](../README.md). This will not run concurrently with the dataFusion-search CLI, unless they are configured to use different search indices, because Lucene takes an exclusive lock on its index.
10 | 
11 | 


--------------------------------------------------------------------------------
/dataFusion-search-service/build.sbt:
--------------------------------------------------------------------------------
 1 | name := "dataFusion-search-service"
 2 | 
 3 | libraryDependencies ++= Seq(
 4 |   "com.github.swagger-akka-http" %% "swagger-akka-http" % "0.9.1",
 5 |   "com.typesafe.akka" %% "akka-http-spray-json" % "10.0.7",
 6 |   "ch.megard" %% "akka-http-cors" % "0.2.1",
 7 |   "com.github.scopt" %% "scopt" % "3.5.0",
 8 |   "com.jsuereth" %% "scala-arm" % "2.0",
 9 |   "org.scalatest" %% "scalatest" % "3.0.0" % "test"
10 | )
11 | 
12 | com.github.retronym.SbtOneJar.oneJarSettings
13 | 
14 | mainClass in Compile := Some("au.csiro.data61.dataFusion.search.service.Main")
15 | 


--------------------------------------------------------------------------------
/dataFusion-search-service/src/main/resources/application.conf:
--------------------------------------------------------------------------------
1 | http = {
2 |   host = "0.0.0.0"
3 |   port = 8087
4 | 
5 |   host = ${?SEARCH_HTTP_HOST}
6 |   port = ${?SEARCH_HTTP_PORT}
7 | }


--------------------------------------------------------------------------------
/dataFusion-search-service/src/main/resources/logback.xml:
--------------------------------------------------------------------------------
 1 | <configuration>
 2 | 
 3 | <!--   <appender name="CONS" class="ch.qos.logback.core.ConsoleAppender"> -->
 4 | <!--     <encoder> -->
 5 | <!--       <pattern>%d{yyyy-MM-dd HH:mm:ss.SSS} [%thread] %-5level %logger{36} - %msg%n</pattern> -->
 6 | <!--     </encoder> -->
 7 | <!--   </appender> -->
 8 |   
 9 |   <appender name="FILE" class="ch.qos.logback.core.FileAppender">
10 |     <file>search-service.log</file>
11 |     <encoder>
12 |       <pattern>%d{yyyy-MM-dd HH:mm:ss.SSS} [%thread] %-5level %logger{36} - %msg%n</pattern>
13 |     </encoder>
14 |   </appender>
15 | 
16 | <!--   <logger name="au.csiro.data61" level="DEBUG" /> -->
17 |   
18 |   <root level="INFO">
19 |     <appender-ref ref="FILE" />
20 |   </root>
21 | </configuration>
22 | 


--------------------------------------------------------------------------------
/dataFusion-search-service/src/test/resources/logback-test.xml:
--------------------------------------------------------------------------------
 1 | <configuration>
 2 | 
 3 |   <appender name="CONS" class="ch.qos.logback.core.ConsoleAppender">
 4 |     <encoder>
 5 |       <pattern>%d{yyyy-MM-dd HH:mm:ss.SSS} [%thread] %-5level %logger{36} - %msg%n</pattern>
 6 |     </encoder>
 7 |   </appender>
 8 |   
 9 | <!--   <appender name="FILE" class="ch.qos.logback.core.FileAppender"> -->
10 | <!--     <file>search-service-test.log</file> -->
11 | <!--     <encoder> -->
12 | <!--       <pattern>%d{yyyy-MM-dd HH:mm:ss.SSS} [%thread] %-5level %logger{36} - %msg%n</pattern> -->
13 | <!--     </encoder> -->
14 | <!--   </appender> -->
15 | 
16 |   <logger name="au.csiro.data61" level="DEBUG" />
17 |   
18 |   <root level="INFO">
19 |     <appender-ref ref="CONS" />
20 |   </root>
21 | </configuration>
22 | 


--------------------------------------------------------------------------------
/dataFusion-search/3rd-party-licenses.md:
--------------------------------------------------------------------------------
 1 | # datafusion-search-licenses
 2 | 
 3 | Category | License | Dependency | Notes
 4 | --- | --- | --- | ---
 5 | Apache | [Apache 2](http://www.apache.org/licenses/LICENSE-2.0.txt) | io.spray # spray-json_2.12 # 1.3.3 | <notextile></notextile>
 6 | Apache | [Apache 2.0 License](http://www.apache.org/licenses/LICENSE-2.0.html) | com.typesafe.scala-logging # scala-logging_2.12 # 3.5.0 | <notextile></notextile>
 7 | Apache | [Apache License, Version 2.0](http://www.apache.org/licenses/LICENSE-2.0) | com.typesafe # config # 1.3.1 | <notextile></notextile>
 8 | Apache | [Apache License, Version 2.0](http://www.apache.org/licenses/LICENSE-2.0.txt) | commons-io # commons-io # 2.5 | <notextile></notextile>
 9 | Apache | [The Apache Software License, Version 2.0](http://www.apache.org/licenses/LICENSE-2.0.txt) | com.google.guava # guava # 18.0 | <notextile></notextile>
10 | Apache | [The Apache Software License, Version 2.0](http://www.apache.org/licenses/LICENSE-2.0.txt) | org.apache.lucene # lucene-analyzers-common # 7.0.1 | <notextile></notextile>
11 | Apache | [The Apache Software License, Version 2.0](http://www.apache.org/licenses/LICENSE-2.0.txt) | org.apache.lucene # lucene-core # 7.0.1 | <notextile></notextile>
12 | Apache | [The Apache Software License, Version 2.0](http://www.apache.org/licenses/LICENSE-2.0.txt) | org.apache.lucene # lucene-highlighter # 7.0.1 | <notextile></notextile>
13 | Apache | [The Apache Software License, Version 2.0](http://www.apache.org/licenses/LICENSE-2.0.txt) | org.apache.lucene # lucene-join # 7.0.1 | <notextile></notextile>
14 | Apache | [The Apache Software License, Version 2.0](http://www.apache.org/licenses/LICENSE-2.0.txt) | org.apache.lucene # lucene-memory # 7.0.1 | <notextile></notextile>
15 | Apache | [The Apache Software License, Version 2.0](http://www.apache.org/licenses/LICENSE-2.0.txt) | org.apache.lucene # lucene-queries # 7.0.1 | <notextile></notextile>
16 | Apache | [The Apache Software License, Version 2.0](http://www.apache.org/licenses/LICENSE-2.0.txt) | org.apache.lucene # lucene-queryparser # 7.0.1 | <notextile></notextile>
17 | Apache | [The Apache Software License, Version 2.0](http://www.apache.org/licenses/LICENSE-2.0.txt) | org.apache.lucene # lucene-sandbox # 7.0.1 | <notextile></notextile>
18 | Apache | [the Apache License, ASL Version 2.0](http://www.apache.org/licenses/LICENSE-2.0) | org.scalactic # scalactic_2.12 # 3.0.0 | <notextile></notextile>
19 | Apache | [the Apache License, ASL Version 2.0](http://www.apache.org/licenses/LICENSE-2.0) | org.scalatest # scalatest_2.12 # 3.0.0 | <notextile></notextile>
20 | BSD | [BSD 3-Clause](http://www.scala-lang.org/license.html) | org.scala-lang # scala-library # 2.12.2 | <notextile></notextile>
21 | BSD | [BSD 3-Clause](http://www.scala-lang.org/license.html) | org.scala-lang # scala-reflect # 2.12.2 | <notextile></notextile>
22 | BSD | [BSD 3-clause](http://opensource.org/licenses/BSD-3-Clause) | org.scala-lang.modules # scala-parser-combinators_2.12 # 1.0.4 | <notextile></notextile>
23 | BSD | [BSD 3-clause](http://opensource.org/licenses/BSD-3-Clause) | org.scala-lang.modules # scala-xml_2.12 # 1.0.5 | <notextile></notextile>
24 | BSD | [BSD-Style](http://www.opensource.org/licenses/bsd-license.php) | com.jsuereth # scala-arm_2.12 # 2.0 | <notextile></notextile>
25 | GPL | [GPL](https://www.gnu.org/licenses/gpl-3.0.en.html) | au.csiro.data61 # datafusion-common_2.12 # 1.1-SNAPSHOT | <notextile></notextile>
26 | LGPL | [EPL + GNU Lesser General Public License](http://logback.qos.ch/license.html) | ch.qos.logback # logback-classic # 1.2.3 | <notextile></notextile>
27 | LGPL | [EPL + GNU Lesser General Public License](http://logback.qos.ch/license.html) | ch.qos.logback # logback-core # 1.2.3 | <notextile></notextile>
28 | MIT | [MIT License](http://www.opensource.org/licenses/mit-license.php) | com.github.scopt # scopt_2.12 # 3.5.0 | <notextile></notextile>
29 | MIT | [MIT License](http://www.slf4j.org/license.html) | org.slf4j # slf4j-api # 1.7.25 | <notextile></notextile>
30 | 
31 | 


--------------------------------------------------------------------------------
/dataFusion-search/README.md:
--------------------------------------------------------------------------------
 1 | # dataFusion-search
 2 | 
 3 | ## Introduction
 4 | This project provides:
 5 | - a search library
 6 | - an indexer
 7 | - a multi-threaded CLI (command line interface) for high performance bulk searching for known entities
 8 | - other specialised command line tools (see --help)
 9 | 
10 | Search results are at the level of embedded document (e.g. a main document with embIdx = -1 or a specific embedded document with embIdx >= 0). Please see [Search Result JSON format](../dataFusion-common#search-result-json-format) for details of the output.
11 | 
12 | ## Indexing
13 | The `--index` CLI option creates the search index (at a location specified in [configuration](../README.md#configuration)). The input is in the [Document JSON format](../dataFusion-common#document-json-format) with the `content` and `embedded[].content` fields containing the text which is searched. The `meta` and `ner` data (again both main and embedded) is also separately indexed and can be searched using the [dataFusion-search-service](./dataFusion-search-service).
14 | 
15 | ## Search Strategy
16 | ### Tokenization and Punctuation
17 | Lucene's default `StandardTokenizer` removes punctuation, but as some organizations use punctuation as significant parts of their name this project uses Lucene's `WhitespaceTokenizer` and `LowerCaseFilter` with a custom `TrailingPunctuationFilter` to remove trailing commas, full stops etc. for a search which is case insensitive, but sensitive to non-trailing punctuation.
18 | ### Synonyms
19 | Lucene's `SynonymGraphFilter` is used to map synonyms specified in a file `synonyms.txt` (the location is specified in [configuration](../README.md#configuration)), initially set to map "proprietary" to "pty" and "limited" to "ltd", but can be updated by the user. The synonym mapping should be consistent for indexing and searching.
20 | ### Organizations
21 | A search hit must match all tokens in the query with tokens in the same order.
22 | ### People
23 | A search hit must match all tokens in the query, but the tokens may appear in any order.
24 | A phrase search for unordered terms (e.g. for PERSON|PERSON2) produces spurious matches where all terms are matched but not with the correct number of occurrences e.g. “Aaron H Aaron” matches “Aaron H H”. Fetching the text to check the number of occurrences would negatively impact the performance of the search, so this check is deferred to `dataFusion-util --hits` processing where the text is already available. Consequently the Search Result JSON (hits.json) contains the spurious matches, but they are filtered out from gaz.json.
25 | ### Scoring
26 | A query is assigned an [IDF](https://en.wikipedia.org/wiki/Tf%E2%80%93idf#Inverse_document_frequency) score  calculated using [Lucene’s formula](https://lucene.apache.org/core/7_1_0/core/org/apache/lucene/search/similarities/TFIDFSimilarity.html). If this score is below the threshold set by the `--minScore` CLI option (default 3.5) then the query is deemed to be insufficiently distiguishing and is skipped.
27 | ### Query Generation from CSV
28 | The `--searchCsv`  option generates queries from CSV data. If a type field is 'BUS' the record represents an organization, otherwise it represent a person.
29 | - People's names are expected to be segmented into 3 fields for the person's family, first given and other names.
30 |   - Where the 3 name fields for a person are non-blank a query is generated to search for all tokens in the name. The query and any resultant hits have `typ=PERSON`.
31 |   - Where the first and family name fields are non-blank (whether or not the other names field is non-blank) a query is generated to search for all tokens in these two fields. The query and any resultant hits have `typ=PERSON2`.
32 | - Organization names are in a single field. Where this contains at least 2 tokems a query is generated to search for all tokens in the name. The query and any resultant hits have `typ=ORGANIZATION`.
33 | - A numeric id field is carried through from the CSV to the query and the results, to facilitate integration with other systems.
34 | - Queries for the same name and `typ` are combined into a single query with multiple id values in `ExtRef.ids[]`.
35 | 
36 | ## Build, Configuration and Running
37 | 
38 | See the top level [README](../README.md).
39 | 
40 | 


--------------------------------------------------------------------------------
/dataFusion-search/build.sbt:
--------------------------------------------------------------------------------
 1 | name := "dataFusion-search"
 2 | 
 3 | libraryDependencies ++= Seq(
 4 |   "lucene-core",
 5 |   "lucene-analyzers-common",
 6 |   "lucene-queryparser",
 7 |   "lucene-highlighter"
 8 | ).map("org.apache.lucene" % _ % "7.0.1")
 9 | 
10 | libraryDependencies ++= Seq(
11 |   "com.google.guava" % "guava" % "18.0", // not "23.0", for compatability with search-service dependencies
12 |   "commons-io" % "commons-io" % "2.5",
13 |   "com.typesafe" % "config" % "1.3.1",
14 |   "com.github.scopt" %% "scopt" % "3.5.0",
15 |   "com.jsuereth" %% "scala-arm" % "2.0",
16 |   "org.scalatest" %% "scalatest" % "3.0.0" % "test"
17 | )
18 | 
19 | com.github.retronym.SbtOneJar.oneJarSettings
20 | 
21 | mainClass in Compile := Some("au.csiro.data61.dataFusion.search.Main")
22 | 


--------------------------------------------------------------------------------
/dataFusion-search/src/main/resources/application.conf:
--------------------------------------------------------------------------------
 1 | search = {
 2 |   // See: https://lucene.apache.org/core/6_6_0/analyzers-common/org/apache/lucene/analysis/synonym/SolrSynonymParser.html
 3 |   synonyms = "synonyms.txt"
 4 |   synonyms = ${?SEARCH_SYNONYMS}
 5 |   
 6 |   docIndex = "docIndex"
 7 |   docIndex = ${?SEARCH_DOC_INDEX}
 8 | 
 9 |   metaIndex = "metaIndex"
10 |   metaIndex = ${?SEARCH_META_INDEX}
11 | 
12 |   nerIndex = "nerIndex"
13 |   nerIndex = ${?SEARCH_NER_INDEX}
14 | }
15 | 


--------------------------------------------------------------------------------
/dataFusion-search/src/main/resources/logback.xml:
--------------------------------------------------------------------------------
 1 | <configuration>
 2 | 
 3 | <!--   <appender name="CONS" class="ch.qos.logback.core.ConsoleAppender"> -->
 4 | <!--     <encoder> -->
 5 | <!--       <pattern>%d{yyyy-MM-dd HH:mm:ss.SSS} [%thread] %-5level %logger{36} - %msg%n</pattern> -->
 6 | <!--     </encoder> -->
 7 | <!--   </appender> -->
 8 |   
 9 |   <appender name="FILE" class="ch.qos.logback.core.FileAppender">
10 |     <file>search.log</file>
11 |     <encoder>
12 |       <pattern>%d{yyyy-MM-dd HH:mm:ss.SSS} [%thread] %-5level %logger{36} - %msg%n</pattern>
13 |     </encoder>
14 |   </appender>
15 | 
16 | <!--   <logger name="au.csiro.data61" level="DEBUG" /> -->
17 |   
18 |   <root level="INFO">
19 |     <appender-ref ref="FILE" />
20 |   </root>
21 | </configuration>
22 | 


--------------------------------------------------------------------------------
/dataFusion-search/src/main/scala/au/csiro/data61/dataFusion/search/DocFreq.scala:
--------------------------------------------------------------------------------
  1 | package au.csiro.data61.dataFusion.search
  2 | 
  3 | import java.io.OutputStreamWriter
  4 | import java.nio.charset.Charset
  5 | 
  6 | import scala.io.Source
  7 | 
  8 | import org.apache.lucene.index.{ DirectoryReader, MultiFields }
  9 | 
 10 | import com.google.common.hash.{ BloomFilter, Funnels }
 11 | import com.typesafe.scalalogging.Logger
 12 | 
 13 | import DataFusionLucene.{ F_CONTENT, analyzer, docIndex }
 14 | import LuceneUtil.{ directory, termIter, tokenIter }
 15 | import Main.CliOption
 16 | import au.csiro.data61.dataFusion.common.Data.{ PosQuery, T_ORGANIZATION }
 17 | import au.csiro.data61.dataFusion.common.Data.ExtRef
 18 | import au.csiro.data61.dataFusion.common.Data.JsonProtocol.posQueryCodec
 19 | import au.csiro.data61.dataFusion.common.Timer
 20 | import resource.managed
 21 | import spray.json.{ pimpAny, pimpString }
 22 | 
 23 | object DocFreq {
 24 |   private val log = Logger(getClass)
 25 |   
 26 |   /**
 27 |    * Output docFreq,term
 28 |    */
 29 |   def writeDocFreqs(c: CliOption) = {
 30 |     for {
 31 |       r <- managed(DirectoryReader.open(directory(docIndex)))
 32 |       ti <- termIter(MultiFields.getFields(r).terms(F_CONTENT))
 33 |     } {
 34 |       println(s"${ti.docFreq},${ti.term.utf8ToString}")
 35 |     }
 36 |   }
 37 | 
 38 |   def loadTermFilter(expectedInsertions: Int) = {
 39 |     val timer = Timer()
 40 |     val termFilter = BloomFilter.create(Funnels.stringFunnel(Charset.forName("UTF-8")), expectedInsertions)
 41 |     var n = 0
 42 |     for {
 43 |       r <- managed(DirectoryReader.open(directory(docIndex)))
 44 |       ti <- termIter(MultiFields.getFields(r).terms(F_CONTENT)) // we could filter this: /^[A-Z](?:['A-Z-]*[A-Z])$/, but there are not too many without filtering
 45 |     } {
 46 |       termFilter put ti.term.utf8ToString
 47 |       n += 1
 48 |     }
 49 |     log.info(s"loadTermSet: $n terms loaded in ${timer.elapsedSecs} secs. Max expectedInsertions = $expectedInsertions")
 50 |     if (n > expectedInsertions) log.error(s"Exceeded expectedInsertions = $expectedInsertions")
 51 |     termFilter
 52 |   }
 53 |   
 54 |   /**
 55 |    * true iff termFilter mightContain all the tokens in query
 56 |    */
 57 |   def containsAllTokens(termFilter: BloomFilter[CharSequence], query: String) = {
 58 |     val tokens = tokenIter(analyzer, F_CONTENT, query).toList
 59 |     log.debug(s"containsAllTokens: analyzed tokens = ${tokens.toList}")
 60 |     tokens forall termFilter.mightContain // if false the filter definitely does not contain the term
 61 |   }
 62 |   
 63 |   def filterQuery(c: CliOption) = {
 64 |     val termFilter = loadTermFilter(c.maxTerms)
 65 |     for (w <- managed(new OutputStreamWriter(System.out, "UTF-8"))) {
 66 |       for (line <- Source.fromInputStream(System.in, "UTF-8").getLines) {
 67 |         val q = line.parseJson.convertTo[PosQuery]
 68 |         if (containsAllTokens(termFilter, q.extRef.name)) {
 69 |           w.write(line)
 70 |           w.write('\n')
 71 |         } else log.debug(s"filterQuery: not all tokens in index")
 72 |       }
 73 |     }
 74 |   }
 75 |   
 76 |   /**
 77 |    * read NER results, filter, write queries
 78 |    */
 79 |   def nerToQuery(c: CliOption) = {
 80 |     val rNonName = "[^A-Za-z.'-]".r
 81 |     val rBigSpace = " {2,}".r
 82 |     def clean(q: String) = {
 83 |       val q2 = rNonName.replaceAllIn(q, " ").trim
 84 |       rBigSpace.replaceAllIn(q2, " ")
 85 |     }
 86 |     
 87 |     val termFilter = loadTermFilter(c.maxTerms)
 88 |     for (w <- managed(new OutputStreamWriter(System.out, "UTF-8"))) {
 89 |       for (line <- Source.fromInputStream(System.in, "UTF-8").getLines) {
 90 |         val query = clean(line.parseJson.toString)
 91 |         if (query.length >= 6 && containsAllTokens(termFilter, query)) {
 92 |           val q = PosQuery(ExtRef(query, List.empty), T_ORGANIZATION)
 93 |           w.write(q.toJson.compactPrint)
 94 |           w.write('\n')
 95 |         } else log.debug(s"nerToQuery: shorter than 6 chars or not all tokens in index")
 96 |       }
 97 |     }
 98 |   }
 99 |   
100 | }


--------------------------------------------------------------------------------
/dataFusion-search/src/main/scala/au/csiro/data61/dataFusion/search/Indexer.scala:
--------------------------------------------------------------------------------
 1 | package au.csiro.data61.dataFusion.search
 2 | 
 3 | import scala.io.{ Codec, Source }
 4 | 
 5 | import org.apache.lucene.index.IndexWriter
 6 | 
 7 | import com.typesafe.config.ConfigFactory
 8 | import com.typesafe.scalalogging.Logger
 9 | 
10 | import DataFusionLucene.{ docIndex, metaIndex, nerIndex }
11 | import DataFusionLucene.DFIndexing.{ ldoc2doc, lmeta2doc, lner2doc, mkIndexer }
12 | import LuceneUtil.directory
13 | import Main.CliOption
14 | import au.csiro.data61.dataFusion.common.Data.{ Doc, EMB_IDX_MAIN, IdEmbIdx, LDoc, LMeta, LNer }
15 | import au.csiro.data61.dataFusion.common.Data.JsonProtocol.docFormat
16 | import au.csiro.data61.dataFusion.common.Parallel.doParallel
17 | import resource.managed
18 | import spray.json.pimpString
19 | 
20 | object Indexer {
21 |   private val log = Logger(getClass)
22 |   implicit val codec = Codec.UTF8
23 |   
24 |   def indexer(docIndexer: IndexWriter, metaIndexer: IndexWriter, nerIndexer: IndexWriter)(d: Doc): Unit = {
25 |     val idMain = IdEmbIdx(d.id, EMB_IDX_MAIN)
26 |     docIndexer.addDocument(LDoc(idMain, d.content.getOrElse(""), d.path))
27 |     for {
28 |       (k, v) <- d.meta
29 |     } metaIndexer.addDocument(LMeta(idMain, k, v))
30 |     for {
31 |       n <- d.ner
32 |     } nerIndexer.addDocument(LNer(idMain, n.posStr, n.posEnd, n.offStr, n.offEnd, n.text, n.typ, n.impl))
33 |     
34 |     for {
35 |       (e, embIdx) <- d.embedded.zipWithIndex
36 |     } {
37 |       val idEmb = IdEmbIdx(d.id, embIdx)
38 |       docIndexer.addDocument(LDoc(idEmb, e.content.getOrElse(""), d.path))
39 |       for {
40 |         (k, v) <- e.meta
41 |       } metaIndexer.addDocument(LMeta(idEmb, k, v))
42 |       for {
43 |         n <- e.ner
44 |       } nerIndexer.addDocument(LNer(idEmb, n.posStr, n.posEnd, n.offStr, n.offEnd, n.text, n.typ, n.impl))
45 |     }
46 |   }
47 |   
48 |   /**
49 |    * Reads JSON Doc's from stdin (one per line) and indexes them.
50 |    */
51 |   def run(c: CliOption) = {
52 |     val conf = ConfigFactory.load.getConfig("search")
53 | 
54 |     for {
55 |       docIndexer <- managed(mkIndexer(directory(docIndex)))
56 |       metaIndexer <- managed(mkIndexer(directory(metaIndex)))
57 |       nerIndexer <- managed(mkIndexer(directory(nerIndex)))
58 |     } {
59 |       val index: Doc => Unit = indexer(docIndexer, metaIndexer, nerIndexer)
60 |       
61 |       var count = 0
62 |       val in: Iterator[String] = Source.fromInputStream(System.in).getLines.map { json =>
63 |         count += 1
64 |         if (count % 1000 == 0) log.info(s"run.in: Queued $count docs ...")
65 |         json
66 |       }
67 |       def work(json: String): Boolean = {
68 |         index(json.parseJson.convertTo[Doc])
69 |         true
70 |       }
71 |       def out(more: Boolean): Unit = ()
72 |       
73 |       doParallel(in, work, out, "", false, c.numWorkers)
74 |       log.info(s"run: complete. Indexed $count docs")
75 |     }
76 |   }
77 | 
78 | }


--------------------------------------------------------------------------------
/dataFusion-search/src/main/scala/au/csiro/data61/dataFusion/search/LuceneUtil.scala:
--------------------------------------------------------------------------------
  1 | package au.csiro.data61.dataFusion.search
  2 | 
  3 | import java.io.{ Closeable, File }
  4 | 
  5 | import scala.util.Try
  6 | 
  7 | import org.apache.lucene.analysis.{ Analyzer, TokenStream }
  8 | import org.apache.lucene.analysis.tokenattributes.CharTermAttribute
  9 | import org.apache.lucene.document.Document
 10 | import org.apache.lucene.index.{ DirectoryReader, PostingsEnum, Terms, TermsEnum }
 11 | import org.apache.lucene.search.{ IndexSearcher, Query, ScoreDoc }
 12 | import org.apache.lucene.store.{ Directory, FSDirectory }
 13 | 
 14 | import com.typesafe.scalalogging.Logger
 15 | 
 16 | import au.csiro.data61.dataFusion.common.Timer
 17 | import org.apache.lucene.analysis.TokenFilter
 18 | 
 19 | 
 20 | 
 21 | /**
 22 |  * Generic Lucene indexing and searching.
 23 |  * 
 24 |  * simplified from: https://github.csiro.au/bac003/social-watch/blob/master/analytics/src/main/scala/org/t3as/socialWatch/analytics/LuceneUtil.scala
 25 |  */
 26 | object LuceneUtil {
 27 |   private val log = Logger(getClass)
 28 | 
 29 |   def tokenIter(ts: TokenStream): Iterator[String] = {
 30 |     ts.reset
 31 |     Iterator.continually {
 32 |       val more = ts.incrementToken
 33 |       if (!more) {
 34 |         ts.end
 35 |         ts.close
 36 |         // log.debug("tokenIter: TokenStream closed")
 37 |       }
 38 |       more
 39 |     }.takeWhile(identity).map(_ => ts.getAttribute(classOf[CharTermAttribute]).toString)
 40 |   }
 41 | 
 42 |   def tokenIter(analyzer: Analyzer, fieldName: String, text: String): Iterator[String]
 43 |     = tokenIter(analyzer.tokenStream(fieldName, text))
 44 |     
 45 |   def directory(indexDir: File) = FSDirectory.open(indexDir.toPath)
 46 |   
 47 |   /** unsafe - returns the same TermsEnum but repositioned each iteration */
 48 |   def termIter(terms: Terms): Iterator[TermsEnum] = {
 49 |     val ti = terms.iterator
 50 |     Iterator.continually(ti.next).takeWhile(_ != null).map(_ => ti)
 51 |   }
 52 |   
 53 |   /** unsafe - returns the same PostingsEnum but repositioned each iteration. Int value is position (index of term/word in field). */
 54 |   def postIter(p: PostingsEnum): Iterator[(Int, PostingsEnum)] = {
 55 |     p.nextDoc
 56 |     Iterator.range(0, p.freq).map { _ =>
 57 |       val pos = p.nextPosition
 58 |       (pos, p)
 59 |     }
 60 |   }
 61 |   
 62 |   /**
 63 |    * TokenFilter that removes all trailing chars after the last letter or digit.
 64 |    * Based on: org.apache.lucene.analysis.en.EnglishPossessiveFilter.
 65 |    */
 66 |   class TrailingPunctuationFilter(in: TokenStream) extends TokenFilter(in) {
 67 |     val termAtt = addAttribute(classOf[CharTermAttribute])
 68 |   
 69 |     override def incrementToken: Boolean = {
 70 |       if (!in.incrementToken()) {
 71 |         return false;
 72 |       }
 73 |       val buf = termAtt.buffer
 74 |       val len = termAtt.length
 75 |       
 76 |       val lastAlphaNum = {
 77 |         var last = -1
 78 |         var i = len - 1
 79 |         while (i >= 0 && last == -1) {
 80 |           if (Character.isLetterOrDigit(buf(i))) last = i
 81 |           i -= 1
 82 |         }
 83 |         last
 84 |       }
 85 |       
 86 |       if (lastAlphaNum != -1) termAtt.setLength(lastAlphaNum + 1)
 87 |       return true;
 88 |     }
 89 |   }
 90 |   
 91 |   class Searcher[Hit, Results](
 92 |     directory: Directory,
 93 |     toHit: (ScoreDoc, Document) => Hit, // convert score and map of fields to Hit
 94 |     toResults: (Int, Float, Seq[Hit], Option[String]) => Results // convert totalHits, elapsedSecs, Seq[Hit], Option[error] to Results
 95 |   ) extends Closeable {      
 96 |     val log = Logger(getClass)
 97 | 
 98 |     val searcher = open
 99 |     protected def open = new IndexSearcher(DirectoryReader.open(directory))
100 |     
101 |     log.debug(s"Searcher: numDocs = ${searcher.getIndexReader.numDocs}")
102 |         
103 |     def search(q: Query, numHits: Int = 20) = {
104 |       val timer = Timer()
105 |       
106 |       val result = for {
107 |         topDocs <- Try {
108 |           searcher.search(q, numHits)
109 |         }
110 |         hits <- Try {
111 |           topDocs.scoreDocs map { scoreDoc => toHit(scoreDoc, searcher.doc(scoreDoc.doc)) }
112 |         }
113 |       } yield toResults(topDocs.totalHits.toInt, timer.elapsedSecs.toFloat, hits, None)
114 |       
115 |       result.recover { case e => toResults(0, timer.elapsedSecs.toFloat, List(), Some(e.getMessage)) }.get
116 |     }
117 |     
118 |     def close = searcher.getIndexReader.close
119 |   }
120 | 
121 | }


--------------------------------------------------------------------------------
/dataFusion-search/src/main/scala/au/csiro/data61/dataFusion/search/Main.scala:
--------------------------------------------------------------------------------
 1 | package au.csiro.data61.dataFusion.search
 2 | 
 3 | import java.io.File
 4 | 
 5 | import scala.util.control.NonFatal
 6 | 
 7 | import com.typesafe.scalalogging.Logger
 8 | 
 9 | object Main {
10 |   private val log = Logger(getClass)
11 |   
12 |   case class CliOption(output: File, index: Boolean, searchJson: Boolean, searchCsv: Boolean, csvDelim: Char, csvFields: Seq[String], csvPersonWith2Names: Boolean, minScore: Float, docFreq: Boolean, export: Boolean, filterQueryOnly: Boolean, filterQuery: Boolean, maxTerms: Int, nerToQuery: Boolean, slop: Int, numWorkers: Int)
13 |   
14 |   val defaultCliOption = CliOption(new File("hits.json"), false, false, false, '\t', Seq("STRCTRD_FMLY_NM", "STRCTRD_GVN_NM", "STRCTRD_OTHR_GVN_NM", "SEX_CD", "USTRCTRD_FULL_NM", "CLNT_INTRNL_ID"), true, 3.5f, false, false, false, true, 10000000, false, 0, Runtime.getRuntime.availableProcessors)
15 |   
16 |   val parser = new scopt.OptionParser[CliOption]("search") {
17 |     head("search", "0.x")
18 |     opt[File]("output") action { (v, c) =>
19 |       c.copy(output = v)
20 |     } text (s"output JSON file, (default ${defaultCliOption.output.getPath})")
21 |     opt[Unit]("index") action { (_, c) =>
22 |       c.copy(index = true, numWorkers = Math.min(12, c.numWorkers))      // slower with more than 12 workers, if you really want more put --index before --numWorkers
23 |     } text (s"create Lucene indices from JSON input (default ${defaultCliOption.index})")
24 |     opt[Unit]("searchJson") action { (_, c) =>
25 |       c.copy(searchJson = true, numWorkers = Math.min(25, c.numWorkers)) // slower with more than 25 workers, if you really want more put --searchJson before --numWorkers
26 |     } text (s"search with JSON queries on stdin (default ${defaultCliOption.searchJson})")
27 |     opt[Unit]("searchCsv") action { (_, c) =>
28 |       c.copy(searchCsv = true, numWorkers = Math.min(25, c.numWorkers))  // slower with more than 25 workers, if you really want more put --searchCsv before --numWorkers
29 |     } text (s"search with CSV queries on stdin (default ${defaultCliOption.searchCsv})")
30 |     opt[String]("csvDelim") action { (v, c) =>
31 |       c.copy(csvDelim = v.headOption.getOrElse(defaultCliOption.csvDelim))
32 |     } text (s"CSV field delimeter (default ${if (defaultCliOption.csvDelim == '\t') "tab" else defaultCliOption.csvDelim.toString})")
33 |     opt[Seq[String]]("csvFields") action { (v, c) =>
34 |       c.copy(csvFields = v)
35 |     } validate { v =>
36 |       if (v.size == 6) success
37 |       else failure("6 field names are required")
38 |     } text (s"CSV field names (6) for person's family, first given and other names, record type ('BUS' for organization or gender? for a person), business name, id (default ${defaultCliOption.csvFields.toList})")
39 |     opt[Boolean]("csvPersonWith2Names") action { (v, c) =>
40 |       c.copy(csvPersonWith2Names = v)
41 |     } text (s"CSV used to generate 2 name (omitting middle name) searches for people in addition to 3 name search (default ${defaultCliOption.csvPersonWith2Names})")
42 |     opt[Double]("minScore") action { (v, c) =>
43 |       c.copy(minScore = v.toFloat)
44 |     } text (s"minScore queries with a (IDF) score below this are skipped, (default ${defaultCliOption.minScore})")
45 |     opt[Unit]("docFreq") action { (_, c) =>
46 |       c.copy(docFreq = true)
47 |     } text (s"output term document frequencies from index as CSV (default ${defaultCliOption.docFreq})")
48 |     opt[Unit]("export") action { (_, c) =>
49 |       c.copy(export = true)
50 |     } text (s"output the stored JSON for each doc (default ${defaultCliOption.export})")
51 |     opt[Unit]("filterQueryOnly") action { (_, c) =>
52 |       c.copy(filterQueryOnly = true)
53 |     } text (s"filter Query JSON from stdin to stdout, outputing only lines with all query terms most likely in the index (default ${defaultCliOption.filterQueryOnly})")
54 |     opt[Boolean]("filterQuery") action { (v, c) =>
55 |       c.copy(filterQuery = v)
56 |     } text (s"search CLI skips search if any query term is definitely not in the index (default ${defaultCliOption.filterQuery})")
57 |     opt[Int]("maxTerms") action { (v, c) =>
58 |       c.copy(maxTerms = v)
59 |     } text (s"maxTerms for Bloom Filter used with filterQuery, (default ${defaultCliOption.maxTerms})")
60 |     opt[Unit]("nerToQuery") action { (_, c) =>
61 |       c.copy(nerToQuery = true)
62 |     } text (s"filter JSON names from stdin to stdout, outputing queries only for lines with all specified query terms in the index (default ${defaultCliOption.filterQuery})")
63 |     opt[Int]("slop") action { (v, c) =>
64 |       c.copy(slop = v)
65 |     } text (s"slop for posQuery, (default ${defaultCliOption.slop})")
66 |     opt[Int]("numWorkers") action { (v, c) =>
67 |       c.copy(numWorkers = v)
68 |     } text (s"numWorkers for CLI queries, (default ${defaultCliOption.numWorkers} the number of CPUs)")
69 |     help("help") text ("prints this usage text")
70 |   }
71 |     
72 |   def main(args: Array[String]): Unit = {
73 |     try {
74 |       parser.parse(args, defaultCliOption).foreach { c => 
75 |         log.info(s"main: cliOptions = $c")
76 |         if (c.index) Indexer.run(c)
77 |         else if (c.docFreq) DocFreq.writeDocFreqs(c)
78 |         else if (c.filterQueryOnly) DocFreq.filterQuery(c)
79 |         else if (c.nerToQuery) DocFreq.nerToQuery(c)
80 |         else if (c.export) Search.cliExportDocIds(c)
81 |         else if (c.searchJson || c.searchCsv) Search.cliPosDocSearch(c)
82 |         else log.info("Nothing to do. Try --help")
83 |       }
84 |     } catch {
85 |       case NonFatal(e) => log.error("main:", e)
86 |     }
87 |   }
88 |   
89 | }


--------------------------------------------------------------------------------
/dataFusion-search/src/test/resources/logback-test.xml:
--------------------------------------------------------------------------------
 1 | <configuration>
 2 | 
 3 |   <appender name="CONS" class="ch.qos.logback.core.ConsoleAppender">
 4 |     <encoder>
 5 |       <pattern>%d{yyyy-MM-dd HH:mm:ss.SSS} [%thread] %-5level %logger{36} - %msg%n</pattern>
 6 |     </encoder>
 7 |   </appender>
 8 |   
 9 | <!--   <appender name="FILE" class="ch.qos.logback.core.FileAppender"> -->
10 | <!--     <file>search-test.log</file> -->
11 | <!--     <encoder> -->
12 | <!--       <pattern>%d{yyyy-MM-dd HH:mm:ss.SSS} [%thread] %-5level %logger{36} - %msg%n</pattern> -->
13 | <!--     </encoder> -->
14 | <!--   </appender> -->
15 | 
16 |   <logger name="au.csiro.data61" level="DEBUG" />
17 |   
18 |   <root level="INFO">
19 |     <appender-ref ref="CONS" />
20 |   </root>
21 | </configuration>
22 | 


--------------------------------------------------------------------------------
/dataFusion-search/src/test/scala/au/csiro/data61/dataFusion/search/DataFusionLuceneTest.scala:
--------------------------------------------------------------------------------
  1 | package au.csiro.data61.dataFusion.search
  2 | 
  3 | import org.apache.lucene.index.DirectoryReader
  4 | import org.apache.lucene.search.IndexSearcher
  5 | import org.apache.lucene.store.RAMDirectory
  6 | import org.scalatest.{ FlatSpec, Matchers }
  7 | 
  8 | import com.typesafe.scalalogging.Logger
  9 | 
 10 | import DataFusionLucene.{ F_CONTENT, analyzer, synonymAnalyzer }
 11 | import DataFusionLucene.DFIndexing.{ ldoc2doc, mkIndexer }
 12 | import DataFusionLucene.DFSearching.PosDocSearch.searchSpans
 13 | import LuceneUtil.tokenIter
 14 | import au.csiro.data61.dataFusion.common.Data.{ ExtRef, IdEmbIdx, LDoc, PosQuery, T_ORGANIZATION, T_PERSON }
 15 | 
 16 | class DataFusionLuceneTest extends FlatSpec with Matchers {
 17 |   val log = Logger(getClass)
 18 | 
 19 |   "SynonymAnalyzer" should "work" in {
 20 |     // depends on mapping: limited => ltd in synonyms.txt
 21 |     tokenIter(synonymAnalyzer, F_CONTENT, "AA AA Pty. Limited").mkString(" ") should be ("aa aa pty ltd")
 22 |   }
 23 |   
 24 |   val doc1 = "doc1: Sarah Jones\nAA AA Pty. Limited"
 25 |   val doc2 = "doc2: John Jones\nMs. AA\nMr. AA BB AA"
 26 |   val doc3 = "doc3: @ PTY. LIMITED is a subsidiary of $ PTY LIMITED"
 27 | 
 28 |   def mkTestSearcher = {
 29 |     val dir = new RAMDirectory
 30 |     val xer = mkIndexer(dir)
 31 |     for {
 32 |       (content, idx) <- Seq(doc1, doc2, doc3).zipWithIndex
 33 |     } xer.addDocument(LDoc(IdEmbIdx(idx, -1), content, "path"))
 34 |     xer.close
 35 |     new IndexSearcher(DirectoryReader.open(dir))
 36 |   }
 37 |   
 38 |   "SpanQuery" should "provide positions" in {
 39 |     val searcher = mkTestSearcher
 40 |     log.debug(s"numDocs = ${searcher.getIndexReader.numDocs}")
 41 |     
 42 |     {
 43 |       val q = PosQuery(ExtRef("AA AA Proprietary Ltd.", List(1L)), T_ORGANIZATION)
 44 |       val x = searchSpans(searcher, 0, q, 0.0f)
 45 |       log.debug(s"SpanQuery: x = $x")
 46 |       x.stats.totalHits should be(1)
 47 |       x.hits.size should be(1)
 48 |       x.hits.head.posInfos.size should be(1)
 49 |       val pi = x.hits.head.posInfos.head
 50 |       doc1.substring(pi.offStr, pi.offEnd) should be ("AA AA Pty. Limited")
 51 |     }
 52 |     
 53 |     // TODO: this is known to fail, "@ PTY LTD" and "$ PTY LTD" are tokenized to "PTY LTD"
 54 |     // We could use WhitespaceTokenizer with LuceneUtil.TrailingPunctuationFilter to fix,
 55 |     // but the current StandardTokenizer might be addressing issues we don't know about so this might cause other issues.
 56 |     {
 57 |       val q = PosQuery(ExtRef("$ Proprietary Ltd.", List(1L)), T_ORGANIZATION)
 58 |       val tokens = tokenIter(analyzer, F_CONTENT, q.extRef.name).toList
 59 |       log.debug(s"SpanQuery: tokens = $tokens")
 60 |       tokens.size should be(3)
 61 |       
 62 |       val x = searchSpans(searcher, 0, q, 0.0f)
 63 |       log.debug(s"SpanQuery: x = $x")
 64 |       x.stats.totalHits should be(1)
 65 |       x.hits.size should be(1)
 66 |       x.hits.head.posInfos.size should be(1)
 67 |       val pi = x.hits.head.posInfos.head
 68 |       doc3.substring(pi.offStr, pi.offEnd) should be ("$ PTY LIMITED")
 69 |     }
 70 |     
 71 |     {
 72 |       val q = PosQuery(ExtRef("Jones Sarah", List(2L)), T_PERSON)
 73 |       val x = searchSpans(searcher, 0, q, 0.0f)
 74 |       log.debug(s"SpanQuery: x = $x")
 75 |       x.stats.totalHits should be(1)
 76 |       x.hits.size should be(1)
 77 |       x.hits.head.posInfos.size should be(1)
 78 |       
 79 |       val pi = x.hits.head.posInfos.head
 80 |       doc1.substring(pi.offStr, pi.offEnd) should be ("Sarah Jones") 
 81 |     }
 82 |     
 83 |     {
 84 |       val q = PosQuery(ExtRef("AA AA", List(1L)), T_PERSON)
 85 |       val x = searchSpans(searcher, 0, q, 0.0f)
 86 |       log.debug(s"SpanQuery: x = $x")
 87 |       x.stats.totalHits should be(1)
 88 |       x.hits.size should be(1)
 89 |       x.hits.head.posInfos.size should be(1)
 90 |       val pi = x.hits.head.posInfos.head
 91 |       doc1.substring(pi.offStr, pi.offEnd) should be ("AA AA")
 92 |     }
 93 |     
 94 |     {
 95 |       val q = PosQuery(ExtRef("AA AA BB", List(1L)), T_PERSON)
 96 |       val x = searchSpans(searcher, 0, q, 0.0f)
 97 |       log.debug(s"SpanQuery: x = $x")
 98 |       x.stats.totalHits should be(1)
 99 |       x.hits.size should be(1)
100 |       x.hits.head.posInfos.size should be(1)
101 |       val pi = x.hits.head.posInfos.head
102 |       doc2.substring(pi.offStr, pi.offEnd) should be ("AA BB AA")
103 |     }
104 | 
105 |     
106 |     {
107 |       val q = PosQuery(ExtRef("AA AA CC", List(1L)), T_PERSON)
108 |       val x = searchSpans(searcher, 0, q, 0.0f)
109 |       log.debug(s"SpanQuery: x = $x")
110 |       x.stats.totalHits should be(0)
111 |     }
112 |     
113 | //    // TODO: this is known to fail, single term search is not working
114 | //    {
115 | //      val q = PosQuery(ExtRef("John", List(1L)), T_PERSON)
116 | //      val x = searchSpans(searcher, 0, q, 0.0f)
117 | //      log.debug(s"SpanQuery: x = $x")
118 | //      x.stats.totalHits should be(1)
119 | //      x.hits.size should be(1)
120 | //      x.hits.head.posInfos.size should be(1)
121 | //      val pi = x.hits.head.posInfos.head
122 | //      doc2.substring(pi.offStr, pi.offEnd) should be ("JOHN")
123 | //    }
124 |   }
125 | 
126 | }


--------------------------------------------------------------------------------
/dataFusion-search/src/test/scala/au/csiro/data61/dataFusion/search/JsonTest.scala:
--------------------------------------------------------------------------------
 1 | package au.csiro.data61.dataFusion.search
 2 | 
 3 | import org.scalatest.{ FlatSpec, Matchers }
 4 | 
 5 | import com.typesafe.scalalogging.Logger
 6 | 
 7 | import au.csiro.data61.dataFusion.common.Data.{ DHits, EMB_IDX_MAIN, IdEmbIdx, LDoc, Stats }
 8 | import au.csiro.data61.dataFusion.common.Data.JsonProtocol.dHitsCodec
 9 | import spray.json.{ pimpAny, pimpString }
10 | 
11 | class JsonTest extends FlatSpec with Matchers {
12 |   val log = Logger(getClass)
13 |   
14 |   val hits = DHits(Stats(1, 0.5f), List((12.3f, LDoc(IdEmbIdx(1, EMB_IDX_MAIN), "some content", "a/path"))), None)
15 |   
16 |   "DocHits" should "ser/deserialize" in {
17 |     val json = hits.toJson.compactPrint
18 |     log.debug(s"json = $json")
19 |     val d2 = json.parseJson.convertTo[DHits]
20 |     d2 should be(hits)
21 |   }
22 |   
23 | }


--------------------------------------------------------------------------------
/dataFusion-search/src/test/scala/au/csiro/data61/dataFusion/search/SearchTest.scala:
--------------------------------------------------------------------------------
 1 | package au.csiro.data61.dataFusion.search
 2 | 
 3 | import org.scalatest.{ FlatSpec, Matchers }
 4 | 
 5 | import com.typesafe.scalalogging.Logger
 6 | 
 7 | import Main.defaultCliOption
 8 | import Search.inCsv
 9 | import au.csiro.data61.dataFusion.common.Data.{ ExtRef, PosQuery, T_ORGANIZATION, T_PERSON, T_PERSON2 }
10 | 
11 | class SearchTest extends FlatSpec with Matchers {
12 |   val log = Logger(getClass)
13 |   
14 |   "inCsv" should "parse CSV" in {
15 |     val lines = Seq(
16 |       "Clnt_Intrnl_Id|SEX_CD|STRCTRD_FMLY_NM|STRCTRD_GVN_NM|STRCTRD_OTHR_GVN_NM|USTRCTRD_FULL_NM",
17 |       "1|M|BLOGGS|FREDERICK|A|",
18 |       "2|BUS||||COSMIC HOLDINGS INCORPORATED",
19 |     )
20 |     val qs = inCsv(defaultCliOption.copy(csvDelim = '|'), lines.iterator).toList
21 |     log.debug(s"qs = $qs")
22 |     val x1 = PosQuery(ExtRef("FREDERICK A BLOGGS", List(1L)), T_PERSON)
23 |     val x2 = PosQuery(ExtRef("FREDERICK BLOGGS", List(1L)), T_PERSON2)
24 |     val x3 = PosQuery(ExtRef("COSMIC HOLDINGS INCORPORATED", List(2L)), T_ORGANIZATION)
25 |     qs.toSet should be(Set(x1, x2, x3)) // inCsv is parallelized so results not ordered
26 |   }
27 |   
28 | }


--------------------------------------------------------------------------------
/dataFusion-search/synonyms.txt:
--------------------------------------------------------------------------------
1 | # Grammar: https://lucene.apache.org/core/6_6_0/analyzers-common/index.html?org/apache/lucene/analysis/synonym/SolrSynonymParser.html
2 | # e.g.   i-pod, i pod => ipod
3 | 
4 | proprietary => pty
5 | limited => ltd
6 | 


--------------------------------------------------------------------------------
/dataFusion-tika-service/README.md:
--------------------------------------------------------------------------------
 1 | # dataFusion-tika-service
 2 | 
 3 | ## Introduction
 4 | 
 5 | This project provides a [RESTful](https://en.wikipedia.org/wiki/Representational_state_transfer) web service based on dataFusion-tika.
 6 | 
 7 | ## Build, Configuration and Running
 8 | 
 9 | See the top level [README](../README.md).
10 | 
11 | Example:
12 | 
13 |     # run web service
14 |     java -jar target/scala-2.12/datafusion-tika_2.12-0.2-SNAPSHOT-one-jar.jar
15 |     # get swagger description (useful when loaded into Swagger UI)
16 |     curl http://localhost:9998/api-docs/swagger.json
17 |     # process a file
18 |     curl --upload-file src/test/resources/exampleData/PDF002.pdf http://localhost:9998/tika?path=PDF002.pdf
19 | 


--------------------------------------------------------------------------------
/dataFusion-tika-service/build.sbt:
--------------------------------------------------------------------------------
 1 | name := "dataFusion-tika-service"
 2 | 
 3 | // the one-jar classloader helpfully reports on conflicting classes (same package & name) from different jars
 4 | // (including whether the byte-code differs) and this has been used to set the following exclusions:
 5 | 
 6 | libraryDependencies ++= Seq(
 7 |   "com.github.swagger-akka-http" %% "swagger-akka-http" % "0.9.1" exclude("javax.ws.rs", "jsr311-api"), // replaced by javax.ws.rs-api
 8 |   "com.typesafe.akka" %% "akka-http-spray-json" % "10.0.7",
 9 |   "ch.megard" %% "akka-http-cors" % "0.2.1",
10 |   
11 |   "com.github.scopt" %% "scopt" % "3.7.0",
12 |   "com.jsuereth" %% "scala-arm" % "2.0",
13 |   "org.scalatest" %% "scalatest" % "3.0.4" % "test"
14 | )
15 |   
16 | com.github.retronym.SbtOneJar.oneJarSettings
17 | 
18 | mainClass in Compile := Some("au.csiro.data61.dataFusion.tika.service.Main")
19 | 


--------------------------------------------------------------------------------
/dataFusion-tika-service/src/main/resources/application.conf:
--------------------------------------------------------------------------------
1 | http = {
2 |   host = "0.0.0.0"
3 |   port = 9998
4 | 
5 |   host = ${?TIKA_HTTP_HOST}
6 |   port = ${?TIKA_HTTP_PORT}
7 | }


--------------------------------------------------------------------------------
/dataFusion-tika-service/src/main/resources/logback.xml:
--------------------------------------------------------------------------------
 1 | <configuration>
 2 | 
 3 |   <appender name="CONS" class="ch.qos.logback.core.ConsoleAppender">
 4 |     <encoder>
 5 |       <pattern>%d{yyyy-MM-dd HH:mm:ss.SSS} [%thread] %-5level %logger{36} - %msg%n</pattern>
 6 |     </encoder>
 7 |   </appender>
 8 |   
 9 |   <appender name="FILE" class="ch.qos.logback.core.FileAppender">
10 |     <file>tika-service.log</file>
11 |     <encoder>
12 |       <pattern>%d{yyyy-MM-dd HH:mm:ss.SSS} [%thread] %-5level %logger{36} - %msg%n</pattern>
13 |     </encoder>
14 |   </appender>
15 | 
16 | <!--   <logger name="au.csiro.data61" level="DEBUG" /> -->
17 |   
18 |   <root level="INFO">
19 |     <appender-ref ref="FILE" />
20 |   </root>
21 | </configuration>
22 | 


--------------------------------------------------------------------------------
/dataFusion-tika-service/src/test/resources/logback-test.xml:
--------------------------------------------------------------------------------
 1 | <configuration>
 2 | 
 3 |   <appender name="CONS" class="ch.qos.logback.core.ConsoleAppender">
 4 |     <encoder>
 5 |       <pattern>%d{yyyy-MM-dd HH:mm:ss.SSS} [%thread] %-5level %logger{36} - %msg%n</pattern>
 6 |     </encoder>
 7 |   </appender>
 8 |   
 9 |   <appender name="FILE" class="ch.qos.logback.core.FileAppender">
10 |     <file>tika-service-test.log</file>
11 |     <encoder>
12 |       <pattern>%d{yyyy-MM-dd HH:mm:ss.SSS} [%thread] %-5level %logger{36} - %msg%n</pattern>
13 |     </encoder>
14 |   </appender>
15 | 
16 |   <logger name="au.csiro.data61" level="DEBUG" />
17 | 
18 |   <!-- If main level is DEBUG use the following because these packages are too chatty:
19 |   <logger name="org.apache.pdfbox" level="INFO" />
20 |   <logger name="org.apache.fontbox" level="INFO" />
21 |   <logger name="com.optimaize.langdetect" level="INFO" />
22 |   -->
23 |   
24 |   <root level="INFO">
25 |     <appender-ref ref="CONS" />
26 |   </root>
27 | </configuration>
28 | 


--------------------------------------------------------------------------------
/dataFusion-tika/README.md:
--------------------------------------------------------------------------------
 1 | # dataFusion-tika
 2 | 
 3 | ## Introduction
 4 | 
 5 | This project provides a library and multi-threaded CLI (command line interface) for bulk processing. It provides:
 6 | 
 7 | - access to [Apache Tika](https://tika.apache.org/) customized to OCR images embedded in PDFs (including TIFF, JPEG2000 and JBIG2, which are not handled by Tika out-of-the-box);
 8 | - some cleaning and filtering of Tika metadata;
 9 | - augmentation of the metadata with the language of the text (`language-code` and `language-prob`) and a score for how closely the text matches a simple model for English sentences `english-score`; and
10 | - results in the [Document JSON format](../dataFusion-common#document-json-format).
11 | 
12 | ## Build, Configuration and Running
13 | 
14 | See the top level [README](../README.md).
15 | 
16 | Example:
17 | 
18 |     # CLI processing, with one file path per input line
19 |     ls -1 src/test/resources/exampleData/PDF00{2,3}* | \
20 |     java -jar target/scala-2.12/datafusion-tika_2.12-0.2-SNAPSHOT-one-jar.jar
21 | 


--------------------------------------------------------------------------------
/dataFusion-tika/build.sbt:
--------------------------------------------------------------------------------
 1 | name := "dataFusion-tika"
 2 | 
 3 | // the one-jar classloader helpfully reports on conflicting classes (same package & name) from different jars
 4 | // (including whether the byte-code differs) and this has been used to set the following exclusions:
 5 | // jj2000 is older fork of jai-imageio-jpeg2000
 6 | //  
 7 | // tika-parsers and sentiment-analysis-parser both contain org/apache/tika/parser/sentiment/analysis/SentimentParser
 8 | // I guess the tika-parsers one is newer but still relies on other code in sentiment-analysis-parser?
 9 | // We don't use it so exclude sentiment-analysis-parser to avoid the conflict.
10 | //
11 | // The junrar and jcip-annotations dependencies of tika-parsers have dubious licenses, so these are excluded.
12 | // An alternative jcip-annotations is used (no alternative for unrar)
13 |   
14 | 
15 | libraryDependencies ++= Seq(
16 |   "org.apache.tika" % "tika-parsers" % "1.16" exclude("edu.ucar", "jj2000") exclude("edu.usc.ir", "sentiment-analysis-parser") exclude("com.github.junrar", "junrar") exclude("net.jcip", "jcip-annotations"),
17 |   "com.github.stephenc.jcip" % "jcip-annotations" % "1.0-1",
18 |   "com.github.jai-imageio" % "jai-imageio-core" % "1.3.1",        // add PDFBox support for TIFF
19 |   "com.github.jai-imageio" % "jai-imageio-jpeg2000" % "1.3.0",    // add PDFBox support for jpeg2000
20 |   "com.levigo.jbig2" % "levigo-jbig2-imageio" % "2.0",            // add PDFBox support for jbig2
21 |   "org.xerial" % "sqlite-jdbc" % "3.19.3",                        // add to 'parse' sqlite files and embedded files
22 |   "com.optimaize.languagedetector" % "language-detector" % "0.6", // tika-langdetect-1.15 dependency is 0.5, but we use language-detector directly, not via tika-langdetect
23 |   "com.typesafe" % "config" % "1.3.1",
24 |   "com.github.scopt" %% "scopt" % "3.7.0",
25 |   "com.jsuereth" %% "scala-arm" % "2.0",
26 |   "org.scalatest" %% "scalatest" % "3.0.4" % "test"
27 | )
28 |   
29 | com.github.retronym.SbtOneJar.oneJarSettings
30 | 
31 | mainClass in Compile := Some("au.csiro.data61.dataFusion.tika.Main")
32 | 


--------------------------------------------------------------------------------
/dataFusion-tika/src/main/resources/META-INF/services/javax.imageio.spi.ImageReaderSpi:
--------------------------------------------------------------------------------
 1 | # from jai-imageio-core for TIFF support
 2 | 
 3 | #com.github.jaiimageio.impl.plugins.jpeg.CLibJPEGImageReaderSpi
 4 | #com.github.jaiimageio.impl.plugins.png.CLibPNGImageReaderSpi
 5 | #com.github.jaiimageio.impl.plugins.jpeg2000.J2KImageReaderSpi
 6 | #com.github.jaiimageio.impl.plugins.jpeg2000.J2KImageReaderCodecLibSpi
 7 | com.github.jaiimageio.impl.plugins.wbmp.WBMPImageReaderSpi
 8 | com.github.jaiimageio.impl.plugins.bmp.BMPImageReaderSpi
 9 | com.github.jaiimageio.impl.plugins.pcx.PCXImageReaderSpi
10 | com.github.jaiimageio.impl.plugins.pnm.PNMImageReaderSpi
11 | com.github.jaiimageio.impl.plugins.raw.RawImageReaderSpi
12 | com.github.jaiimageio.impl.plugins.tiff.TIFFImageReaderSpi
13 | 
14 | 
15 | # from jai-imageio-jpeg2000 for jpeg2000 support
16 | 
17 | com.github.jaiimageio.jpeg2000.impl.J2KImageReaderSpi
18 | 
19 | 
20 | # from levigo-jbig2-imageio for jbig2 support
21 | 
22 | com.levigo.jbig2.JBIG2ImageReaderSpi
23 | 


--------------------------------------------------------------------------------
/dataFusion-tika/src/main/resources/META-INF/services/javax.imageio.spi.ImageWriterSpi:
--------------------------------------------------------------------------------
 1 | # from jai-imageio-core for TIFF support
 2 | 
 3 | #com.github.jaiimageio.impl.plugins.jpeg.CLibJPEGImageWriterSpi
 4 | #com.github.jaiimageio.impl.plugins.png.CLibPNGImageWriterSpi
 5 | #com.github.jaiimageio.impl.plugins.jpeg2000.J2KImageWriterSpi
 6 | #com.github.jaiimageio.impl.plugins.jpeg2000.J2KImageWriterCodecLibSpi
 7 | com.github.jaiimageio.impl.plugins.wbmp.WBMPImageWriterSpi
 8 | com.github.jaiimageio.impl.plugins.bmp.BMPImageWriterSpi
 9 | com.github.jaiimageio.impl.plugins.gif.GIFImageWriterSpi
10 | com.github.jaiimageio.impl.plugins.pcx.PCXImageWriterSpi
11 | com.github.jaiimageio.impl.plugins.pnm.PNMImageWriterSpi
12 | com.github.jaiimageio.impl.plugins.raw.RawImageWriterSpi
13 | com.github.jaiimageio.impl.plugins.tiff.TIFFImageWriterSpi
14 | 
15 | 
16 | # from jai-imageio-jpeg2000 for jpeg2000 support
17 | 
18 | com.github.jaiimageio.jpeg2000.impl.J2KImageWriterSpi
19 | 
20 | # no jbig2 support
21 | 


--------------------------------------------------------------------------------
/dataFusion-tika/src/main/resources/application.conf:
--------------------------------------------------------------------------------
1 | tika {
2 |   // switch to command line args
3 |   // timeout = 600
4 |   // timeout = ${?TIKA_TIMEOUT}
5 | }
6 | 


--------------------------------------------------------------------------------
/dataFusion-tika/src/main/resources/logback.xml:
--------------------------------------------------------------------------------
 1 | <configuration>
 2 | 
 3 |   <appender name="CONS" class="ch.qos.logback.core.ConsoleAppender">
 4 |     <encoder>
 5 |       <pattern>%d{yyyy-MM-dd HH:mm:ss.SSS} [%thread] %-5level %logger{36} - %msg%n</pattern>
 6 |     </encoder>
 7 |   </appender>
 8 |   
 9 |   <appender name="FILE" class="ch.qos.logback.core.FileAppender">
10 |     <file>tika.log</file>
11 |     <encoder>
12 |       <pattern>%d{yyyy-MM-dd HH:mm:ss.SSS} [%thread] %-5level %logger{36} - %msg%n</pattern>
13 |     </encoder>
14 |   </appender>
15 | 
16 |   <!--   <logger name="au.csiro.data61" level="DEBUG" /> -->
17 |   
18 |   <!-- too chatty at WARN -->
19 |   <logger name="org.apache.pdfbox.pdmodel.font" level="ERROR" />
20 |   <logger name="org.apache.fontbox.ttf" level="ERROR" />
21 |   
22 |   <root level="INFO">
23 |     <appender-ref ref="FILE" />
24 |   </root>
25 | </configuration>
26 | 


--------------------------------------------------------------------------------
/dataFusion-tika/src/main/resources/org/apache/tika/parser/ocr/TesseractOCRConfig.properties:
--------------------------------------------------------------------------------
 1 | #  Licensed to the Apache Software Foundation (ASF) under one or more
 2 | #  contributor license agreements.  See the NOTICE file distributed with
 3 | #  this work for additional information regarding copyright ownership.
 4 | #  The ASF licenses this file to You under the Apache License, Version 2.0
 5 | #  (the "License"); you may not use this file except in compliance with
 6 | #  the License.  You may obtain a copy of the License at
 7 | #
 8 | #       http://www.apache.org/licenses/LICENSE-2.0
 9 | #
10 | #  Unless required by applicable law or agreed to in writing, software
11 | #  distributed under the License is distributed on an "AS IS" BASIS,
12 | #  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | #  See the License for the specific language governing permissions and
14 | #  limitations under the License.
15 | 
16 | # Tesseract properties
17 | tesseractPath=
18 | language=eng
19 | pageSegMode=1
20 | maxFileSizeToOcr=2147483647
21 | minFileSizeToOcr=1024
22 | timeout=300
23 | #txt or hocr
24 | outputType=txt
25 | preserveInterwordSpacing=true
26 | 
27 | # properties for image processing
28 | # to enable processing, set enableImageProcessing to 1
29 | enableImageProcessing=1
30 | ImageMagickPath=
31 | density=300
32 | depth=4
33 | colorspace=gray
34 | filter=triangle
35 | resize=200


--------------------------------------------------------------------------------
/dataFusion-tika/src/main/resources/org/apache/tika/parser/ocr/rotation.py:
--------------------------------------------------------------------------------
 1 | """
 2 |  Licensed to the Apache Software Foundation (ASF) under one or more
 3 |  contributor license agreements.  See the NOTICE file distributed with
 4 |  this work for additional information regarding copyright ownership.
 5 |  The ASF licenses this file to You under the Apache License, Version 2.0
 6 |  (the "License"); you may not use this file except in compliance with
 7 |  the License.  You may obtain a copy of the License at
 8 | 
 9 |      http://www.apache.org/licenses/LICENSE-2.0
10 | 
11 |  Unless required by applicable law or agreed to in writing, software
12 |  distributed under the License is distributed on an "AS IS" BASIS,
13 |  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 |  See the License for the specific language governing permissions and
15 |  limitations under the License.
16 | """
17 | 
18 | from __future__ import division, print_function
19 | import numpy
20 | from skimage.transform import radon
21 | from PIL import Image
22 | from numpy import asarray, mean, array, blackman
23 | from numpy.fft import rfft
24 | import matplotlib
25 | matplotlib.use("Agg")
26 | import matplotlib.pyplot as plt
27 | from matplotlib.mlab import rms_flat
28 | 
29 | import sys
30 | import getopt
31 | 
32 | def main(argv):
33 |         filename = ''
34 | 
35 |         if len(sys.argv) < 3:
36 |                 print('Usage: rotation.py -f <filename>')
37 |                 sys.exit()
38 |         try:
39 |           opts, args = getopt.getopt(argv,"hf:",["file="])
40 |         except getopt.GetoptError:
41 |           print('rotation.py -f <filename>')
42 |           sys.exit(2)
43 |         for opt, arg in opts:
44 |           if opt == '-h':
45 |              print('Usage: rotation.py -f <filename>')
46 |              sys.exit()
47 |           elif opt in ("-f", "--file"):
48 |              filename = arg
49 | 
50 |         try:
51 |          from parabolic import parabolic
52 | 
53 |          def argmax(x):
54 |              return parabolic(x, numpy.argmax(x))[0]
55 |         except ImportError:
56 |          from numpy import argmax
57 | 
58 |         # Load file, converting to grayscale
59 |         I = asarray(Image.open(filename).convert('L'))
60 |         I = I - mean(I)  # Demean; make the brightness extend above and below zero
61 | 
62 |         # Do the radon transform and display the result
63 |         sinogram = radon(I)
64 | 
65 |         # Find the RMS value of each row and find "busiest" rotation,
66 |         # where the transform is lined up perfectly with the alternating dark
67 |         # text and white lines
68 |         r = array([rms_flat(line) for line in sinogram.transpose()])
69 |         rotation = argmax(r)
70 | 
71 |         print('{:.2f}'.format(-(90-rotation)))
72 | 
73 | if __name__ == "__main__":
74 |         main(sys.argv[1:])
75 |         # print('{:.2f}'.format(0))
76 | 


--------------------------------------------------------------------------------
/dataFusion-tika/src/main/resources/org/apache/tika/parser/pdf/PDFParser.properties:
--------------------------------------------------------------------------------
 1 | #  Licensed to the Apache Software Foundation (ASF) under one or more
 2 | #  contributor license agreements.  See the NOTICE file distributed with
 3 | #  this work for additional information regarding copyright ownership.
 4 | #  The ASF licenses this file to You under the Apache License, Version 2.0
 5 | #  (the "License"); you may not use this file except in compliance with
 6 | #  the License.  You may obtain a copy of the License at
 7 | #
 8 | #       http://www.apache.org/licenses/LICENSE-2.0
 9 | #
10 | #  Unless required by applicable law or agreed to in writing, software
11 | #  distributed under the License is distributed on an "AS IS" BASIS,
12 | #  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | #  See the License for the specific language governing permissions and
14 | #  limitations under the License.
15 | 
16 | enableAutoSpace true
17 | extractAnnotationText true
18 | sortByPosition	false
19 | suppressDuplicateOverlappingText	false
20 | extractAcroFormContent	true
21 | extractInlineImages true
22 | extractUniqueInlineImagesOnly true
23 | checkExtractAccessPermission false
24 | allowExtractionForAccessibility true
25 | ifXFAExtractOnlyXFA false
26 | catchIntermediateIOExceptions true
27 | #options: no_ocr, ocr_only, ocr_and_text_extraction
28 | ocrStrategy no_ocr
29 | #dots per inch for the ocr rendering of the page image
30 | ocrDPI 300
31 | #if you request tif, make sure you have imageio jars on your classpath!
32 | ocrImageFormatName png
33 | #options: argb, binary, gray, rgb
34 | ocrImageType gray
35 | #scale to use when rendering a page image for OCR
36 | ocrImageScale 2.0
37 | 


--------------------------------------------------------------------------------
/dataFusion-tika/src/main/scala/au/csiro/data61/dataFusion/tika/LangDetect.scala:
--------------------------------------------------------------------------------
 1 | package au.csiro.data61.dataFusion.tika
 2 | 
 3 | import com.optimaize.langdetect.{ LanguageDetector, LanguageDetectorBuilder }
 4 | import com.optimaize.langdetect.ngram.NgramExtractors
 5 | import com.optimaize.langdetect.profiles.LanguageProfileReader
 6 | import com.optimaize.langdetect.text.CommonTextObjectFactories
 7 | 
 8 | object LangDetect {
 9 |   case class Lang(lang: String, prob: Float)
10 |   
11 |   val languageProfiles = new LanguageProfileReader().readAllBuiltIn
12 |   val languageDetector: LanguageDetector = LanguageDetectorBuilder.create(NgramExtractors.standard).withProfiles(languageProfiles).build
13 |   val textObjectFactory = CommonTextObjectFactories.forDetectingOnLargeText
14 |   
15 |   def headOption[T](jl: java.util.List[T]): Option[T] = if (jl.isEmpty) None else Some(jl.get(0))
16 | 
17 |   def lang(text: String): Option[Lang] = {
18 |     headOption(languageDetector.getProbabilities(textObjectFactory.forText(text)))
19 |       .map(l => Lang(l.getLocale.getLanguage, l.getProbability.toFloat))
20 |   }
21 | }
22 |   
23 | 


--------------------------------------------------------------------------------
/dataFusion-tika/src/test/resources/exampleData/AAA.pptx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/data61/dataFusion/4b766ae65254eed6c5a095f3bb8b6b42b9500a9c/dataFusion-tika/src/test/resources/exampleData/AAA.pptx


--------------------------------------------------------------------------------
/dataFusion-tika/src/test/resources/exampleData/Email001.msg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/data61/dataFusion/4b766ae65254eed6c5a095f3bb8b6b42b9500a9c/dataFusion-tika/src/test/resources/exampleData/Email001.msg


--------------------------------------------------------------------------------
/dataFusion-tika/src/test/resources/exampleData/PDF001.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/data61/dataFusion/4b766ae65254eed6c5a095f3bb8b6b42b9500a9c/dataFusion-tika/src/test/resources/exampleData/PDF001.pdf


--------------------------------------------------------------------------------
/dataFusion-tika/src/test/resources/exampleData/PDF002.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/data61/dataFusion/4b766ae65254eed6c5a095f3bb8b6b42b9500a9c/dataFusion-tika/src/test/resources/exampleData/PDF002.pdf


--------------------------------------------------------------------------------
/dataFusion-tika/src/test/resources/exampleData/PDF003.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/data61/dataFusion/4b766ae65254eed6c5a095f3bb8b6b42b9500a9c/dataFusion-tika/src/test/resources/exampleData/PDF003.pdf


--------------------------------------------------------------------------------
/dataFusion-tika/src/test/resources/exampleData/PDF004.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/data61/dataFusion/4b766ae65254eed6c5a095f3bb8b6b42b9500a9c/dataFusion-tika/src/test/resources/exampleData/PDF004.pdf


--------------------------------------------------------------------------------
/dataFusion-tika/src/test/resources/exampleData/README.txt:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/data61/dataFusion/4b766ae65254eed6c5a095f3bb8b6b42b9500a9c/dataFusion-tika/src/test/resources/exampleData/README.txt


--------------------------------------------------------------------------------
/dataFusion-tika/src/test/resources/exampleData/TIF001.tif:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/data61/dataFusion/4b766ae65254eed6c5a095f3bb8b6b42b9500a9c/dataFusion-tika/src/test/resources/exampleData/TIF001.tif


--------------------------------------------------------------------------------
/dataFusion-tika/src/test/resources/exampleData/TIF002.tif:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/data61/dataFusion/4b766ae65254eed6c5a095f3bb8b6b42b9500a9c/dataFusion-tika/src/test/resources/exampleData/TIF002.tif


--------------------------------------------------------------------------------
/dataFusion-tika/src/test/resources/exampleData/TIF003.tif:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/data61/dataFusion/4b766ae65254eed6c5a095f3bb8b6b42b9500a9c/dataFusion-tika/src/test/resources/exampleData/TIF003.tif


--------------------------------------------------------------------------------
/dataFusion-tika/src/test/resources/exampleData/Thumbs.db:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/data61/dataFusion/4b766ae65254eed6c5a095f3bb8b6b42b9500a9c/dataFusion-tika/src/test/resources/exampleData/Thumbs.db


--------------------------------------------------------------------------------
/dataFusion-tika/src/test/resources/exampleData/data-prob-2-12.XLS:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/data61/dataFusion/4b766ae65254eed6c5a095f3bb8b6b42b9500a9c/dataFusion-tika/src/test/resources/exampleData/data-prob-2-12.XLS


--------------------------------------------------------------------------------
/dataFusion-tika/src/test/resources/exampleData/doc001.doc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/data61/dataFusion/4b766ae65254eed6c5a095f3bb8b6b42b9500a9c/dataFusion-tika/src/test/resources/exampleData/doc001.doc


--------------------------------------------------------------------------------
/dataFusion-tika/src/test/resources/exampleData/doc002.doc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/data61/dataFusion/4b766ae65254eed6c5a095f3bb8b6b42b9500a9c/dataFusion-tika/src/test/resources/exampleData/doc002.doc


--------------------------------------------------------------------------------
/dataFusion-tika/src/test/resources/exampleData/html001.html:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/data61/dataFusion/4b766ae65254eed6c5a095f3bb8b6b42b9500a9c/dataFusion-tika/src/test/resources/exampleData/html001.html


--------------------------------------------------------------------------------
/dataFusion-tika/src/test/resources/exampleData/image001.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/data61/dataFusion/4b766ae65254eed6c5a095f3bb8b6b42b9500a9c/dataFusion-tika/src/test/resources/exampleData/image001.png


--------------------------------------------------------------------------------
/dataFusion-tika/src/test/resources/exampleData/image002.gif:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/data61/dataFusion/4b766ae65254eed6c5a095f3bb8b6b42b9500a9c/dataFusion-tika/src/test/resources/exampleData/image002.gif


--------------------------------------------------------------------------------
/dataFusion-tika/src/test/resources/exampleData/image003.jpeg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/data61/dataFusion/4b766ae65254eed6c5a095f3bb8b6b42b9500a9c/dataFusion-tika/src/test/resources/exampleData/image003.jpeg


--------------------------------------------------------------------------------
/dataFusion-tika/src/test/resources/exampleData/image004.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/data61/dataFusion/4b766ae65254eed6c5a095f3bb8b6b42b9500a9c/dataFusion-tika/src/test/resources/exampleData/image004.png


--------------------------------------------------------------------------------
/dataFusion-tika/src/test/resources/exampleData/xls001.xls:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/data61/dataFusion/4b766ae65254eed6c5a095f3bb8b6b42b9500a9c/dataFusion-tika/src/test/resources/exampleData/xls001.xls


--------------------------------------------------------------------------------
/dataFusion-tika/src/test/resources/logback-test.xml:
--------------------------------------------------------------------------------
 1 | <configuration>
 2 | 
 3 |   <appender name="CONS" class="ch.qos.logback.core.ConsoleAppender">
 4 |     <encoder>
 5 |       <pattern>%d{yyyy-MM-dd HH:mm:ss.SSS} [%thread] %-5level %logger{36} - %msg%n</pattern>
 6 |     </encoder>
 7 |   </appender>
 8 |   
 9 |   <appender name="FILE" class="ch.qos.logback.core.FileAppender">
10 |     <file>tika-test.log</file>
11 |     <encoder>
12 |       <pattern>%d{yyyy-MM-dd HH:mm:ss.SSS} [%thread] %-5level %logger{36} - %msg%n</pattern>
13 |     </encoder>
14 |   </appender>
15 | 
16 |   <logger name="au.csiro.data61" level="DEBUG" />
17 | 
18 |   <!-- If main level is DEBUG use the following because these packages are too chatty:
19 |   <logger name="org.apache.pdfbox" level="INFO" />
20 |   <logger name="org.apache.fontbox" level="INFO" />
21 |   <logger name="com.optimaize.langdetect" level="INFO" />
22 |   -->
23 |   
24 |   <root level="INFO">
25 |     <appender-ref ref="CONS" />
26 |   </root>
27 | </configuration>
28 | 


--------------------------------------------------------------------------------
/dataFusion-tika/src/test/scala/au/csiro/data61/dataFusion/tika/TikaTest.scala:
--------------------------------------------------------------------------------
 1 | package au.csiro.data61.dataFusion.tika
 2 | 
 3 | import org.scalatest.{ FlatSpec, Matchers }
 4 | 
 5 | import com.typesafe.scalalogging.Logger
 6 | 
 7 | class TikaTest extends FlatSpec with Matchers {
 8 |   private val log = Logger(getClass)
 9 |   val tikaUtil = new TikaUtil(Main.defaultCliOption)
10 |   
11 |   "Tika" should "extract 1 page of PDF" in {
12 |     val path = "/exampleData/PDF002.pdf" // born digital, has logo image with no text
13 |     val docIn = tikaUtil.tika(getClass.getResourceAsStream(path), path, 0L)
14 |     // log.debug(s"docIn = ${docIn}")
15 |     docIn.content.map(_.size).getOrElse(0) > 100 should be(true) // born digital text
16 |     docIn.embedded.size should be(1) // has 1 embedded doc - the logo
17 | 
18 | //    log.debug(s"content = ${docIn.embedded(0).content}")
19 | //    docIn.embedded(0).content.isDefined should be(false) // for which we get no text
20 |     // we got content = None with tesseract3 but Some with tesseract4, so commented out this bit
21 |   }
22 |   
23 |   it should "extract 5 pages of PDF" in {
24 |     val path = "/exampleData/PDF003.pdf" // scanned doc
25 |     val docIn = tikaUtil.tika(getClass.getResourceAsStream(path), path, 0L)
26 |     // log.debug(s"docIn = ${docIn}")
27 |     docIn.content.map(_.size).getOrElse(0) > 100 should be(true) // text OCR by scanner
28 |     docIn.embedded.size should be(5) // 5 embedded page images
29 |     docIn.embedded.foreach(_.content.map(_.size).getOrElse(0) > 100 should be(true)) // tesseract got text from each page
30 |   }
31 |   
32 |   it should "extract from good Excel" in {
33 |     val path = "/exampleData/xls001.xls"
34 |     val d = tikaUtil.tika(getClass.getResourceAsStream(path), path, 0L)
35 |     // log.debug(s"d = $d")
36 |     d.content.get.contains("Principality of Liechtenstein") should be(true)
37 |     d.meta.get("Content-Type") should be(Some("application/vnd.ms-excel"))
38 |   }
39 |   
40 |   it should "convert good Excel to opendocument.spreadsheet (only when explicitly asked to) and extract" in {
41 |     val path = "/exampleData/xls001.xls"
42 |     val d = tikaUtil.convertAndParseDoc(getClass.getResourceAsStream(path), path, 0L)
43 |     // log.debug(s"d = $d")
44 |     d.content.get.contains("Principality of Liechtenstein") should be(true)
45 |     d.meta.get("Content-Type") should be(Some("application/vnd.oasis.opendocument.spreadsheet"))
46 |   }
47 |     
48 |   it should "convert bad Excel to opendocument.spreadsheet (when not explicitly asked to) and extract" in {
49 |     // test Excel file is attachment from: https://bz.apache.org/bugzilla/show_bug.cgi?id=57104
50 |     val path = "/exampleData/data-prob-2-12.XLS"
51 |     val d = tikaUtil.tika(getClass.getResourceAsStream(path), path, 0L)
52 |     // log.debug(s"d = $d")
53 |     d.content.get.contains("562.03") should be(true)
54 |     d.meta.get("Content-Type") should be(Some("application/vnd.oasis.opendocument.spreadsheet"))
55 |   }
56 |   
57 | }


--------------------------------------------------------------------------------
/dataFusion-util/3rd-party-licenses.md:
--------------------------------------------------------------------------------
 1 | # datafusion-util-licenses
 2 | 
 3 | Category | License | Dependency | Notes
 4 | --- | --- | --- | ---
 5 | Apache | [Apache 2](http://www.apache.org/licenses/LICENSE-2.0.txt) | io.spray # spray-json_2.12 # 1.3.3 | <notextile></notextile>
 6 | Apache | [Apache 2.0 License](http://www.apache.org/licenses/LICENSE-2.0.html) | com.typesafe.scala-logging # scala-logging_2.12 # 3.5.0 | <notextile></notextile>
 7 | Apache | [Apache License, Version 2.0](http://www.apache.org/licenses/LICENSE-2.0) | com.typesafe # config # 1.3.1 | <notextile></notextile>
 8 | Apache | [Apache License, Version 2.0](http://www.apache.org/licenses/LICENSE-2.0.txt) | commons-io # commons-io # 2.5 | <notextile></notextile>
 9 | Apache | [The Apache Software License, Version 2.0](http://www.apache.org/licenses/LICENSE-2.0.txt) | com.google.guava # guava # 18.0 | <notextile></notextile>
10 | Apache | [The Apache Software License, Version 2.0](http://www.apache.org/licenses/LICENSE-2.0.txt) | org.apache.lucene # lucene-analyzers-common # 7.0.1 | <notextile></notextile>
11 | Apache | [The Apache Software License, Version 2.0](http://www.apache.org/licenses/LICENSE-2.0.txt) | org.apache.lucene # lucene-core # 7.0.1 | <notextile></notextile>
12 | Apache | [The Apache Software License, Version 2.0](http://www.apache.org/licenses/LICENSE-2.0.txt) | org.apache.lucene # lucene-highlighter # 7.0.1 | <notextile></notextile>
13 | Apache | [The Apache Software License, Version 2.0](http://www.apache.org/licenses/LICENSE-2.0.txt) | org.apache.lucene # lucene-join # 7.0.1 | <notextile></notextile>
14 | Apache | [The Apache Software License, Version 2.0](http://www.apache.org/licenses/LICENSE-2.0.txt) | org.apache.lucene # lucene-memory # 7.0.1 | <notextile></notextile>
15 | Apache | [The Apache Software License, Version 2.0](http://www.apache.org/licenses/LICENSE-2.0.txt) | org.apache.lucene # lucene-queries # 7.0.1 | <notextile></notextile>
16 | Apache | [The Apache Software License, Version 2.0](http://www.apache.org/licenses/LICENSE-2.0.txt) | org.apache.lucene # lucene-queryparser # 7.0.1 | <notextile></notextile>
17 | Apache | [The Apache Software License, Version 2.0](http://www.apache.org/licenses/LICENSE-2.0.txt) | org.apache.lucene # lucene-sandbox # 7.0.1 | <notextile></notextile>
18 | Apache | [the Apache License, ASL Version 2.0](http://www.apache.org/licenses/LICENSE-2.0) | org.scalactic # scalactic_2.12 # 3.0.0 | <notextile></notextile>
19 | Apache | [the Apache License, ASL Version 2.0](http://www.apache.org/licenses/LICENSE-2.0) | org.scalatest # scalatest_2.12 # 3.0.0 | <notextile></notextile>
20 | BSD | [BSD 3-Clause](http://www.scala-lang.org/license.html) | org.scala-lang # scala-library # 2.12.2 | <notextile></notextile>
21 | BSD | [BSD 3-Clause](http://www.scala-lang.org/license.html) | org.scala-lang # scala-reflect # 2.12.2 | <notextile></notextile>
22 | BSD | [BSD 3-clause](http://opensource.org/licenses/BSD-3-Clause) | org.scala-lang.modules # scala-parser-combinators_2.12 # 1.0.4 | <notextile></notextile>
23 | BSD | [BSD 3-clause](http://opensource.org/licenses/BSD-3-Clause) | org.scala-lang.modules # scala-xml_2.12 # 1.0.5 | <notextile></notextile>
24 | BSD | [BSD-Style](http://www.opensource.org/licenses/bsd-license.php) | com.jsuereth # scala-arm_2.12 # 2.0 | <notextile></notextile>
25 | GPL | [GPL](https://www.gnu.org/licenses/gpl-3.0.en.html) | au.csiro.data61 # datafusion-common_2.12 # 1.1-SNAPSHOT | <notextile></notextile>
26 | GPL | [GPL](https://www.gnu.org/licenses/gpl-3.0.en.html) | au.csiro.data61 # datafusion-search_2.12 # 1.1-SNAPSHOT | <notextile></notextile>
27 | LGPL | [EPL + GNU Lesser General Public License](http://logback.qos.ch/license.html) | ch.qos.logback # logback-classic # 1.2.3 | <notextile></notextile>
28 | LGPL | [EPL + GNU Lesser General Public License](http://logback.qos.ch/license.html) | ch.qos.logback # logback-core # 1.2.3 | <notextile></notextile>
29 | MIT | [MIT License](http://www.opensource.org/licenses/mit-license.php) | com.github.scopt # scopt_2.12 # 3.5.0 | <notextile></notextile>
30 | MIT | [MIT License](http://www.slf4j.org/license.html) | org.slf4j # slf4j-api # 1.7.25 | <notextile></notextile>
31 | 
32 | 


--------------------------------------------------------------------------------
/dataFusion-util/README.md:
--------------------------------------------------------------------------------
 1 | # dataFusion-util
 2 | 
 3 | ## Introduction
 4 | This project provides command line utilities for:
 5 | - Filtering and merging [Search Result JSON format](../dataFusion-common#search-result-json-format) into the [Document JSON format](../dataFusion-common#document-json-format) (`--hits` CLI option).
 6 | Search results for `typ=PERSON2` (using only first and family names) often overlap with `typ=PERSON` (using the full name).
 7 | In this case the `typ=PERSON2` result is an inferior match and is filtered out.
 8 | This processing also filters out the spurious matches described in [People](../dataFusion-search#people).
 9 | - Parsing content for mentions of people in email headers and merging results into the [Document JSON format](../dataFusion-common#document-json-format) (`--email` CLI option).
10 | If the resulting `offStr` (see [NER Structure](../dataFusion-common#ner-structure)) matches that of a NER with `impl=D61GAZ` and `typ=PERSON|PERSON2` then the `score` and `extRef` are taken from that NER.
11 | Otherwise extRef is not set and score is computed using the Lucene's IDF formula if the `--emailIDF` option is true (default) else it's set to 1.0. 
12 | - Parsing content for age soon after a person's name and merging results into the [Document JSON format](../dataFusion-common#document-json-format) (`--age` CLI option).
13 | Age is recognized as a number from 18 - 99 inclusive, either:
14 | parenenthesized immediately after a name (a NER with `impl=D61GAZ` and `typ=PERSON|PERSON2`)
15 | and not followed by further digits (to avoid telephone number area codes);
16 | or within 50 chars and following the word "age" or "aged" (only applied to the closest preceding person's name).
17 | The `extRef` is set from from the NER representing the name and `score` is set to 1.0.
18 | - Network building from the [Document JSON format](../dataFusion-common#document-json-format) as detailed below (`--proximity` CLI option).
19 | - Reallocating the id's in a [Document JSON format](../dataFusion-common#document-json-format) file,
20 | which can be useful in the case of merging multiple partial tika runs where the joint ids would otherwise not be unique (`--resetId` CLI option). 
21 | 
22 | The CLI options `--hits`, `--email` and `--age` can be used jointly.
23 | 
24 | ## Network Building
25 | ### Input
26 | Network building uses the follow named entities (see [NER Structure](../dataFusion-common#ner-structure) for details):
27 | - `impl=D61GAZ` and `typ=PERSON|PERSON2|ORGANIZATION`;
28 | - `impl=D61EMAIL` and `typ=FROM|TO|CC|BCC` and no extRef (if extRef is set it is a duplicate of a `impl=D61GAZ` named entity).
29 | ### Node Generation
30 | Nodes generated from `impl=D61GAZ` named entities aggregate all named entity mentions with the same `typ` and `extRef`.
31 | 
32 | Nodes generated from `impl=D61EMAIL` named entities aggregate all named entity mentions with the same `text` irrespective of `typ`. Node attributes are set:
33 | - `node.extRef.name=ner.text`
34 | - `node.extRef.ids=[]`
35 | - `node.typ=D61EMAIL`
36 | 
37 | The mapping from `ner.typ=FROM|TO|CC|BCC` to `node.typ=D61EMAIL` prevents separate nodes appearing in the visualization for the same person depending on the particular email header, which would also diminish connection weights by spreading them across mutiple edges. 
38 | 
39 | ### Collections
40 | Documents are grouped into collections.
41 | Documents in the filesystem are under (but not necessarily directly under) a directory that represents their collection.
42 | The CLI option `--collectionRe` specifies a [regex](https://en.wikipedia.org/wiki/Regular_expression) to extract the collection from a document's path.
43 | The default value for this option, `/collection/([^/]+)/`, is suitable if `collection` is the common parent directory for all collections.  
44 | 
45 | ### Edge Generation
46 | Parameters are the decay value (set by the `--decay` CLI option with default value 500 characters) and a cutoff which is `5 * decay`.
47 | 
48 |     (weight, count) for an edge representing co-occurrences of named entities n1 and n2 in collection c =
49 |       sum over documents d in collection c
50 |       sum over sub-documents e in d (main content and each embedded document)
51 |       sum over pairs of instances of n1 & n2 in e, where dist = abs( n2.offStr - n1.offStr ) < cutoff
52 |       weight = exp( - dist / decay ), count = 1
53 |       
54 | ### Output
55 | The edges computed above (with count > 0) are written in [Edge JSON format](../dataFusion-common#node-and-edge-json-formats) to proximity-edge.json and the nodes referenced in these edges are written in [Node JSON format](../dataFusion-common#node-and-edge-json-formats) to proximity-node.json.
56 | 
57 | ## Build, Configuration and Running
58 | 
59 | See the top level [README](../README.md).
60 | The score computation for the `--emailIDF` option requires term document frequencies from the Lucene index, which is located using the configuration from dataFusion-search.
61 | 


--------------------------------------------------------------------------------
/dataFusion-util/build.sbt:
--------------------------------------------------------------------------------
 1 | name := "dataFusion-util"
 2 | 
 3 | libraryDependencies ++= Seq(
 4 |   "com.github.scopt" %% "scopt" % "3.5.0",
 5 |   "com.jsuereth" %% "scala-arm" % "2.0",
 6 |   "org.scalatest" %% "scalatest" % "3.0.0" % "test"
 7 | )
 8 | 
 9 | com.github.retronym.SbtOneJar.oneJarSettings
10 | 
11 | mainClass in Compile := Some("au.csiro.data61.dataFusion.util.Main")
12 | 


--------------------------------------------------------------------------------
/dataFusion-util/src/main/resources/application.conf:
--------------------------------------------------------------------------------
1 | search = {
2 |   // See: https://lucene.apache.org/core/6_6_0/analyzers-common/org/apache/lucene/analysis/synonym/SolrSynonymParser.html
3 |   synonyms = "../dataFusion-search/synonyms.txt"
4 |   synonyms = ${?SEARCH_SYNONYMS}
5 | }
6 | 


--------------------------------------------------------------------------------
/dataFusion-util/src/main/resources/logback.xml:
--------------------------------------------------------------------------------
 1 | <configuration>
 2 | 
 3 | <!--   <appender name="CONS" class="ch.qos.logback.core.ConsoleAppender"> -->
 4 | <!--     <encoder> -->
 5 | <!--       <pattern>%d{yyyy-MM-dd HH:mm:ss.SSS} [%thread] %-5level %logger{36} - %msg%n</pattern> -->
 6 | <!--     </encoder> -->
 7 | <!--   </appender> -->
 8 |   
 9 |   <appender name="FILE" class="ch.qos.logback.core.FileAppender">
10 |     <file>util.log</file>
11 |     <encoder>
12 |       <pattern>%d{yyyy-MM-dd HH:mm:ss.SSS} [%thread] %-5level %logger{36} - %msg%n</pattern>
13 |     </encoder>
14 |   </appender>
15 | 
16 | <!--   <logger name="au.csiro.data61" level="DEBUG" /> -->
17 |   
18 |   <root level="INFO">
19 |     <appender-ref ref="FILE" />
20 |   </root>
21 | </configuration>
22 | 


--------------------------------------------------------------------------------
/dataFusion-util/src/main/scala/au/csiro/data61/dataFusion/util/Age.scala:
--------------------------------------------------------------------------------
 1 | package au.csiro.data61.dataFusion.util
 2 | 
 3 | import com.typesafe.scalalogging.Logger
 4 | 
 5 | import au.csiro.data61.dataFusion.common.Data.{ Doc, GAZ, Ner, T_PERSON, T_PERSON2 }
 6 | 
 7 | /** create AGE Ner's from GAZ PERSON{,2} Ners followed by a parenthesized number 18-99 */
 8 | object Age {
 9 |   private val log = Logger(getClass)
10 | 
11 |   val ageRe1 = """\s*\((\d{2})\)(?!\s*\d)""".r    // look for " (dd)" after a name not followed by further digits (a phone number)
12 |   val ageRe2 = """(.{0,50}\baged?) (\d{2})\b""".r // look for "aged dd" within 50 chars after a name (to allow for a title in beween)
13 |   def wordCount(s: String) = s.split("\\s+").length
14 |   def find(s: String) = ageRe1.findPrefixMatchOf(s).map((_, 1, 0)).orElse(ageRe2.findPrefixMatchOf(s).map(m => (m, 2, wordCount(m.group(1)))))
15 |   
16 |   def toNer(content: String, ner: List[Ner]): Iterator[Ner] = {
17 |     val it = for {
18 |       n <- ner.sortBy(_.offStr).iterator if n.impl == GAZ && (n.typ == T_PERSON || n.typ == T_PERSON2)
19 |       (m, grp, posOffset) <- find(content.substring(n.offEnd)) if m.group(grp).toInt >= 18  // must be adult
20 |     } yield Ner(n.posEnd + posOffset, n.posEnd + posOffset + 1, n.offEnd + m.start(grp), n.offEnd + m.end(grp), 1.0, m.group(grp), "AGE", "D61AGE", n.extRef)
21 |     
22 |     // if we have two close PERSONs followed by an AGE the above could associate the same AGE with both of them
23 |     // with Ners sorted as above, we only want the last one
24 |     val dummy = Ner(0, 0, 0, 0, 0.0, "text", "typ", "impl", None)
25 |     (it ++ Iterator.single(dummy)).sliding(2).flatMap {
26 |       case Seq(n1, n2) if n1.offStr != n2.offStr => Iterator.single(n1)
27 |       case _ => Iterator.empty
28 |     }
29 |   }
30 |   
31 |   val augment: Doc => Doc = { d =>
32 |     val ner = d.ner ++ d.content.toList.flatMap(toNer(_, d.ner))
33 |     val embedded = d.embedded.map { e =>
34 |       val ner = e.ner ++ e.content.toList.flatMap(toNer(_, e.ner))
35 |       e.copy(ner = ner)
36 |     }
37 |     d.copy(ner = ner, embedded = embedded)
38 |   }
39 | }
40 | 


--------------------------------------------------------------------------------
/dataFusion-util/src/main/scala/au/csiro/data61/dataFusion/util/Hits.scala:
--------------------------------------------------------------------------------
  1 | package au.csiro.data61.dataFusion.util
  2 | 
  3 | import java.io.InputStream
  4 | 
  5 | import scala.io.Source
  6 | 
  7 | import com.typesafe.scalalogging.Logger
  8 | 
  9 | import au.csiro.data61.dataFusion.common.Data._
 10 | import au.csiro.data61.dataFusion.common.Data.{ LPosDoc, Ner, PHits, PosInfo }
 11 | import au.csiro.data61.dataFusion.common.Data.JsonProtocol.pHitsCodec
 12 | import spray.json.pimpString
 13 | import scala.annotation.tailrec
 14 | 
 15 | import au.csiro.data61.dataFusion.search.DataFusionLucene._
 16 | import au.csiro.data61.dataFusion.search.LuceneUtil._
 17 | import java.util.Comparator
 18 | import java.util.Arrays
 19 | 
 20 | /**
 21 |  * In dataFusion-search the phrase search for PERSON|PERSON2, with terms in any order, can make some incorrect matches.
 22 |  * This happens when the query contains repeated tokens e.g. "Aaron H Aaron" in which case text "H H Aaron" will match.
 23 |  * This is corrected here by checking that matches have the same term frequencies as the query.
 24 |  * Unfortunately this check cannot be done at search time in dataFusion-search because fetching the text at that point would
 25 |  * negatively impact performance (not an issue here because we already have the text).
 26 |  * 
 27 |  * A dependency on dataFusion-search has been added so that we can use the same term tokenization as in the search.
 28 |  */
 29 | object Hits {
 30 |   private val log = Logger(getClass)
 31 |   
 32 |   def hitIter(hIn: InputStream): Iterator[PHits] = Source.fromInputStream(hIn, "UTF-8").getLines.map(_.parseJson.convertTo[PHits])
 33 |   
 34 |   /** idEmbIdx -> extRefId, score, typ, lposdoc */
 35 |   type HitsMap = Map[IdEmbIdx, Seq[(ExtRef, Double, String, LPosDoc)]]
 36 |   
 37 |   def hitsMap(iter: Iterator[PHits]): HitsMap =
 38 |     iter.flatMap { x =>
 39 |       x.hits.map(lposdoc => (x.extRef, x.score, x.typ, lposdoc))
 40 |     }.toSeq.groupBy(_._4.idEmbIdx)    
 41 |   
 42 |   def termFreq(t: String) = tokenIter(analyzer, F_CONTENT, t).toList.groupBy(identity).map { case (t, lst) => (t, lst.size) }
 43 |     
 44 |   /**
 45 |    * @return Some(termFreq) if the hits need to be checked against this query term freq (a PERSON|PERSON2 search with repeated terms) 
 46 |    */
 47 |   def qTermFreq(t: String, typ: String) =
 48 |     if (typ == T_ORGANIZATION) None // not needed for "terms in order" search
 49 |     else {
 50 |       val tf = termFreq(t)
 51 |       if (tf.values.exists(_ > 1)) Some(tf)
 52 |       else None // not needed if no duplicate terms
 53 |     }
 54 | 
 55 |   def toNer(text: String, pi: PosInfo, extRef: ExtRef, score: Double, typ: String) = 
 56 |     Ner(pi.posStr, pi.posEnd, pi.offStr, pi.offEnd, score, text, typ, GAZ, Some(extRef))
 57 |       
 58 | /**
 59 |  * find if any PERSON NER overlaps with a PERSON2
 60 |  * A----------B        n1 = PERSON
 61 |  *       C--------D    n2 = PERSON2
 62 |  * no overlap = D < A or B < C
 63 |  * overlap = D >= A and B >= C
 64 |  * A & C are offStr; B & D are offEnd - 1 because offEnd is exclusive (1 past the end)
 65 |  * So overlap = n2.offEnd - 1 >= n1.offStr && n1.offEnd - 1 >= n2.offStr
 66 |  *            = n2.offEnd > n1.offStr && n1.offEnd > n2.offStr
 67 |  *            
 68 |  * Sort n1 = PERSON on offEnd asc
 69 |  * Binary search to find first n1: n1.offEnd > n2.offStr (the bit after &&)
 70 |  * Scan n1's until n2.offEnd < n1.offStr for overlap
 71 |  */
 72 |   val nerCmp = new Comparator[Ner] {
 73 |     override def compare(a: Ner, b:Ner) = a.offEnd - b.offEnd
 74 |   }
 75 |   
 76 |   def filterPer2(ners: Seq[Ner]): Seq[Ner] = {
 77 |     val per = {
 78 |       val a = ners.view.filter(n => n.impl == GAZ && n.typ == T_PERSON).toArray
 79 |       Arrays.sort(a, nerCmp)
 80 |       log.debug(s"filterPer2.per: ${a.toList}")
 81 |       a
 82 |     }
 83 |     
 84 |     def pred(n: Ner): Boolean = n.typ != T_PERSON2 || {
 85 |       val i = Arrays.binarySearch(per, n.copy(offEnd = n.offStr + 1), nerCmp) // find 1st per(j).offEnd > n.offStr (>= n.offStr + 1)
 86 |       val j = if (i >= 0) i else -(i + 1)
 87 |       val overlaps = j < per.length && per(j).offStr < n.offEnd // assume per(j)'s don't overlap so no need to scan
 88 |       log.debug(s"i = $i, j = $j, overlaps = $overlaps, n = $n")
 89 |       !overlaps
 90 |     }
 91 |     
 92 |     ners filter pred
 93 |   }
 94 |     
 95 |   def augment(hs: HitsMap): Doc => Doc = { d =>
 96 |     
 97 |     def searchNers(content: Option[String], idEmbIdx: IdEmbIdx): Seq[Ner] = for {
 98 |       c <- content.toSeq
 99 |       hits <- hs.get(idEmbIdx).toSeq
100 |       (extRefId, score, typ, lposdoc) <- hits
101 |       qtf = qTermFreq(extRefId.name, typ) // query: term -> freq but only if it needs to be checked
102 |       pi <- lposdoc.posInfos
103 |       text = c.substring(pi.offStr, pi.offEnd)
104 |       ok <- qtf.map(_ == termFreq(text)).orElse(Some(true)) if ok // skip if there's a term freq mismatch
105 |     } yield toNer(text, pi, extRefId, score, typ)
106 |     
107 |     def newNers(content: Option[String], idEmbIdx: IdEmbIdx): Seq[Ner] = filterPer2(searchNers(content, idEmbIdx))
108 |     
109 |     val ner = d.ner ++ newNers(d.content, IdEmbIdx(d.id, EMB_IDX_MAIN))
110 |     val embedded = d.embedded.zipWithIndex.map { case (e, embIdx) =>
111 |       val ner = e.ner ++ newNers(e.content, IdEmbIdx(d.id, embIdx))
112 |       e.copy(ner = ner)
113 |     }
114 |     d.copy(ner = ner, embedded = embedded)
115 |   }
116 | 
117 | }


--------------------------------------------------------------------------------
/dataFusion-util/src/main/scala/au/csiro/data61/dataFusion/util/Proximity.scala:
--------------------------------------------------------------------------------
  1 | package au.csiro.data61.dataFusion.util
  2 | 
  3 | import java.io.File
  4 | import java.util.concurrent.ConcurrentHashMap
  5 | import java.util.concurrent.atomic.AtomicInteger
  6 | 
  7 | import scala.collection.JavaConverters.{ asScalaSetConverter, collectionAsScalaIterableConverter }
  8 | import scala.io.Source
  9 | 
 10 | import com.typesafe.scalalogging.Logger
 11 | 
 12 | import Main.CliOption
 13 | import au.csiro.data61.dataFusion.common.Data.{ Doc, EMAIL, Edge, ExtRef, GAZ }
 14 | import au.csiro.data61.dataFusion.common.Data.{ Ner, Node, T_ORGANIZATION, T_PERSON, T_PERSON2, WeightMap }
 15 | import au.csiro.data61.dataFusion.common.Data.JsonProtocol.{ docFormat, edgeFormat, nodeFormat }
 16 | import au.csiro.data61.dataFusion.common.Parallel.doParallel
 17 | import au.csiro.data61.dataFusion.common.Util.bufWriter
 18 | import resource.managed
 19 | import spray.json.{ pimpAny, pimpString }
 20 | 
 21 | object Proximity {
 22 |   private val log = Logger(getClass)
 23 | 
 24 |   def fileWithSuffix(f: File, suffix: String) = new File(f.getPath + suffix)
 25 | 
 26 |   /**
 27 |    * GAZ NERs and EMAIL NERs with no extRef (possibly non-Australian - avoids duplicates with GAZ NERs).
 28 |    * Also map all EMAIL typ (FROM|TO|CC|BCC) to a single typ=EMAIL so we only get one node per person.
 29 |    */
 30 |   def nerFilter(ner: List[Ner]): Iterator[Ner] = {
 31 | //    val emailNer = ner.filter(n => n.impl == EMAIL)
 32 | //    val offStr = emailNer.view.map(_.offStr).toSet
 33 | //    emailNer.iterator ++ ner.view.filter(n => n.impl == GAZ && (n.typ == T_PERSON || n.typ == T_PERSON2 || n.typ == T_ORGANIZATION) && !offStr.contains(n.offStr))
 34 |     ner.iterator.filter(n => n.impl == GAZ || (n.impl == EMAIL && n.extRef.isEmpty)).map(n => if (n.impl == EMAIL) n.copy(typ = EMAIL) else n)
 35 |   }
 36 |   
 37 |   def doProximity(cliOption: CliOption) = {
 38 |     val prox = new Proximity(cliOption, nerFilter)
 39 |     
 40 |     val in = Source.fromInputStream(System.in, "UTF-8").getLines
 41 |     def work(json: String) = {
 42 |       prox.accDoc(json.parseJson.convertTo[Doc])
 43 |       "more"
 44 |     }
 45 |     def out(s: String) = {}
 46 |     doParallel(in, work, out, "done", "done", cliOption.numWorkers)
 47 |     log.info("load complete")
 48 |     
 49 |     type JOB = () => String
 50 |     
 51 |     val job1: JOB = () => {
 52 |       for {
 53 |         o <- cliOption.output
 54 |         w <- managed(bufWriter(fileWithSuffix(o, "node.json")))
 55 |         n <- prox.nodeMap.values.asScala
 56 |       } {
 57 |         w.write(n.toJson.compactPrint)
 58 |         w.write('\n')
 59 |       }
 60 |       "more"
 61 |     }
 62 |     
 63 |     val job2: JOB = () => {
 64 |       for {
 65 |         o <- cliOption.output
 66 |         w <- managed(bufWriter(fileWithSuffix(o, "edge.json")))
 67 |         e <- prox.edgeMap.entrySet.asScala
 68 |       } {
 69 |         w.write(Edge(e.getKey._1, e.getKey._2, e.getValue, GAZ).toJson.compactPrint)
 70 |         w.write('\n')
 71 |       }
 72 |       "more"
 73 |     }
 74 |     
 75 |     val in2 = Iterator(job1, job2)
 76 |     def work2(job: JOB) = job()
 77 |     doParallel(in2, work2, out, () => "done", "done", Math.min(2, cliOption.numWorkers))
 78 |   }
 79 |   
 80 |   case class NodeKey(name: String, typ: String)
 81 | }
 82 | 
 83 | /** thread-safe for concurrent accDoc's */
 84 | class Proximity(cliOption: CliOption, nerFilter: List[Ner]=> Iterator[Ner]) {
 85 |   import Proximity.NodeKey
 86 | 
 87 |   val nextId = new AtomicInteger(0)
 88 |   val nodeMap = new ConcurrentHashMap[NodeKey, Node]()
 89 | 
 90 |   // Scala's concurrent map TrieMap does not have anything like Java's ConcurrentHashMap.compute, which I think makes it rather useless!
 91 |   
 92 |   def accNode(k: NodeKey, score: Double, extRef: ExtRef): Int =
 93 |     nodeMap.computeIfAbsent(k, k => Node(nextId.getAndIncrement, extRef, score, k.typ)).nodeId
 94 | 
 95 |   val edgeMap = new ConcurrentHashMap[(Int, Int), WeightMap]
 96 |   
 97 |   def accEdge(source: Int, target: Int, collection: String, weight: Double): Unit = {
 98 |     val k = if (source < target) (source, target) else (target, source)
 99 |     edgeMap.compute(k, (k, v) => 
100 |       if (v == null) Map(collection -> (weight, 1)) withDefaultValue (0.0, 0)
101 |       else {
102 |         val (w0, c0) = v(collection)
103 |         v + (collection -> (w0 + weight, c0 + 1))
104 |       }
105 |     )
106 |   }
107 |   
108 |   val collectionRE = cliOption.collectionRe.r
109 |   def collection(path: String) = collectionRE.findFirstMatchIn(path).map(_.group(1)).getOrElse("UNKNOWN")
110 |   
111 |   // used concurrently
112 |   def accDoc(d: Doc): Unit = {
113 |     val cutoff = (cliOption.decay * 5).toInt
114 |     for {
115 |       ners <- nerFilter(d.ner) +: d.embedded.view.map(e => nerFilter(e.ner))
116 |       v = ners.toIndexedSeq.sortBy(_.offStr)
117 |       // _ = log.info(s"v.size = ${v.size}")
118 |       i <- 0 until v.size - 1 // exclude last
119 |       ni = v(i)
120 |       extRefi = ni.extRef.getOrElse(ExtRef(ni.text, List.empty))
121 |       (j, dist) <- (i + 1 until v.size).view.map { j => (j, v(j).offStr - ni.offStr) }.takeWhile(_._2 < cutoff)
122 |       nj = v(j)
123 |       extRefj = nj.extRef.getOrElse(ExtRef(nj.text, List.empty))
124 |     } {
125 |       // log.info(s"$i, $j -> $dist")
126 |       val idi = accNode(NodeKey(extRefi.name, ni.typ), ni.score, extRefi)
127 |       val idj = accNode(NodeKey(extRefj.name, nj.typ), nj.score, extRefj)
128 |       if (idi != idj) accEdge(idi, idj, collection(d.path), Math.exp(-dist/cliOption.decay)) // additive weight (distance = 1/sum(weights))
129 |     }
130 |   }
131 |   
132 | }
133 | 
134 | 


--------------------------------------------------------------------------------
/dataFusion-util/src/main/scala/au/csiro/data61/dataFusion/util/TmNer.scala:
--------------------------------------------------------------------------------
 1 | package au.csiro.data61.dataFusion.util
 2 | 
 3 | import spray.json.DefaultJsonProtocol._
 4 | import java.io.InputStream
 5 | import scala.io.Source
 6 | import spray.json._
 7 | import au.csiro.data61.dataFusion.common.Data._
 8 | 
 9 | /**
10 |  * Merge Debbie's ner results.
11 |  * Data provided as CSV files with Windows line endings and with our doc id in the filename but not in the data.
12 |  * Steps to clean the data:<ul>
13 |  * <li> sed -i 's/\r//' *                                       # get rid of Windows \r
14 |  * <li> awk -f /data/neil/tmner.awk tmner/ *.csv > tmner.json   # convert to JSON with id in the data
15 |  * </ul>
16 |  * This code merges in the resulting JSON which has the structure of case class Tmner.
17 |  */
18 | object TmNer {
19 |   case class Tmner(id: Long, typ: String, offStr: Int, offEnd: Int, text: String)  
20 |   
21 |   implicit val tmnerFormat = jsonFormat5(Tmner)  
22 |   
23 |   def tmnerIter(hIn: InputStream): Iterator[Tmner] = Source.fromInputStream(hIn, "UTF-8").getLines.map(_.parseJson.convertTo[Tmner])
24 |   
25 |   type TMap = Map[Long, Seq[Tmner]]
26 |   def tmnerMap(iter: Iterator[Tmner]): TMap = iter.toSeq.groupBy(_.id)    
27 |   
28 |   def toNer(t: Tmner) = Ner(-1, -1, t.offStr, t.offEnd, 1.0f, t.text, t.typ, "TMNER", None)
29 |       
30 |   def augment(m: TMap): Doc => Doc = { d =>
31 |     m.get(d.id) match {
32 |       case Some(s) => d.copy(ner = d.ner ++ s.map(toNer))
33 |       case None => d
34 |     }
35 |   }
36 | 
37 | }


--------------------------------------------------------------------------------
/dataFusion-util/src/test/resources/logback-test.xml:
--------------------------------------------------------------------------------
 1 | <configuration>
 2 | 
 3 |   <appender name="CONS" class="ch.qos.logback.core.ConsoleAppender">
 4 |     <encoder>
 5 |       <pattern>%d{yyyy-MM-dd HH:mm:ss.SSS} [%thread] %-5level %logger{36} - %msg%n</pattern>
 6 |     </encoder>
 7 |   </appender>
 8 |   
 9 | <!--   <appender name="FILE" class="ch.qos.logback.core.FileAppender"> -->
10 | <!--     <file>util-test.log</file> -->
11 | <!--     <encoder> -->
12 | <!--       <pattern>%d{yyyy-MM-dd HH:mm:ss.SSS} [%thread] %-5level %logger{36} - %msg%n</pattern> -->
13 | <!--     </encoder> -->
14 | <!--   </appender> -->
15 | 
16 |   <logger name="au.csiro.data61" level="DEBUG" />
17 |   
18 |   <root level="INFO">
19 |     <appender-ref ref="CONS" />
20 |   </root>
21 | </configuration>
22 | 


--------------------------------------------------------------------------------
/dataFusion-util/src/test/scala/au/csiro/data61/dataFusion/util/AgeTest.scala:
--------------------------------------------------------------------------------
 1 | package au.csiro.data61.dataFusion.util
 2 | 
 3 | import org.scalatest.{ FlatSpec, Matchers }
 4 | 
 5 | import com.typesafe.scalalogging.Logger
 6 | 
 7 | import au.csiro.data61.dataFusion.common.Data._
 8 | import Age._
 9 |   
10 | class AgeTest extends FlatSpec with Matchers {
11 |   private val log = Logger(getClass)
12 | 
13 |   def mkNer(content: String, name: String, ids: List[Long]) = {
14 |     val offStr = content.indexOf(name)
15 |     assert(offStr != -1)
16 |     val posStr = wordCount(content.substring(0, offStr))
17 |     Ner(posStr, posStr + wordCount(name), offStr, offStr + name.length, 0.0, name, T_PERSON2, GAZ, Some(ExtRef(name, ids))) 
18 |   }
19 | 
20 |   "Age.toNer" should "find parenthesized age" in {
21 |     val c = "The newbie Jacinda Ardern (37) was selected."
22 |     val n1 = mkNer(c, "Jacinda Ardern", List(101, 102))
23 |     val d = Doc(1, Some(c), Map.empty, "path", List(n1), List.empty)
24 |     val d2 = augment(d)
25 |     log.debug(s"d2 = $d2")
26 |     d2.ner.size should be(2) 
27 |     val expected = Ner(n1.posEnd, n1.posEnd + 1, n1.offEnd + 2, n1.offEnd + 4, 1.0, "37", "AGE", "D61AGE", n1.extRef)
28 |     assert(d2.ner.contains(expected))
29 |   }
30 |   
31 |   it should "not find parenthesized age after other text" in {
32 |     val c = "The newbie Jacinda Ardern blah (37) was selected."
33 |     val n1 = mkNer(c, "Jacinda Ardern", List(101, 102))
34 |     val d = Doc(1, Some(c), Map.empty, "path", List(n1), List.empty)
35 |     val d2 = augment(d)
36 |     log.debug(s"d2 = $d2")
37 |     d2 should be(d)
38 |   }
39 | 
40 |   it should "not find age in a phone number" in {
41 |     val c = "The newbie Jacinda Ardern (65) 3214-3456 was selected."
42 |     val n1 = mkNer(c, "Jacinda Ardern", List(101, 102))
43 |     val d = Doc(1, Some(c), Map.empty, "path", List(n1), List.empty)
44 |     val d2 = augment(d)
45 |     log.debug(s"d2 = $d2")
46 |     d2 should be(d)
47 |   }
48 |   
49 |   it should "find age in 'name, aged dd'" in {
50 |     val c = "The newbie Jacinda Ardern, aged 37, was selected."
51 |     val n1 = mkNer(c, "Jacinda Ardern", List(101, 102))
52 |     val d = Doc(1, Some(c), Map.empty, "path", List(n1), List.empty)
53 |     val d2 = augment(d)
54 |     log.debug(s"d2 = $d2")
55 |     d2.ner.size should be(2) 
56 |     val expected = Ner(n1.posEnd + 2, n1.posEnd + 3, n1.offEnd + 7, n1.offEnd + 9, 1.0, "37", "AGE", "D61AGE", n1.extRef)
57 |     assert(d2.ner.contains(expected))
58 |   }
59 |   
60 |   it should "find age within 50 chars after name" in {
61 |     val c = "Jacinda Ardern, future PM, aged 37, was selected."
62 |     val n1 = mkNer(c, "Jacinda Ardern", List(101, 102))
63 |     val d = Doc(1, Some(c), Map.empty, "path", List(n1), List.empty)
64 |     val d2 = augment(d)
65 |     log.debug(s"d2 = $d2")
66 |     d2.ner.size should be(2) 
67 |     val expected = Ner(n1.posEnd + 4, n1.posEnd + 5, n1.offEnd + 18, n1.offEnd + 20, 1.0, "37", "AGE", "D61AGE", n1.extRef)
68 |     assert(d2.ner.contains(expected))
69 |   }
70 |   
71 |   it should "associate an age only with the last preceeding PERSON" in {
72 |     val c = "Frederick Bloggs CEO and Jacinda Ardern, future PM, aged 37, were selected."
73 |     val n1 = mkNer(c, "Jacinda Ardern", List(101, 102))
74 |     val n2 = mkNer(c, "Frederick Bloggs", List(201, 202))
75 |     val d = Doc(1, Some(c), Map.empty, "path", List(n1, n2), List.empty)
76 |     val d2 = augment(d)
77 |     log.debug(s"d2 = $d2")
78 |     d2.ner.size should be(3) 
79 |     val expected = Ner(n1.posEnd + 4, n1.posEnd + 5, n1.offEnd + 18, n1.offEnd + 20, 1.0, "37", "AGE", "D61AGE", n1.extRef)
80 |     assert(d2.ner.contains(expected))
81 |   }
82 | }


--------------------------------------------------------------------------------
/dataFusion-util/src/test/scala/au/csiro/data61/dataFusion/util/EmailTest.scala:
--------------------------------------------------------------------------------
 1 | package au.csiro.data61.dataFusion.util
 2 | 
 3 | import org.scalatest.{ FlatSpec, Matchers }
 4 | 
 5 | import com.typesafe.scalalogging.Logger
 6 | 
 7 | import au.csiro.data61.dataFusion.common.Data._
 8 |   
 9 | class EmailTest extends FlatSpec with Matchers {
10 |   private val log = Logger(getClass)
11 | 
12 |   val text = """
13 | Some junk before
14 | the headers
15 | From: Ardern Jacinda (Wellington)
16 | 
17 | To: Bloggs Frederick (Akaroa); Smith
18 | Michael (Ekatahuna); Walters Roger (Pink Floyd)
19 | 
20 | Cc: Zealand New (Aotearoa)
21 | 
22 | Bcc: Peters Winston (Wellington)
23 | 
24 | Sent: Today
25 | 
26 | Subject: Forming Government
27 | in an MMP System
28 | 
29 | It's Labour ... and Prime Minister Jacinda Ardern.
30 | 
31 | New Zealand First has crowned Ardern the next prime minister with its decision to back a Labour-led government, which will also need the Green Party to govern.
32 | 
33 | Ardern will claim the top job after only two and a-half months as Labour leader - and follows her former mentor Helen Clark into the top job.
34 | 
35 | """
36 |     
37 |   "Email.toNer" should "find names" in {
38 |     val extRef = Some(ExtRef("Jacinda Ardern", List(1, 2)))
39 |     val gazNer = List(Ner(7, 9, 36, 50, 1.0, "Jacinda Ardern", T_PERSON2, GAZ , extRef))
40 |     val ners = Email.toNer(Email.extRefNer(gazNer), _ => 1.0)(text).toList
41 |     for (n <- ners) log.debug(s"ner = $n")
42 |     val expected = Seq(
43 |       Ner(7, 10, 36, 63, 1.0, "Ardern Jacinda (Wellington)", "FROM", "D61EMAIL" ,extRef),
44 |       Ner(14, 17, 96, 121, 1.0, "Smith\nMichael (Ekatahuna)", "TO", "D61EMAIL", None)
45 |     )
46 |     for (e <- expected) assert(ners.contains(e))
47 |   }
48 |   
49 | }


--------------------------------------------------------------------------------
/dataFusion-util/src/test/scala/au/csiro/data61/dataFusion/util/HitsTest.scala:
--------------------------------------------------------------------------------
 1 | package au.csiro.data61.dataFusion.util
 2 | 
 3 | import org.scalatest.{ FlatSpec, Matchers }
 4 | 
 5 | import com.typesafe.scalalogging.Logger
 6 | 
 7 | import au.csiro.data61.dataFusion.common.Data._
 8 | 
 9 | class HitsTest extends FlatSpec with Matchers {
10 |   val log = Logger(getClass)
11 |   
12 |   val id = 31L
13 |   val extRef = ExtRef("Jane", List(123L))
14 |   val score = 9.876f
15 |   val typ = "PERSON"
16 |   val path = "path"
17 |   val content = "I saw SARAH ANNE JONES here!"
18 |   
19 | //    case class Embedded(content: Option[String], meta: Map[String, String], ner: List[Ner])
20 |   val emb = Embedded(Some(content), Map.empty, List.empty)
21 | 
22 |   //    case class Doc(id: Long, content: Option[String], meta: Map[String, String], path: String, ner: List[Ner], embedded: List[Embedded])
23 |   val doc = Doc(id, Some(content), Map.empty, path, List.empty, List(emb))
24 |   
25 | //    case class PosInfo(posStr: Int, posEnd: Int, offStr: Int, offEnd: Int)
26 |   val pi = PosInfo(1, 4, doc.content.get.indexOf("SARAH"), doc.content.get.indexOf(" here!"))
27 |   
28 |   val expected = Ner(pi.posStr, pi.posEnd, pi.offStr, pi.offEnd, score, content.substring(pi.offStr, pi.offEnd), typ, GAZ, Some(extRef))
29 | 
30 |   "augment" should "add hit to doc.ner" in {
31 | //    case class LPosDoc(idEmbIdx: IdEmbIdx, posInfos: List[PosInfo])
32 |     val lPosDoc = LPosDoc(IdEmbIdx(id, EMB_IDX_MAIN), List(pi))
33 |     val hits = Seq(PHits(Stats(0, 0), List(lPosDoc), None, extRef, score, typ))
34 |     
35 |     val augment: Doc => Doc = Hits.augment(Hits.hitsMap(hits.iterator))
36 |     val doc2 = augment(doc)
37 |     log.debug(s"doc2 = $doc2")
38 |     doc2.ner.size should be(1)
39 |     doc2.ner(0) should be(expected)
40 |     doc2.embedded.size should be(1)
41 |     doc2.embedded(0).ner.size should be(0)
42 |   }
43 |   
44 |   it should "add hit to doc.embedded.ner" in {
45 | //    case class LPosDoc(idEmbIdx: IdEmbIdx, posInfos: List[PosInfo])
46 |     val lPosDoc = LPosDoc(IdEmbIdx(id, 0), List(pi))
47 |     val hits = Seq(PHits(Stats(0, 0), List(lPosDoc), None, extRef, 9.876f, "PERSON"))
48 |     
49 |     val augment: Doc => Doc = Hits.augment(Hits.hitsMap(hits.iterator))
50 |     val doc2 = augment(doc)
51 |     log.debug(s"doc2 = $doc2")
52 |     doc2.ner.size should be(0)
53 |     doc2.embedded.size should be(1)
54 |     doc2.embedded(0).ner.size should be(1)
55 |     doc2.embedded(0).ner(0) should be(expected)
56 |   }
57 |   
58 |   "termFreq" should "count terms" in {
59 |     Hits.termFreq("Aaron H Aaron") should be(Map("aaron" -> 2, "h" -> 1))
60 |     Hits.qTermFreq("Aaron H Aaron", T_PERSON) should be(Some(Map("aaron" -> 2, "h" -> 1)))
61 |     Hits.qTermFreq("Aaron H Aaron", T_ORGANIZATION) should be(None)
62 |     Hits.qTermFreq("Aaron H Bloggs", T_PERSON) should be(None)
63 |   }
64 |   
65 |   def mkNer(offStr: Int, offEnd: Int, typ: String) = Ner(0, 0, offStr, offEnd, 1.0, "text", typ, GAZ, None)
66 |   
67 |   "filterPer2" should "filter PERSON2 within PERSON" in {
68 |     val p = (0 until 3).map(i => mkNer(10 * i, 10 * i + 6, T_PERSON))
69 |     val p2 = Seq(mkNer(2, 6, T_PERSON2), mkNer(20, 24, T_PERSON2), mkNer(26, 28, T_PERSON2))
70 |     val x = Hits.filterPer2(p ++ p2)
71 |     x should be(p :+ p2(2)) // (0) & (1) are filtered out
72 |     log.debug(s"x = $x")
73 |   }
74 |   
75 | }


--------------------------------------------------------------------------------
/dataFusion-util/src/test/scala/au/csiro/data61/dataFusion/util/ProximityTest.scala:
--------------------------------------------------------------------------------
 1 | package au.csiro.data61.dataFusion.util
 2 | 
 3 | import org.scalatest.{ FlatSpec, Matchers }
 4 | 
 5 | import com.typesafe.scalalogging.Logger
 6 | 
 7 | import au.csiro.data61.dataFusion.common.Data.{ Doc, Embedded, ExtRef, GAZ, Ner, T_PERSON }
 8 | import scala.collection.JavaConverters._
 9 | 
10 | 
11 | class ProximityTest extends FlatSpec with Matchers {
12 |   val log = Logger(getClass)
13 |   
14 |   val offStr = 80
15 |   val cli = Main.defaultCliOption
16 |   val dist = (cli.decay/5).toInt
17 |   val weight = Math.exp(-dist/cli.decay)
18 |   
19 |   // case class Ner(posStr: Int, posEnd: Int, offStr: Int, offEnd: Int, score: Double, text: String, typ: String, impl: String, extRefId: Option[List[Long]])
20 |   val ner1 = Ner(0, 0, offStr, 0, 1.0, "text", T_PERSON, GAZ, Some(ExtRef("Fred", List(1, 3))))
21 |   val ner2 = Ner(0, 0, offStr + dist, 0, 1.0, "text", T_PERSON, GAZ, Some(ExtRef("Jane", List(2, 4))))
22 |   val ners = List(ner1, ner2)
23 |   
24 |   
25 |   "Proximity" should "find close ners" in {
26 |     // case class Doc(id: Long, content: Option[String], meta: Map[String, String], path: String, ner: List[Ner], embedded: List[Embedded])
27 |     val doc = Doc(0, Some("text"), Map.empty, "path", ners, List.empty)
28 |     val prox = new Proximity(cli, n => n.iterator)
29 |     prox.accDoc(doc)
30 |     for (x <- prox.nodeMap.values.asScala) log.info(s"$x")
31 |     for (x <- prox.edgeMap.entrySet.asScala) log.info(s"$x")
32 |     prox.nodeMap.size should be(2)
33 |     prox.edgeMap.size should be(1)
34 |     prox.edgeMap.get((0,1)) should be(Map("UNKNOWN" -> (weight, 1)))
35 | 
36 |     prox.accDoc(doc)
37 |     for (x <- prox.nodeMap.values.asScala) log.info(s"$x")
38 |     for (x <- prox.edgeMap.entrySet.asScala) log.info(s"$x")
39 |     prox.nodeMap.size should be(2)
40 |     prox.edgeMap.size should be(1)
41 |     prox.edgeMap.get((0,1)) should be(Map("UNKNOWN" -> (2*weight, 2)))
42 | 
43 |     // case class Embedded(content: Option[String], meta: Map[String, String], ner: List[Ner])
44 |     val emb = Embedded(Some("text"), Map.empty, ners)
45 |     val doc2 = doc.copy(embedded = List(emb))
46 |     prox.accDoc(doc2)
47 |     for (x <- prox.nodeMap.values.asScala) log.info(s"$x")
48 |     for (x <- prox.edgeMap.entrySet.asScala) log.info(s"$x")
49 |     prox.nodeMap.size should be(2)
50 |     prox.edgeMap.size should be(1)
51 |     prox.edgeMap.get((0,1)) should be(Map("UNKNOWN" -> (4*weight, 4)))
52 |   }
53 |   
54 | }


--------------------------------------------------------------------------------
/docker/.dockerignore:
--------------------------------------------------------------------------------
1 | Dockerfile-ubuntu
2 | Dockerfile-centos
3 | 


--------------------------------------------------------------------------------
/docker/Dockerfile-centos:
--------------------------------------------------------------------------------
 1 | FROM centos:latest 
 2 | 
 3 | ENV LANGUAGE=en
 4 | ENV LC_ALL=C
 5 | ENV LANG=C
 6 | 
 7 | # openblas and build tools are to build MITIE (used by dataFusion-ner)
 8 | # graphviz is for dependency graphs generated as part of the sbt build
 9 | # libreoffice, tesseract and ImageMagick are used by dataFusion-tika (and its unit tests)
10 | 
11 | RUN yum -y groupinstall 'Development Tools' && \
12 |   yum -y install https://dl.fedoraproject.org/pub/epel/epel-release-latest-7.noarch.rpm && \
13 |   yum -y install openblas-devel cmake swig git java-1.8.0-openjdk-devel graphviz libreoffice tesseract ImageMagick
14 | 
15 | RUN curl https://bintray.com/sbt/rpm/rpm > /etc/yum.repos.d/bintray-sbt-rpm.repo && \
16 |   yum -y install sbt
17 | 
18 | ENTRYPOINT ["bash"]
19 | 


--------------------------------------------------------------------------------
/docker/Dockerfile-ubuntu:
--------------------------------------------------------------------------------
 1 | FROM ubuntu:latest 
 2 | 
 3 | ENV LANGUAGE=en
 4 | ENV LC_ALL=C
 5 | ENV LANG=C
 6 | ENV JAVA_HOME=/usr/lib/jvm/java-8-openjdk-amd64/jre
 7 | 
 8 | # openblas and build tools are to build MITIE (used by dataFusion-ner)
 9 | # graphviz is for dependency graphs generated as part of the sbt build
10 | # libreoffice, tesseract and imagemagick are used by dataFusion-tika (and its unit tests)
11 | 
12 | RUN apt-get update && \
13 |   apt-get install -y gnupg curl unzip libopenblas-dev build-essential gfortran cmake swig git openjdk-8-jdk graphviz libreoffice tesseract-ocr tesseract-ocr-eng imagemagick && \
14 |   update-alternatives --set java /usr/lib/jvm/java-8-openjdk-amd64/jre/bin/java
15 | 
16 | RUN echo "deb https://dl.bintray.com/sbt/debian /" > /etc/apt/sources.list.d/sbt.list && \
17 |   apt-key adv --keyserver hkp://keyserver.ubuntu.com:80 --recv 2EE0EA64E40A89B84B2DF73499E82A75642AC823 && \
18 |   apt-get update && \
19 |   apt-get install -y sbt
20 | 
21 | ENTRYPOINT ["bash"]
22 | 


--------------------------------------------------------------------------------
/images/JSONFormatsUML.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/data61/dataFusion/4b766ae65254eed6c5a095f3bb8b6b42b9500a9c/images/JSONFormatsUML.png


--------------------------------------------------------------------------------
/images/dataFusion.zargo:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/data61/dataFusion/4b766ae65254eed6c5a095f3bb8b6b42b9500a9c/images/dataFusion.zargo


--------------------------------------------------------------------------------
/images/datafusion.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/data61/dataFusion/4b766ae65254eed6c5a095f3bb8b6b42b9500a9c/images/datafusion.png


--------------------------------------------------------------------------------
/images/network.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/data61/dataFusion/4b766ae65254eed6c5a095f3bb8b6b42b9500a9c/images/network.png


--------------------------------------------------------------------------------
/project/build.properties:
--------------------------------------------------------------------------------
1 | sbt.version=0.13.16
2 | 


--------------------------------------------------------------------------------
/project/plugins.sbt:
--------------------------------------------------------------------------------
 1 | addSbtPlugin("io.spray" % "sbt-revolver" % "0.8.0")
 2 | 
 3 | addSbtPlugin("com.typesafe.sbteclipse" % "sbteclipse-plugin" % "5.1.0")
 4 | 
 5 | addSbtPlugin("org.scala-sbt.plugins" % "sbt-onejar" % "0.8")
 6 | 
 7 | addSbtPlugin("net.virtual-void" % "sbt-dependency-graph" % "0.8.2")
 8 | 
 9 | addSbtPlugin("com.typesafe.sbt" % "sbt-license-report" % "1.2.0")
10 | 
11 | addSbtPlugin("com.github.gseitz" % "sbt-release" % "1.0.5")
12 | 


--------------------------------------------------------------------------------
/sh/dfus:
--------------------------------------------------------------------------------
 1 | #! /bin/bash
 2 | 
 3 | # error if these vars not set
 4 | : ${DFUS_DIR:?must be set to the location of the dataFusion source tree cloned from from https://github.com/data61/dataFusion/. \"source sh/setenv\" from the dataFusion directory to set the environment.}
 5 | : ${SCALA_VER:?must be set to the Scala version. \"source sh/setenv\" from the dataFusion directory to set the environment.}
 6 | : ${DFUS_VER:?must be set to the dataFusion version. \"source sh/setenv\" from the dataFusion directory to set the environment.}
 7 | 
 8 | while getopts ":hm:s:" opt; do
 9 |   case $opt in
10 |   m)
11 |     HEAP=-Xmx${OPTARG}G
12 |     ;;
13 |   s)
14 |     STACK=-Xss${OPTARG}M
15 |     ;;
16 |   h)
17 |     cat <<EoF
18 | Usage: ${0##*/} [-m max_heap_GB] [-s stack_MB] {tika|ner|search|util} [option]...
19 | EoF
20 |     exit 0
21 |     ;;
22 |   :)
23 |     echo "Option -$OPTARG requires an argument." >&2
24 |     exit -1
25 |     ;;
26 |   \?)
27 |     echo "Invalid option: -$OPTARG" >&2
28 |     exit -2
29 |     ;;
30 |   esac
31 | done
32 | 
33 | eval cmd=\$$OPTIND
34 | shift $OPTIND
35 | # echo $cmd "$@"
36 | 
37 | getJar() {
38 |   echo ${DFUS_DIR}/dataFusion-${1}/target/scala-${SCALA_VER}/datafusion-${1}_${SCALA_VER}-${DFUS_VER}-one-jar.jar
39 | }
40 | 
41 | java $HEAP $STACK -jar `getJar $cmd` "$@"
42 | 


--------------------------------------------------------------------------------
/sh/setenv.centos:
--------------------------------------------------------------------------------
 1 | #! /not/to/be/execed
 2 |  
 3 | # used by sh/dfus
 4 | export DFUS_DIR=${PWD}
 5 | export SCALA_VER=2.12
 6 | export DFUS_VER=1.1-SNAPSHOT
 7 | 
 8 | # needed by dataFusion-ner (including sbt tests)
 9 | export LD_LIBRARY_PATH=${DFUS_DIR}/dataFusion-ner/MITIE-native/centos   # directory containing libjavamitie.so
10 | export NER_MITIE_ENGLISH_MODEL=${DFUS_DIR}/dataFusion-ner/MITIE-models/english/ner_model.dat
11 | # export NER_MITIE_SPANISH_MODEL=${DFUS_DIR}/dataFusion-ner/MITIE-models/spanish/ner_model.dat
12 | 
13 | SEARCH_DIR=${DFUS_DIR}/dataFusion-search
14 | export SEARCH_SYNONYMS=${SEARCH_DIR}/synonyms.txt
15 | export SEARCH_DOC_INDEX=${SEARCH_DIR}/docIndex
16 | export SEARCH_META_INDEX=${SEARCH_DIR}/metaIndex
17 | export SEARCH_NER_INDEX=${SEARCH_DIR}/nerIndex
18 | 
19 | export PATH=${DFUS_DIR}/sh:/usr/sbin:/usr/bin:/sbin:/bin
20 | 
21 | cat <<EoF2
22 | Source this file with the dataFusion source tree top level dir as the current directory.
23 | It sets env vars for the project and sets a restricted PATH with dataFusion/sh as the highest priority item.
24 | 
25 | Try "dfus -h" to get started.
26 | EoF2
27 | 
28 | 


--------------------------------------------------------------------------------
/sh/setenv.ubuntu:
--------------------------------------------------------------------------------
 1 | #! /not/to/be/execed
 2 |  
 3 | # used by sh/dfus
 4 | export DFUS_DIR=${PWD}
 5 | export SCALA_VER=2.12
 6 | export DFUS_VER=1.1-SNAPSHOT
 7 | 
 8 | # needed by dataFusion-ner (including sbt tests)
 9 | export LD_LIBRARY_PATH=${DFUS_DIR}/dataFusion-ner/MITIE-native/ubuntu   # directory containing libjavamitie.so
10 | export NER_MITIE_ENGLISH_MODEL=${DFUS_DIR}/dataFusion-ner/MITIE-models/english/ner_model.dat
11 | # export NER_MITIE_SPANISH_MODEL=${DFUS_DIR}/dataFusion-ner/MITIE-models/spanish/ner_model.dat
12 | 
13 | SEARCH_DIR=${DFUS_DIR}/dataFusion-search
14 | export SEARCH_SYNONYMS=${SEARCH_DIR}/synonyms.txt
15 | export SEARCH_DOC_INDEX=${SEARCH_DIR}/docIndex
16 | export SEARCH_META_INDEX=${SEARCH_DIR}/metaIndex
17 | export SEARCH_NER_INDEX=${SEARCH_DIR}/nerIndex
18 | 
19 | export PATH=${DFUS_DIR}/sh:/usr/sbin:/usr/bin:/sbin:/bin
20 | 
21 | cat <<EoF2
22 | Source this file with the dataFusion source tree top level dir as the current directory.
23 | It sets env vars for the project and sets a restricted PATH with dataFusion/sh as the highest priority item.
24 | 
25 | Try "dfus -h" to get started.
26 | EoF2
27 | 
28 | 


--------------------------------------------------------------------------------
/sh/tesseract4.sh:
--------------------------------------------------------------------------------
 1 | #! /bin/bash
 2 | 
 3 | # Tika calls tesseract 3 with args: imageInFile txtOutFile -l eng -psm 1 txt -c preserve_interword_spaces=0
 4 | # Tesseract 4 no longer accepts txt (which is the default anyway) so filter that out
 5 | declare -a args
 6 | for a in "$@"; do
 7 |   [[ "$a" != txt ]] && args+=("$a")
 8 | done
 9 | 
10 | # if manually built and installed
11 | # ROOT=/usr/local
12 | ROOT=/usr
13 | # put *.traindata files under $ROOT/share
14 | # mv $ROOT/share/tessdata/pdf.ttf $ROOT/share
15 | LD_LIBRARY_PATH=$ROOT/lib TESSDATA_PREFIX=$ROOT/share/tessdata $ROOT/bin/tesseract "${args[@]}"
16 | 
17 | # if installed from ppa:alex-p/tesseract-ocr
18 | # /usr/bin/tesseract "${args[@]}"
19 | 


--------------------------------------------------------------------------------
/ui/bubble/css/index.css:
--------------------------------------------------------------------------------
  1 | body {
  2 |   font-family: Verdana, sans-serif;
  3 |   margin: 0;
  4 |   padding: 0;
  5 |   position: relative;
  6 |   height: 100%;
  7 |   color: #262626;
  8 |   font-size: 0.8rem;
  9 | }
 10 | 
 11 | a {
 12 |   color: #880E4F;
 13 |   text-decoration: none;
 14 | }
 15 | 
 16 | header {
 17 |   background: #f7f7f7;
 18 |   padding: 1.2rem;
 19 | }
 20 | 
 21 | h1 {
 22 |   font-size: 1rem;
 23 |   font-weight: bold;
 24 |   margin: 0;
 25 |   padding: 0;
 26 | }
 27 | 
 28 | #logos {
 29 |   position: absolute;
 30 |   right: 1.2rem;
 31 |   top: 0.4rem;
 32 | }
 33 | 
 34 | #logos img {
 35 |   height: 2.8rem;
 36 | }
 37 | 
 38 | #container {
 39 |   height: calc(98vh - 60px);
 40 |   width: calc(98vh - 60px);
 41 |   margin-left: 150px;
 42 |   position: relative;
 43 | }
 44 | 
 45 | #info {
 46 |   border-radius: 10px;
 47 |   position: absolute;
 48 |   right: 0;
 49 |   top: 0;
 50 |   min-height: 300px;
 51 |   width: calc(98vw - 98vh - 250px);
 52 |   max-width: 500px;
 53 |   margin: 5rem 40px 40px;
 54 |   background: #f7f7f7;
 55 |   border: 1px solid lightgray;
 56 |   visibility: hidden;
 57 |   padding: 20px;
 58 |   font-size: 14px;
 59 | }
 60 | 
 61 | #name {
 62 |   border-bottom: 1px solid lightgray;
 63 |   padding-bottom: 12px;
 64 |   font-size: 16px;
 65 | }
 66 | 
 67 | #desc {
 68 |   word-wrap: break-word;
 69 | }
 70 | 
 71 | #legend {
 72 |   position: absolute;
 73 |   top: 0;
 74 |   left: -120px;
 75 |   width: 110px;
 76 |   font-size: 12px;
 77 |   list-style: none;
 78 |   margin: 20px 0 0;
 79 |   padding: 0;
 80 | }
 81 | 
 82 | #legend li {
 83 |   padding: 4px;
 84 | }
 85 | 
 86 | #legend li svg {
 87 |   vertical-align: middle;
 88 | }
 89 | 
 90 | #fetch-graph {
 91 |   left: 20rem;
 92 | }
 93 | 
 94 | #vis-opts {
 95 |   left: 30rem;
 96 | }
 97 | 
 98 | .graph-form {
 99 |   position: absolute;
100 |   top: 0rem;
101 | }
102 | 
103 | .graph-form h2 {
104 |   cursor: pointer;
105 |   font-size: 1rem;
106 |   font-weight: normal;
107 |   padding: 1.2rem 0;
108 |   margin: 0;
109 | }
110 | 
111 | .graph-form h2:after {
112 |   content: "^";
113 |   display: inline-block;
114 |   margin-left: 0.5rem;
115 |   transform: rotate(0deg) translateY(0.25rem);
116 | }
117 | 
118 | .graph-form.closed h2:after {
119 |   transform: rotate(180deg);
120 | }
121 | 
122 | .graph-form.closed form {
123 |   display: none;
124 | }
125 | 
126 | /* Form styles */
127 | form {
128 |   border-radius: 10px;
129 |   background: rgba(220, 220, 220, 0.95);
130 |   border: 1px solid gray;
131 | }
132 | 
133 | fieldset {
134 |   border: none;
135 |   margin: 0;
136 |   padding: 0.6rem 1rem;
137 | }
138 | 
139 | legend {
140 |   font-weight: bold;
141 |   padding: 0 0 0.4rem;
142 | }
143 | 
144 | label {
145 |   padding-right: 0.2rem;
146 | }
147 | 
148 | input[type=number] {
149 |   width: 3rem;
150 | }
151 | 
152 | #network-chart .links line {
153 |   stroke: #999;
154 |   stroke-opacity: 0.6;
155 | }
156 | 
157 | #network-chart .nodes circle {
158 |   stroke: #fff;
159 |   stroke-width: 1.5px;
160 | }
161 | 


--------------------------------------------------------------------------------
/ui/bubble/images/csiro-black.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/data61/dataFusion/4b766ae65254eed6c5a095f3bb8b6b42b9500a9c/ui/bubble/images/csiro-black.png


--------------------------------------------------------------------------------
/ui/bubble/images/data61-logo.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/data61/dataFusion/4b766ae65254eed6c5a095f3bb8b6b42b9500a9c/ui/bubble/images/data61-logo.png


--------------------------------------------------------------------------------
/ui/bubble/js/d3-selection-multi.v1.min.js:
--------------------------------------------------------------------------------
1 | // https://github.com/d3/d3-selection-multi Version 1.0.1. Copyright 2017 Mike Bostock.
2 | !function(t,n){"object"==typeof exports&&"undefined"!=typeof module?n(require("d3-selection"),require("d3-transition")):"function"==typeof define&&define.amd?define(["d3-selection","d3-transition"],n):n(t.d3,t.d3)}(this,function(t,n){"use strict";function r(n,r){return n.each(function(){var n=r.apply(this,arguments),e=t.select(this);for(var i in n)e.attr(i,n[i])})}function e(t,n){for(var r in n)t.attr(r,n[r]);return t}function i(n,r,e){return n.each(function(){var n=r.apply(this,arguments),i=t.select(this);for(var o in n)i.style(o,n[o],e)})}function o(t,n,r){for(var e in n)t.style(e,n[e],r);return t}function f(n,r){return n.each(function(){var n=r.apply(this,arguments),e=t.select(this);for(var i in n)e.property(i,n[i])})}function u(t,n){for(var r in n)t.property(r,n[r]);return t}function s(n,r){return n.each(function(){var e=r.apply(this,arguments),i=t.select(this).transition(n);for(var o in e)i.attr(o,e[o])})}function c(t,n){for(var r in n)t.attr(r,n[r]);return t}function a(n,r,e){return n.each(function(){var i=r.apply(this,arguments),o=t.select(this).transition(n);for(var f in i)o.style(f,i[f],e)})}function p(t,n,r){for(var e in n)t.style(e,n[e],r);return t}var l=function(t){return("function"==typeof t?r:e)(this,t)},y=function(t,n){return("function"==typeof t?i:o)(this,t,null==n?"":n)},h=function(t){return("function"==typeof t?f:u)(this,t)},v=function(t){return("function"==typeof t?s:c)(this,t)},d=function(t,n){return("function"==typeof t?a:p)(this,t,null==n?"":n)};t.selection.prototype.attrs=l,t.selection.prototype.styles=y,t.selection.prototype.properties=h,n.transition.prototype.attrs=v,n.transition.prototype.styles=d});
3 | 


--------------------------------------------------------------------------------
/ui/bubble/js/index.js:
--------------------------------------------------------------------------------
 1 | let colourMap = {
 2 |   "ORGANIZATION": "#880E4F",
 3 |   "PERSON": "#BF360C",
 4 |   "PERSON2": "#BF360C",
 5 |   "FROM": "#311B92",
 6 |   "TO": "#311B92",
 7 |   "BCC": "#311B92"
 8 | }
 9 | 
10 | let size = Math.min(window.innerHeight, window.innerWidth)
11 | let width = 500
12 | let height = 500
13 | 
14 | function displayDataSidebar (data) {
15 |   let info = d3.select("#info")
16 |   let title = document.querySelector("#name")
17 |   let desc = document.querySelector("#desc")
18 | 
19 |   info.style("visibility", "visible")
20 |   title.innerHTML = data.title
21 |   desc.innerHTML = data.desc
22 | }
23 | 
24 | function hideDataSidebar () {
25 |   let info = d3.select("#info")
26 |   info.style("visibility", "hidden")
27 | }
28 | 
29 | // Form switcher
30 | 
31 | function toggleClosedClass (el, forceClose) {
32 |   let cn = el.parentNode.className
33 |   let closedIdx = cn.indexOf(" closed")
34 |   if (closedIdx > -1 && !forceClose) {
35 |     cn = cn.slice(0, closedIdx)
36 |   } else {
37 |     cn = cn + " closed"
38 |   }
39 |   el.parentNode.className = cn
40 | }
41 | 
42 | let clickers = Array.prototype.slice.call(document.querySelectorAll(".graph-form h2"))
43 | 
44 | clickers.forEach((clicker, idx, arr) => {
45 |   clicker.onclick = function() {
46 |     return function (el, idx) {
47 |       let others = clickers.filter((el, currIdx, arr) => idx !== currIdx)
48 |       others.forEach(clicker => toggleClosedClass(clicker, true))
49 |       toggleClosedClass(el, false)
50 |     }(this, idx)
51 |   }
52 | })
53 | 
54 | // Form actions
55 | document.querySelector("#fetchForm").onsubmit = evt => {
56 |   evt.preventDefault()
57 |   getGraph()
58 | }
59 | 
60 | document.querySelector("#optsForm").onsubmit = evt => {
61 |   evt.preventDefault()
62 |   drawGraph()
63 | }
64 | 


--------------------------------------------------------------------------------
/ui/network/graph.css:
--------------------------------------------------------------------------------
1 | input[type=text] {
2 |   width: 60px;
3 | }
4 | 


--------------------------------------------------------------------------------
/ui/network/index.html:
--------------------------------------------------------------------------------
 1 | <!DOCTYPE html>
 2 | <meta charset="utf-8">
 3 | <link rel="stylesheet" href="graph.css">
 4 | <style>
 5 | .links line {
 6 |   stroke: #999;
 7 |   stroke-opacity: 0.6;
 8 | }
 9 | 
10 | .nodes circle {
11 |   stroke: #fff;
12 |   stroke-width: 1.5px;
13 | }
14 | </style>
15 | <svg width="960" height="600"></svg>
16 | <div>
17 | <label for="includePerson2">include person nodes using only first and family names</label><input type="checkbox" id="includePerson2" name="includePerson2" />
18 | <br>
19 | <span>collections:</span>
20 | <label for="colCbc">CBC</label><input type="checkbox" id="colCbc" name="colCbc" />
21 | <label for="colChanley">Chanley</label><input type="checkbox" id="colChanley" name="colChanley" />
22 | <label for="colMascot">MASCOT</label><input type="checkbox" id="colMascot" name="colMascot" />
23 | <label for="colRtp">RTP</label><input type="checkbox" id="colRtp" name="colRtp" />
24 | <label for="colRulings">rulings</label><input type="checkbox" id="colRulings" name="colRulings" />
25 | <br>
26 | <label for="chargeStrength">charge strength</label><input type="text" id="chargeStrength" name="chargeStrength" value="-3" />
27 | <br>
28 | <span>distance range:</span>
29 | <label for="distanceLogScale">log scale</label><input type="checkbox" id="distanceLogScale" name="distanceLogScale" />
30 | <label for="distanceFrom">from</label><input type="text" id="distanceFrom" name="distanceFrom" value="20" />
31 | <label for="distanceTo">to</label><input type="text" id="distanceTo" name="distanceTo" value="300" />
32 | <br>
33 | <span>node radius range:</span>
34 | <label for="nodeRadiusLogScale">log scale</label><input type="checkbox" id="nodeRadiusLogScale" name="nodeRadiusLogScale" />
35 | <label for="nodeRadiusFrom">from</label><input type="text" id="nodeRadiusFrom" name="nodeRadiusFrom" value="2" />
36 | <label for="nodeRadiusTo">to</label><input type="text" id="nodeRadiusTo" name="nodeRadiusTo" value="10" />
37 | <br>
38 | <span>edge width range:</span>
39 | <label for="edgeWidthLogScale">log scale</label><input type="checkbox" id="edgeWidthLogScale" name="edgeWidthLogScale" />
40 | <label for="edgeWidthFrom">from</label><input type="text" id="edgeWidthFrom" name="edgeWidthFrom" value="1" />
41 | <label for="edgeWidthTo">to</label><input type="text" id="edgeWidthTo" name="edgeWidthTo" value="4" />
42 | <br>
43 | <label for="maxEdges">maxEdges</label><input type="text" id="maxEdges" name="maxEdges" value="100" />
44 | <label for="minScore">minScore</label><input type="text" id="minScore" name="minScore" value="4.5" />
45 | <label for="port">graph-service port</label><input type="text" id="port" name="port" value="8089" />
46 | <br>
47 | <button type="button" onclick="topConnected();">Top Connected</button>
48 | <br>
49 | <label for="nodeId">nodeId</label><input type="text" id="nodeId" name="nodeId" />
50 | <label for="extRefId">extRefId</label><input type="text" id="extRefId" name="extRefId" />
51 | <label for="maxHops">maxHops</label><input type="text" id="maxHops" name="maxHops" value="2" />
52 | <button type="button" onclick="localNetwork();">Local Network</button>
53 | </div>
54 | <script src="d3.v4.min.js"></script>
55 | <script src="graph.js"></script>
56 | 


--------------------------------------------------------------------------------
/version.sbt:
--------------------------------------------------------------------------------
1 | version in ThisBuild := "1.1-SNAPSHOT"
2 | 


--------------------------------------------------------------------------------