├── .gitignore ├── LICENSE.txt ├── README.md ├── build.sbt ├── build.sh ├── dataFusion-common ├── 3rd-party-licenses.md ├── README.md ├── build.sbt ├── doc │ └── dataFusion.zargo └── src │ ├── main │ └── scala │ │ └── au │ │ └── csiro │ │ └── data61 │ │ └── dataFusion │ │ └── common │ │ ├── CSV.scala │ │ ├── Data.scala │ │ ├── EnglishScore.scala │ │ ├── Parallel.scala │ │ ├── Timer.scala │ │ └── Util.scala │ └── test │ ├── resources │ └── logback-test.xml │ └── scala │ └── au │ └── csiro │ └── data61 │ └── dataFusion │ └── common │ ├── EnglishScoreTest.scala │ ├── JsonTest.scala │ ├── ParallelTest.scala │ └── UtilTest.scala ├── dataFusion-db-service ├── 3rd-party-licenses.md ├── README.md ├── build.sbt └── src │ ├── main │ ├── resources │ │ ├── application.conf │ │ └── logback.xml │ └── scala │ │ └── au │ │ └── csiro │ │ └── data61 │ │ └── dataFusion │ │ └── db │ │ └── service │ │ ├── DbService.scala │ │ └── Main.scala │ └── test │ └── resources │ └── logback-test.xml ├── dataFusion-db ├── 3rd-party-licenses.md ├── README.md ├── build.sbt └── src │ ├── main │ ├── resources │ │ ├── application.conf │ │ └── logback.xml │ └── scala │ │ └── au │ │ └── csiro │ │ └── data61 │ │ └── dataFusion │ │ └── db │ │ ├── Main.scala │ │ └── Tables.scala │ └── test │ └── resources │ └── logback-test.xml ├── dataFusion-graph-service ├── 3rd-party-licenses.md ├── README.md ├── build.sbt └── src │ ├── main │ ├── resources │ │ ├── application.conf │ │ └── logback.xml │ └── scala │ │ └── au │ │ └── csiro │ │ └── data61 │ │ └── dataFusion │ │ └── graph │ │ └── service │ │ └── Main.scala │ └── test │ ├── resources │ ├── edge.json │ ├── logback-test.xml │ └── node.json │ └── scala │ └── au │ └── csiro │ └── data61 │ └── dataFusion │ └── graph │ └── service │ └── MainTest.scala ├── dataFusion-ner-service ├── 3rd-party-licenses.md ├── README.md ├── build.sbt └── src │ └── main │ ├── resources │ ├── application.conf │ └── logback.xml │ └── scala │ └── au │ └── csiro │ └── data61 │ └── dataFusion │ └── ner │ └── service │ └── Main.scala ├── dataFusion-ner ├── 3rd-party-licenses.md ├── MITIE-native │ ├── centos │ │ └── libjavamitie.so │ └── ubuntu │ │ └── libjavamitie.so ├── README.md ├── build-MITIE.sh ├── build.sbt ├── lib │ └── javamitie.jar └── src │ ├── main │ ├── resources │ │ ├── application.conf │ │ ├── logback.xml │ │ └── opennlp-models-1.5 │ │ │ ├── en-ner-date.bin │ │ │ ├── en-ner-location.bin │ │ │ ├── en-ner-money.bin │ │ │ ├── en-ner-organization.bin │ │ │ ├── en-ner-percentage.bin │ │ │ ├── en-ner-person.bin │ │ │ ├── en-ner-time.bin │ │ │ ├── en-sent.bin │ │ │ └── en-token.bin │ └── scala │ │ └── au │ │ └── csiro │ │ └── data61 │ │ └── dataFusion │ │ └── ner │ │ ├── CoreNLP.scala │ │ ├── MITIE.scala │ │ ├── Main.scala │ │ ├── OpenNLP.scala │ │ └── Split.scala │ └── test │ ├── resources │ └── logback-test.xml │ └── scala │ └── au │ └── csiro │ └── data61 │ └── dataFusion │ └── ner │ ├── CoreNLPTest.scala │ ├── MITIETest.scala │ ├── OpenNLPTest.scala │ └── SplitTest.scala ├── dataFusion-search-service ├── 3rd-party-licenses.md ├── README.md ├── build.sbt └── src │ ├── main │ ├── resources │ │ ├── application.conf │ │ └── logback.xml │ └── scala │ │ └── au │ │ └── csiro │ │ └── data61 │ │ └── dataFusion │ │ └── search │ │ └── service │ │ └── Main.scala │ └── test │ └── resources │ └── logback-test.xml ├── dataFusion-search ├── 3rd-party-licenses.md ├── README.md ├── build.sbt ├── src │ ├── main │ │ ├── resources │ │ │ ├── application.conf │ │ │ └── logback.xml │ │ └── scala │ │ │ └── au │ │ │ └── csiro │ │ 
│ └── data61 │ │ │ └── dataFusion │ │ │ └── search │ │ │ ├── DataFusionLucene.scala │ │ │ ├── DocFreq.scala │ │ │ ├── Indexer.scala │ │ │ ├── LuceneUtil.scala │ │ │ ├── Main.scala │ │ │ └── Search.scala │ └── test │ │ ├── resources │ │ └── logback-test.xml │ │ └── scala │ │ └── au │ │ └── csiro │ │ └── data61 │ │ └── dataFusion │ │ └── search │ │ ├── DataFusionLuceneTest.scala │ │ ├── JsonTest.scala │ │ └── SearchTest.scala └── synonyms.txt ├── dataFusion-tika-service ├── 3rd-party-licenses.md ├── README.md ├── build.sbt └── src │ ├── main │ ├── resources │ │ ├── application.conf │ │ └── logback.xml │ └── scala │ │ └── au │ │ └── csiro │ │ └── data61 │ │ └── dataFusion │ │ └── tika │ │ └── service │ │ └── Main.scala │ └── test │ └── resources │ └── logback-test.xml ├── dataFusion-tika ├── 3rd-party-licenses.md ├── README.md ├── build.sbt └── src │ ├── main │ ├── java │ │ └── org │ │ │ └── apache │ │ │ └── tika │ │ │ └── parser │ │ │ └── ocr │ │ │ └── TesseractOCRParser.java │ ├── resources │ │ ├── META-INF │ │ │ └── services │ │ │ │ ├── javax.imageio.spi.ImageReaderSpi │ │ │ │ └── javax.imageio.spi.ImageWriterSpi │ │ ├── application.conf │ │ ├── logback.xml │ │ └── org │ │ │ └── apache │ │ │ └── tika │ │ │ └── parser │ │ │ ├── ocr │ │ │ ├── TesseractOCRConfig.properties │ │ │ └── rotation.py │ │ │ └── pdf │ │ │ └── PDFParser.properties │ └── scala │ │ └── au │ │ └── csiro │ │ └── data61 │ │ └── dataFusion │ │ └── tika │ │ ├── LangDetect.scala │ │ ├── Main.scala │ │ └── TikaUtil.scala │ └── test │ ├── resources │ ├── exampleData │ │ ├── AAA.pptx │ │ ├── Email001.msg │ │ ├── PDF001.pdf │ │ ├── PDF002.pdf │ │ ├── PDF003.pdf │ │ ├── PDF004.pdf │ │ ├── README.txt │ │ ├── TIF001.tif │ │ ├── TIF002.tif │ │ ├── TIF003.tif │ │ ├── Thumbs.db │ │ ├── data-prob-2-12.XLS │ │ ├── doc001.doc │ │ ├── doc002.doc │ │ ├── html001.html │ │ ├── image001.png │ │ ├── image002.gif │ │ ├── image003.jpeg │ │ ├── image004.png │ │ ├── rtf001.rtf │ │ └── xls001.xls │ └── logback-test.xml │ └── scala │ └── au │ └── csiro │ └── data61 │ └── dataFusion │ └── tika │ └── TikaTest.scala ├── dataFusion-util ├── 3rd-party-licenses.md ├── README.md ├── build.sbt └── src │ ├── main │ ├── resources │ │ ├── application.conf │ │ └── logback.xml │ └── scala │ │ └── au │ │ └── csiro │ │ └── data61 │ │ └── dataFusion │ │ └── util │ │ ├── Age.scala │ │ ├── Email.scala │ │ ├── Hits.scala │ │ ├── Main.scala │ │ ├── Proximity.scala │ │ └── TmNer.scala │ └── test │ ├── resources │ └── logback-test.xml │ └── scala │ └── au │ └── csiro │ └── data61 │ └── dataFusion │ └── util │ ├── AgeTest.scala │ ├── EmailTest.scala │ ├── HitsTest.scala │ └── ProximityTest.scala ├── docker ├── .dockerignore ├── Dockerfile-centos └── Dockerfile-ubuntu ├── images ├── JSONFormatsUML.png ├── JSONFormatsUML.svg ├── dataFusion.zargo ├── datafusion.png ├── datafusion.svg └── network.png ├── project ├── build.properties └── plugins.sbt ├── sh ├── dfus ├── setenv.centos ├── setenv.ubuntu └── tesseract4.sh ├── ui ├── README.md ├── bubble │ ├── css │ │ └── index.css │ ├── data │ │ └── data.json │ ├── images │ │ ├── csiro-black.png │ │ └── data61-logo.png │ ├── index.html │ └── js │ │ ├── bubble.js │ │ ├── d3-selection-multi.v1.min.js │ │ ├── d3.v4.min.js │ │ ├── form.js │ │ ├── index.js │ │ └── network.js └── network │ ├── d3.v4.min.js │ ├── graph.css │ ├── graph.js │ └── index.html └── version.sbt /.gitignore: -------------------------------------------------------------------------------- 1 | # Logs 2 | logs 3 | *.log 4 | npm-debug.log* 5 | 6 | # Runtime data 7 | 
pids 8 | *.pid 9 | *.seed 10 | 11 | # Directory for instrumented libs generated by jscoverage/JSCover 12 | lib-cov 13 | 14 | # Coverage directory used by tools like istanbul 15 | coverage 16 | 17 | # nyc test coverage 18 | .nyc_output 19 | 20 | # Grunt intermediate storage (http://gruntjs.com/creating-plugins#storing-task-files) 21 | .grunt 22 | 23 | # node-waf configuration 24 | .lock-wscript 25 | 26 | # Compiled binary addons (http://nodejs.org/api/addons.html) 27 | build/Release 28 | 29 | # Dependency directories 30 | node_modules 31 | jspm_packages 32 | 33 | # Optional npm cache directory 34 | .npm 35 | 36 | # Optional REPL history 37 | .node_repl_history 38 | 39 | # sbt and eclipse 40 | .classpath 41 | .project 42 | .settings/ 43 | bin/ 44 | test-bin/ 45 | .cache-main 46 | .cache-tests 47 | target/ 48 | 49 | # project generated CSV files, NER models, Lucene indices, H2 test database etc. 50 | dataFusion-ner/MITIE/ 51 | dataFusion-ner/MITIE-models/ 52 | *.csv 53 | dataFusion-search/*Index/ 54 | dataFusion.mv.db 55 | 56 | ui/swagger-ui-3.3.2/ 57 | -------------------------------------------------------------------------------- /build.sh: -------------------------------------------------------------------------------- 1 | #! /bin/sh 2 | 3 | set -vex 4 | 5 | # ubuntu has: ID=ubuntu but centos has: ID="centos" 6 | OS=`sed --regexp-extended --quiet 's/^ID="?([a-z]+)"?$/\1/p' /etc/os-release` 7 | 8 | # build MITIE (native code used by dataFusion-ner) 9 | # do as little as necessary by default, add --clean option to do everything from scratch 10 | cd dataFusion-ner 11 | ./build-MITIE.sh # --clean 12 | cd .. 13 | 14 | # set environment 15 | . ./sh/setenv.$OS 16 | 17 | # run Scala build 18 | sbt one-jar # minimal, or 19 | # sbt -J-Xmx3G clean test publish-local one-jar dumpLicenseReport # the works 20 | # move/rename the license reports 21 | # for i in */target/license-reports/*.md; do cp $i ${i%%/*}/3rd-party-licenses.md; done 22 | 23 | -------------------------------------------------------------------------------- /dataFusion-common/3rd-party-licenses.md: -------------------------------------------------------------------------------- 1 | # datafusion-common-licenses 2 | 3 | Category | License | Dependency | Notes 4 | --- | --- | --- | --- 5 | Apache | [Apache 2](http://www.apache.org/licenses/LICENSE-2.0.txt) | io.spray # spray-json_2.12 # 1.3.3 | 6 | Apache | [Apache 2.0 License](http://www.apache.org/licenses/LICENSE-2.0.html) | com.typesafe.scala-logging # scala-logging_2.12 # 3.5.0 | 7 | Apache | [the Apache License, ASL Version 2.0](http://www.apache.org/licenses/LICENSE-2.0) | org.scalactic # scalactic_2.12 # 3.0.3 | 8 | Apache | [the Apache License, ASL Version 2.0](http://www.apache.org/licenses/LICENSE-2.0) | org.scalatest # scalatest_2.12 # 3.0.3 | 9 | BSD | [BSD 3-Clause](http://www.scala-lang.org/license.html) | org.scala-lang # scala-library # 2.12.2 | 10 | BSD | [BSD 3-Clause](http://www.scala-lang.org/license.html) | org.scala-lang # scala-reflect # 2.12.2 | 11 | BSD | [BSD 3-clause](http://opensource.org/licenses/BSD-3-Clause) | org.scala-lang.modules # scala-parser-combinators_2.12 # 1.0.4 | 12 | BSD | [BSD 3-clause](http://opensource.org/licenses/BSD-3-Clause) | org.scala-lang.modules # scala-xml_2.12 # 1.0.5 | 13 | LGPL | [EPL + GNU Lesser General Public License](http://logback.qos.ch/license.html) | ch.qos.logback # logback-classic # 1.2.3 | 14 | LGPL | [EPL + GNU Lesser General Public License](http://logback.qos.ch/license.html) | ch.qos.logback # logback-core # 1.2.3 | 
15 | MIT | [MIT License](http://www.slf4j.org/license.html) | org.slf4j # slf4j-api # 1.7.25 |
16 | 
17 | 
--------------------------------------------------------------------------------
/dataFusion-common/build.sbt:
--------------------------------------------------------------------------------
 1 | name := "dataFusion-common"
 2 | 
 3 | libraryDependencies ++= Seq(
 4 |   "io.spray" %% "spray-json" % "1.3.3",
 5 |   // "io.swagger" % "swagger-annotations" % "1.5.12",
 6 |   "com.typesafe.scala-logging" %% "scala-logging" % "3.5.0",
 7 |   "ch.qos.logback" % "logback-classic" % "1.2.3",
 8 |   "org.scalatest" %% "scalatest" % "3.0.3" % "test"
 9 | )
10 | 
--------------------------------------------------------------------------------
/dataFusion-common/doc/dataFusion.zargo:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/data61/dataFusion/4b766ae65254eed6c5a095f3bb8b6b42b9500a9c/dataFusion-common/doc/dataFusion.zargo
--------------------------------------------------------------------------------
/dataFusion-common/src/main/scala/au/csiro/data61/dataFusion/common/CSV.scala:
--------------------------------------------------------------------------------
 1 | package au.csiro.data61.dataFusion.common
 2 | 
 3 | object CSV {
 4 | 
 5 |   /**
 6 |    * return the indices in the CSV header of the requested fields (e.g. id, organisation name and a person's family, first given and other given names).
 7 |    * @param hdr the header line from the CSV file
 8 |    */
 9 |   def csvHeaderToIndices(delim: Char, fields: Seq[String], hdr: String): Seq[Int] = {
10 |     val hdrs = hdr.toUpperCase.split(delim)
11 |     val fieldsUp = fields.map(_.toUpperCase)
12 |     val idx = fieldsUp map hdrs.indexOf
13 |     val missing = for ((f, i) <- fields zip idx if i == -1) yield f
14 |     if (!missing.isEmpty) throw new Exception(s"CSV header is missing fields: ${missing.mkString(",")}")
15 |     idx
16 |   }
17 | 
18 |   /**
19 |    * Process the header line from iter and return a function to map the remaining lines to a seq of string data in the same order as fields.
20 |    * (Done this way to allow the function to be applied to different lines in parallel).
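   * Illustrative usage (added example; the header and field names below are made up, not from the dataFusion code):
   * {{{
   * val lines = Iterator("ID,FAMILY_NAME,FIRST_GIVEN_NAME", "1,Smith,Jo")  // first line is the header
   * val fieldData = mkFieldData(',', Seq("ID", "FAMILY_NAME", "FIRST_GIVEN_NAME"), lines)
   * lines.map(fieldData).toList  // List(Seq("1", "SMITH", "JO")) - values are upper-cased and trimmed
   * }}}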
21 | */ 22 | def mkFieldData(delim: Char, fields: Seq[String], iter: Iterator[String]): String => Seq[String] = { 23 | if (iter.hasNext) { 24 | val idx = csvHeaderToIndices(delim, fields, iter.next) 25 | val reqLen = idx.max + 1 26 | line => 27 | val d = line.toUpperCase.split(delim).toIndexedSeq.padTo(reqLen, "") 28 | idx.map(d(_).trim) 29 | } else _ => Seq.empty 30 | } 31 | } -------------------------------------------------------------------------------- /dataFusion-common/src/main/scala/au/csiro/data61/dataFusion/common/Data.scala: -------------------------------------------------------------------------------- 1 | package au.csiro.data61.dataFusion.common 2 | 3 | import spray.json.DefaultJsonProtocol 4 | 5 | object Data { 6 | val T_PERSON = "PERSON" 7 | val T_PERSON2 = "PERSON2" // PHits.typ for a search on just family & first given names (not using other) 8 | val T_ORGANIZATION = "ORGANIZATION" // Z is consistent with NER implementations 9 | 10 | val GAZ = "D61GAZ" // Ner.impl for search hits 11 | val EMAIL = "D61EMAIL" // Ner.impl for names parsed from email headers 12 | 13 | /** pos{Str,End} are token indices 14 | * off{Str,End} are character offsets 15 | * {pos,off}Str is included, {pos,off}End is excluded (first token/char not included) 16 | */ 17 | case class ExtRef(name: String, ids: List[Long]) 18 | case class Ner(posStr: Int, posEnd: Int, offStr: Int, offEnd: Int, score: Double, text: String, typ: String, impl: String, extRef: Option[ExtRef]) 19 | 20 | /** metadata key for language code e.g. "en" or "es" */ 21 | val META_LANG_CODE = "language-code" 22 | val META_LANG_PROB = "language-prob" 23 | val META_EN_SCORE = "english-score" 24 | 25 | case class Embedded(content: Option[String], meta: Map[String, String], ner: List[Ner]) 26 | case class Doc(id: Long, content: Option[String], meta: Map[String, String], path: String, ner: List[Ner], embedded: List[Embedded]) 27 | 28 | // collection -> (weight, count) 29 | type WeightMap = Map[String, (Double, Int)] 30 | 31 | // sourceNodeId, targetNodeId -> Scores 32 | case class Node(nodeId: Int, extRef: ExtRef, score: Double, typ: String) 33 | case class Edge(source: Int, target: Int, weights: WeightMap, typ: String) 34 | case class NodeEdgeCount(nodeId: Int, numEdges: Int) 35 | 36 | val EMB_IDX_MAIN = -1 // a searchable value for embIdx to represent main content - not embedded 37 | case class IdEmbIdx(id: Long, embIdx: Int) 38 | 39 | case class Stats(totalHits: Int, elapsedSecs: Float) 40 | case class PosInfo(posStr: Int, posEnd: Int, offStr: Int, offEnd: Int) 41 | case class LPosDoc(idEmbIdx: IdEmbIdx, posInfos: List[PosInfo]) 42 | case class PHits(stats: Stats, hits: List[LPosDoc], error: Option[String], extRef: ExtRef, score: Double, typ: String) 43 | 44 | case class LDoc(idEmbIdx: IdEmbIdx, content: String, path: String) 45 | case class LMeta(idEmbIdx: IdEmbIdx, key: String, `val`: String) 46 | case class LNer(idEmbIdx: IdEmbIdx, posStr: Int, posEnd: Int, offStr: Int, offEnd: Int, text: String, typ: String, impl: String) 47 | 48 | case class Query(query: String, numHits: Int) 49 | case class DHits(stats: Stats, hits: List[(Float, LDoc)], error: Option[String]) 50 | case class MHits(stats: Stats, hits: List[(Float, LMeta)], error: Option[String]) 51 | case class NHits(stats: Stats, hits: List[(Float, LNer)], error: Option[String]) 52 | 53 | case class PosQuery(extRef: ExtRef, typ: String) 54 | case class PosMultiQuery(queries: List[PosQuery]) 55 | case class PMultiHits(pHits: List[PHits]) 56 | 57 | object JsonProtocol extends 
DefaultJsonProtocol {
58 |     implicit val extRefFormat = jsonFormat2(ExtRef)
59 |     implicit val nerFormat = jsonFormat9(Ner)
60 |     implicit val embeddedFormat = jsonFormat3(Embedded)
61 |     implicit val docFormat = jsonFormat6(Doc)
62 | 
63 |     implicit val nodeFormat = jsonFormat4(Node)
64 |     implicit val edgeFormat = jsonFormat4(Edge)
65 |     implicit val clientEdgeCountFormat = jsonFormat2(NodeEdgeCount)
66 |     implicit val idEmbIdxCodec = jsonFormat2(IdEmbIdx)
67 | 
68 |     implicit val statsCodec = jsonFormat2(Stats)
69 |     implicit val posInfoCodec = jsonFormat4(PosInfo)
70 |     implicit val lposDocCodec = jsonFormat2(LPosDoc)
71 |     implicit val pHitsCodec = jsonFormat6(PHits)
72 | 
73 |     implicit val ldocCodec = jsonFormat3(LDoc)
74 |     implicit val lmetaCodec = jsonFormat3(LMeta)
75 |     implicit val lnerCodec = jsonFormat8(LNer)
76 | 
77 |     implicit val queryCodec = jsonFormat2(Query)
78 |     implicit val dHitsCodec = jsonFormat3(DHits)
79 |     implicit val mHitsCodec = jsonFormat3(MHits)
80 |     implicit val nHitsCodec = jsonFormat3(NHits)
81 | 
82 |     implicit val posQueryCodec = jsonFormat2(PosQuery)
83 |     implicit val posMultiQueryCodec = jsonFormat1(PosMultiQuery)
84 |     implicit val pMultiHitsCodec = jsonFormat1(PMultiHits)
85 |   }
86 | }
--------------------------------------------------------------------------------
/dataFusion-common/src/main/scala/au/csiro/data61/dataFusion/common/EnglishScore.scala:
--------------------------------------------------------------------------------
 1 | package au.csiro.data61.dataFusion.common
 2 | 
 3 | import com.typesafe.scalalogging.Logger
 4 | 
 5 | object EnglishScore {
 6 |   private val log = Logger(getClass)
 7 | 
 8 |   case class Feat(wordLike: Boolean, initCap: Boolean, endsDot: Boolean)
 9 | 
10 |   // A metric for English text quality.
11 |   // Near enough is good enough, no need to handle vowelless words like "sky" or apostrophes.
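   // In short, englishScore (below) multiplies the fraction of tokens that look like English words
   // (letters only apart from leading/trailing punctuation, at least one vowel, no capitals other than
   // an initial capital, fewer than 30 characters) by a piece-wise linear factor of average sentence
   // length, so clean prose scores near 1.0 while OCR noise scores near 0.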
12 | 
13 |   val word = """\S+""".r
14 |   val vowels = "AEIOUaeiou".toSet
15 |   val upper = ('A' to 'Z').toSet
16 |   val letter = upper ++ upper.map(Character.toLowerCase)
17 |   val punct = ",;:'\"!@#$%^&*()-_+=/[]{}.".toSet
18 | 
19 |   def word2feat(w: String): Feat = {
20 |     val numVowel = w.count(vowels contains _)
21 |     val numLetter = w.count(letter contains _)
22 |     val numUpper = w.count(upper contains _)
23 |     val startsPunct = punct contains w.head
24 |     val endsPunct = punct contains w.last
25 |     val endsDot = w.endsWith(".")
26 |     val expectedLetters = w.length - (if (startsPunct) 1 else 0) - (if (endsPunct) 1 else 0)
27 |     val initCap = numUpper == 1 && (startsPunct && w.length > 1 && Character.isUpperCase(w(1)) || Character.isUpperCase(w.head))
28 |     val wordLike = w.length < 30 && numLetter == expectedLetters && (numUpper == 0 || initCap) && numVowel > 0
29 |     // log.debug(s"word2feat: numVowel = $numVowel, numLetter = $numLetter, numUpper = $numUpper, startsPunct = $startsPunct, endsPunct = $endsPunct, endsDot = $endsDot, initCap = $initCap, length = ${w.length}, expectedLetters = $expectedLetters, wordLike = $wordLike")
30 |     Feat(wordLike, initCap, endsDot)
31 |   }
32 | 
33 |   def englishScore(text: String): Double = {
34 |     val feats = word.findAllIn(text).map(word2feat).toSeq
35 |     val numWords = feats.count(_.wordLike)
36 |     val wordScore = numWords.toDouble / feats.size // ratio
37 | 
38 |     // unit test with text from wikipedia is getting a very low sentenceScore, so disabled for now
39 |     val numSentence = feats.sliding(2).count {
40 |       case Seq(a, b) => a.wordLike && a.endsDot && b.wordLike && b.initCap
41 |       case _ => false
42 |     }
43 |     val x = numWords.toDouble / numSentence // avgSentenceLength
44 |     // See http://hearle.nahoo.net/Academic/Maths/Sentence.html
45 |     // try piece-wise linear score
46 |     val sentenceScore = if (x < 10.0) 0.6 + 0.4 * x/10.0
47 |       else if (x < 30.0) 1.0
48 |       else if (x < 100.0) 1.0 - 0.8 * (x - 30.0)/70.0
49 |       else 0.2
50 | 
51 |     log.debug(s"englishScore: numSentence = $numSentence, numWords = $numWords, wordScore = $wordScore, sentenceScore = $sentenceScore")
52 |     wordScore * sentenceScore
53 |   }
54 | }
--------------------------------------------------------------------------------
/dataFusion-common/src/main/scala/au/csiro/data61/dataFusion/common/Parallel.scala:
--------------------------------------------------------------------------------
 1 | package au.csiro.data61.dataFusion.common
 2 | 
 3 | import java.util.concurrent.ArrayBlockingQueue
 4 | 
 5 | import scala.util.{ Failure, Success, Try }
 6 | 
 7 | import com.typesafe.scalalogging.Logger
 8 | 
 9 | object Parallel {
10 |   private val log = Logger(getClass)
11 | 
12 |   /**
13 |    * One thread does `in`,
14 |    * One thread does `out`,
15 |    * `numWorkers` threads do `work`.
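   * Usage sketch (illustrative, mirroring ParallelTest later in this listing; the "done" sentinels are
   * arbitrary values that must not occur in the real input or output):
   * {{{
   * val results = scala.collection.mutable.ListBuffer[String]()
   * doParallel(Iterator.range(0, 1000).map(_.toString), (s: String) => s, (s: String) => results += s, "done", "done", 4)
   * // results now holds the 1000 values, in no particular order
   * }}}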
16 | */ 17 | def doParallel[I, O](in: Iterator[I], work: I => O, out: O => Unit, inDone: I, outDone: O, numWorkers: Int) = { 18 | val qFactor = 10 19 | val qSize = numWorkers * qFactor 20 | val iq = new ArrayBlockingQueue[I](qSize) 21 | val oq = new ArrayBlockingQueue[Try[O]](qSize) 22 | 23 | val iThread = new Thread { 24 | override def run = { 25 | in.foreach(i => iq.put(i)) 26 | iq.put(inDone) 27 | } 28 | } 29 | iThread.start 30 | 31 | val oThread = new Thread { 32 | override def run = { 33 | Iterator.continually(oq.take) takeWhile(_ != Success(outDone)) foreach { _ match { 34 | case Success(o) => out(o) 35 | case Failure(e) => log.error("worker exception", e) 36 | } } 37 | } 38 | } 39 | oThread.start 40 | 41 | val workers = (0 until numWorkers).map { i => new Thread { 42 | override def run = { 43 | Iterator.continually(iq.take) takeWhile(_ != inDone) foreach { i => oq.put(Try{ work(i) }) } 44 | iq.put(inDone) // tell another worker 45 | } 46 | } } 47 | workers.foreach(_.start) 48 | 49 | iThread.join 50 | log.debug("iThread done") 51 | workers.foreach(_.join) 52 | log.debug("workers done") 53 | oq.put(Success(outDone)) 54 | oThread.join 55 | log.debug("oThread done") 56 | } 57 | 58 | } -------------------------------------------------------------------------------- /dataFusion-common/src/main/scala/au/csiro/data61/dataFusion/common/Timer.scala: -------------------------------------------------------------------------------- 1 | package au.csiro.data61.dataFusion.common 2 | 3 | import com.typesafe.scalalogging.Logger 4 | 5 | /** Accumulate time since constructed or reset. 6 | * 7 | * Usage: 8 | * {{{ 9 | * val t = Timer() 10 | * ... 11 | * t.stop 12 | * log.info(s"... took ${t.elapsedSecs} secs") 13 | * }}} 14 | */ 15 | class Timer { 16 | private var t0 = 0L // start of currently measured time period 17 | private var elapsed = 0L // sum of previous time periods ended by stop/elapsedSecs 18 | 19 | reset 20 | 21 | def reset = { 22 | elapsed = 0L 23 | start 24 | } 25 | 26 | /** `start` need not be used - used to discard (not accumulate) the time between `stop` and `start`. */ 27 | def start = t0 = System.currentTimeMillis 28 | 29 | def stop = elapsed += (System.currentTimeMillis - t0) 30 | 31 | /** Get accumulated seconds up to `stop` */ 32 | def elapsedSecs: Float = elapsed * 1e-3f 33 | } 34 | 35 | object Timer { 36 | 37 | private lazy val log = Logger(getClass) 38 | 39 | def apply() = new Timer() 40 | 41 | /** Log elapsed time as info. 42 | * 43 | * Usage: 44 | * {{{ 45 | * val a: A = timed("it took {} secs") { 46 | * ... 
47 | * new A() 48 | * } 49 | * }}} 50 | * 51 | * @param msg contains "{}" which is replaced by the elapsed time in secs 52 | * @param action thunk to execute and time 53 | */ 54 | def timed[T](msg: String)(action: => T) = { 55 | val t = Timer() 56 | val x = action 57 | t.stop 58 | log.info(msg, t.elapsedSecs.toString) 59 | x 60 | } 61 | } -------------------------------------------------------------------------------- /dataFusion-common/src/main/scala/au/csiro/data61/dataFusion/common/Util.scala: -------------------------------------------------------------------------------- 1 | package au.csiro.data61.dataFusion.common 2 | 3 | import java.io.{ BufferedWriter, File, FileOutputStream, OutputStreamWriter } 4 | import com.typesafe.scalalogging.Logger 5 | import scala.collection.mutable.ListBuffer 6 | 7 | object Util { 8 | private val log = Logger(getClass) 9 | 10 | /** @return a BufferedWriter using UTF-8 encoding */ 11 | def bufWriter(f: File) = new BufferedWriter(new OutputStreamWriter(new FileOutputStream(f), "UTF-8")) 12 | 13 | /** Get a Scala singleton Object. 14 | * @param fqn object's fully qualified name 15 | * @return object as type T 16 | */ 17 | def getObject[T](fqn: String): T = { 18 | val m = scala.reflect.runtime.universe.runtimeMirror(getClass.getClassLoader) 19 | m.reflectModule(m.staticModule(fqn)).instance.asInstanceOf[T] 20 | } 21 | 22 | /** 23 | * Modified from: https://stackoverflow.com/questions/5674741/simplest-way-to-get-the-top-n-elements-of-a-scala-iterable 24 | * Well the simplest is sort.take(n), but for a large collection where n << the collection size, this is much more efficient! 25 | */ 26 | def extremeN[T](n: Int, it: Iterator[T])(comp1: ((T, T) => Boolean), comp2: ((T, T) => Boolean)): List[T] = { 27 | 28 | def sortedIns (el: T, list: List[T]): List[T] = 29 | if (list.isEmpty) List (el) else 30 | if (comp2 (el, list.head)) el :: list else 31 | list.head :: sortedIns (el, list.tail) 32 | 33 | def updateSofar (sofar: List [T], el: T) : List [T] = 34 | if (comp1 (el, sofar.head)) 35 | sortedIns (el, sofar.tail) 36 | else sofar 37 | 38 | val initN = { 39 | val buf = new ListBuffer[T] 40 | for (_ <- 0 until n if it.hasNext) buf += it.next 41 | buf.toList 42 | } 43 | if (initN.size > 1) (initN.sortWith(comp2) /: it) { updateSofar } else initN 44 | } 45 | 46 | /** @return smallest n elements in descending order */ 47 | def bottom[T](n: Int, it: Iterator[T])(implicit ord: Ordering[T]): List[T] = extremeN(n, it)(ord.lt, ord.gt) 48 | 49 | /** @return largest n elements in ascending order */ 50 | def top[T](n: Int, it: Iterator[T])(implicit ord: Ordering[T]): List[T] = extremeN(n, it)(ord.gt, ord.lt) 51 | } -------------------------------------------------------------------------------- /dataFusion-common/src/test/resources/logback-test.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | %d{yyyy-MM-dd HH:mm:ss.SSS} [%thread] %-5level %logger{36} - %msg%n 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | -------------------------------------------------------------------------------- /dataFusion-common/src/test/scala/au/csiro/data61/dataFusion/common/ParallelTest.scala: -------------------------------------------------------------------------------- 1 | package au.csiro.data61.dataFusion.common 2 | 3 | import scala.collection.mutable.ListBuffer 4 | 5 | import org.scalatest.{ FlatSpec, Matchers } 6 | 7 | import com.typesafe.scalalogging.Logger 8 | import scala.util.Success 9 | 10 | class 
ParallelTest extends FlatSpec with Matchers { 11 | val log = Logger(getClass) 12 | 13 | "Threads" should "do stuff in parallel" in { 14 | val l = ListBuffer[String]() 15 | Parallel.doParallel(Iterator.range(0, 1000).map(_.toString), (s: String) => s, (s: String) => l += s, "done", "done", 4) 16 | l.size should be(1000) 17 | for { 18 | (a, b) <- l.map(_.toInt).sortBy(identity).zipWithIndex 19 | } a should be(b) 20 | } 21 | } -------------------------------------------------------------------------------- /dataFusion-common/src/test/scala/au/csiro/data61/dataFusion/common/UtilTest.scala: -------------------------------------------------------------------------------- 1 | package au.csiro.data61.dataFusion.common 2 | 3 | import com.typesafe.scalalogging.Logger 4 | 5 | import org.scalatest.FlatSpec 6 | import org.scalatest.Matchers 7 | 8 | import com.typesafe.scalalogging.Logger 9 | import Util._ 10 | import scala.util.Random 11 | import Timer.timed 12 | 13 | class UtilTest extends FlatSpec with Matchers { 14 | val log = Logger(getClass) 15 | 16 | val data = Random.shuffle(1 to 1000000) 17 | val n = 10 18 | 19 | "top" should "get top members quicker than sorting" in { 20 | val t1 = Timer() 21 | val topn = top(n, data.iterator) 22 | t1.stop 23 | log.debug(s"topn in ${t1.elapsedSecs} = $topn") 24 | 25 | val t2 = Timer() 26 | val expected = data.sortBy(x => -x).take(n).toList 27 | t2.stop 28 | log.debug(s"sortBy.take(n) in ${t2.elapsedSecs}") 29 | topn.reverse should be(expected) 30 | assert(t1.elapsedSecs < t2.elapsedSecs) 31 | } 32 | 33 | "bottom" should "get bottom members quicker than sorting" in { 34 | val t1 = Timer() 35 | val bottomn = bottom(n, data.iterator) 36 | t1.stop 37 | log.debug(s"bottomn in ${t1.elapsedSecs} = $bottomn") 38 | 39 | val t2 = Timer() 40 | val expected = data.sorted.take(n).toList 41 | t2.stop 42 | log.debug(s"sort.take(n) in ${t2.elapsedSecs}") 43 | bottomn.reverse should be(expected) 44 | assert(t1.elapsedSecs < t2.elapsedSecs) 45 | } 46 | } -------------------------------------------------------------------------------- /dataFusion-db-service/README.md: -------------------------------------------------------------------------------- 1 | # dataFusion-db-service 2 | 3 | ## Introduction 4 | 5 | This project provides [RESTful](https://en.wikipedia.org/wiki/Representational_state_transfer) web services based on [dataFusion-db](../dataFusion-db). 6 | 7 | ## Build, Configuration, Running and Swagger Support 8 | 9 | See the top level [README](../README.md). 
10 | 11 | -------------------------------------------------------------------------------- /dataFusion-db-service/build.sbt: -------------------------------------------------------------------------------- 1 | name := "dataFusion-db-service" 2 | 3 | libraryDependencies ++= Seq( 4 | "com.github.swagger-akka-http" %% "swagger-akka-http" % "0.9.1", 5 | "com.typesafe.akka" %% "akka-http-spray-json" % "10.0.7", 6 | "ch.megard" %% "akka-http-cors" % "0.2.1", 7 | "com.github.scopt" %% "scopt" % "3.5.0" 8 | ) 9 | 10 | com.github.retronym.SbtOneJar.oneJarSettings 11 | 12 | mainClass in Compile := Some("au.csiro.data61.dataFusion.db.service.Main") 13 | -------------------------------------------------------------------------------- /dataFusion-db-service/src/main/resources/application.conf: -------------------------------------------------------------------------------- 1 | http = { 2 | host = "0.0.0.0" 3 | port = 8088 4 | 5 | host = ${?DB_HTTP_HOST} 6 | port = ${?DB_HTTP_PORT} 7 | } -------------------------------------------------------------------------------- /dataFusion-db-service/src/main/resources/logback.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | db-service.log 11 | 12 | %d{yyyy-MM-dd HH:mm:ss.SSS} [%thread] %-5level %logger{36} - %msg%n 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | -------------------------------------------------------------------------------- /dataFusion-db-service/src/main/scala/au/csiro/data61/dataFusion/db/service/DbService.scala: -------------------------------------------------------------------------------- 1 | package au.csiro.data61.dataFusion.db.service 2 | 3 | import scala.concurrent.{ ExecutionContext, Future } 4 | import scala.language.postfixOps 5 | 6 | import com.typesafe.config.Config 7 | import com.typesafe.scalalogging.Logger 8 | 9 | import akka.http.scaladsl.marshallers.sprayjson.SprayJsonSupport._ 10 | import akka.http.scaladsl.marshalling.{ ToResponseMarshallable, ToResponseMarshaller } 11 | import akka.http.scaladsl.model.{ StatusCode, StatusCodes } 12 | import akka.http.scaladsl.server.Directives._ 13 | import au.csiro.data61.dataFusion.common.Data, Data.JsonProtocol._ 14 | import au.csiro.data61.dataFusion.common.Util 15 | import au.csiro.data61.dataFusion.db.Tables, Tables._ 16 | import io.swagger.annotations.{ Api, ApiOperation, ApiResponse, ApiResponses } 17 | import javax.ws.rs.{ Path, PathParam } 18 | 19 | // deleted by Eclipse > Source > Organize Imports 20 | // import io.swagger.annotations.ApiResponse 21 | 22 | object DbService { 23 | 24 | // teach spray.json how to en/decode java.sql.Date 25 | // val longFormat = implicitly[JsonFormat[Long]] 26 | // implicit val sqlDateFormat = new JsonFormat[java.sql.Date] { 27 | // override def read(json: JsValue): java.sql.Date = new java.sql.Date(longFormat.read(json)) 28 | // override def write(obj: java.sql.Date): JsValue = longFormat.write(obj.getTime) 29 | // } 30 | 31 | implicit val docRowCodec = jsonFormat4(DocRow) 32 | implicit val metaRowCodec = jsonFormat4(MetaRow) 33 | implicit val nerRowCodec = jsonFormat11(NerRow) 34 | } 35 | import DbService._ 36 | 37 | @Api(value = "db", description = "read-only access to dataFusion database", produces = "application/json") 38 | @Path("") 39 | class DbService(conf: Config)(implicit val executionContext: ExecutionContext) { 40 | private val log = Logger(getClass) 41 | 42 | val myTables = new Tables { 43 | val profile = 
Util.getObject[slick.jdbc.JdbcProfile](conf.getString("db.profile")) // e.g. slick.jdbc.H2Profile or slick.jdbc.PostgresProfile 44 | } 45 | import myTables._ 46 | import myTables.profile.api._ 47 | 48 | val db = Database.forConfig("db", conf) 49 | 50 | 51 | 52 | val qDocById = { 53 | def q(id: Rep[Long]) = Doc.filter(_.docId === id) 54 | Compiled(q _) 55 | } 56 | 57 | @Path("doc/{id}") 58 | @ApiOperation(httpMethod = "GET", response = classOf[Array[DocRow]], responseContainer = "List", value = "Main Doc and embedded Docs") 59 | def docById(@PathParam("id") id: Long): Future[Seq[DocRow]] = 60 | db.run(qDocById(id).result) 61 | 62 | def docByIdRoute = 63 | get { path("doc" / LongNumber) { id => complete { 64 | docById(id) 65 | }}} 66 | 67 | 68 | 69 | val qMetaById = { 70 | def q(id: Rep[Long]) = Meta.filter(_.docId === id) 71 | Compiled(q _) 72 | } 73 | 74 | @Path("meta/{id}") 75 | @ApiOperation(httpMethod = "GET", response = classOf[Array[MetaRow]], responseContainer = "List", value = "Metadata for main Doc and embedded Docs") 76 | def metaById(@PathParam("id") id: Long): Future[Seq[MetaRow]] = 77 | db.run(qMetaById(id).result) 78 | 79 | def metaByIdRoute = 80 | get { path("meta" / LongNumber) { id => complete { 81 | metaById(id) 82 | }}} 83 | 84 | 85 | 86 | val qNerById = { 87 | def q(id: Rep[Long]) = Ner.filter(_.docId === id) 88 | Compiled(q _) 89 | } 90 | 91 | @Path("ner/{id}") 92 | @ApiOperation(httpMethod = "GET", response = classOf[Array[NerRow]], responseContainer = "List", value = "Named Entities for main Doc and embedded Docs") 93 | def nerById(@PathParam("id") id: Long): Future[Seq[NerRow]] = 94 | db.run(qNerById(id).result) 95 | 96 | def nerByIdRoute = 97 | get { path("ner" / LongNumber) { id => complete { 98 | nerById(id) 99 | }}} 100 | 101 | 102 | 103 | val qExtNameById = { 104 | def q(id: Rep[Long]) = ExtName.filter(_.extNameId === id).map(_.name) 105 | Compiled(q _) 106 | } 107 | val qExtNameLinkById = { 108 | def q(id: Rep[Long]) = ExtNameLink.filter(_.extNameId === id).map(_.extRefId) 109 | Compiled(q _) 110 | } 111 | 112 | @Path("extRef/{extNameId}") 113 | @ApiOperation(httpMethod = "GET", response = classOf[Data.ExtRef], value = "name and ids (from external system) associated with a Named Entity") 114 | def extRefById(@PathParam("extNameId") id: Long) = { 115 | val oRef = for { 116 | onam <- db.run(qExtNameById(id).result.headOption) 117 | ids <- db.run(qExtNameLinkById(id).result) 118 | } yield onam.map(Data.ExtRef(_, ids.toList)) 119 | optOrElse(oRef, (StatusCodes.NotFound, "")) 120 | } 121 | 122 | def extRefByIdRoute = 123 | get { path("extRef" / LongNumber) { id => complete { 124 | extRefById(id) 125 | }}} 126 | 127 | 128 | 129 | /** if Some(a) marshall the a, else marshall the orElse */ 130 | def optOrElse[A](x: Future[Option[A]], orElse: => (StatusCode, String))(implicit m: ToResponseMarshaller[A]): ToResponseMarshallable = 131 | x.map(_.map { s => ToResponseMarshallable(s) }.getOrElse(ToResponseMarshallable(orElse)) ) 132 | 133 | val routes = docByIdRoute ~ metaByIdRoute ~ nerByIdRoute ~ extRefByIdRoute 134 | } 135 | -------------------------------------------------------------------------------- /dataFusion-db-service/src/main/scala/au/csiro/data61/dataFusion/db/service/Main.scala: -------------------------------------------------------------------------------- 1 | package au.csiro.data61.dataFusion.db.service 2 | 3 | import scala.language.postfixOps 4 | import scala.reflect.runtime.universe.typeOf 5 | 6 | import com.github.swagger.akka.{ HasActorSystem, 
SwaggerHttpService }
 7 | import com.typesafe.config.ConfigFactory
 8 | import com.typesafe.scalalogging.Logger
 9 | 
10 | import akka.actor.ActorSystem
11 | import akka.http.scaladsl.Http
12 | import akka.http.scaladsl.server.Directives._enhanceRouteWithConcatenation
13 | import akka.stream.ActorMaterializer
14 | import ch.megard.akka.http.cors.scaladsl.CorsDirectives.cors
15 | import com.typesafe.config.ConfigValueFactory
16 | 
17 | object Main {
18 |   private val log = Logger(getClass)
19 | 
20 |   val conf = ConfigFactory.load
21 |   def cg(k: String) = conf.getString(k)
22 |   def cgi(k: String) = conf.getInt(k)
23 | 
24 |   case class CliOption(host: String, port: Int, dburl: String, profile: String, driver: String, user: String, password: String)
25 |   val defaultCliOption = CliOption(cg("http.host"), cgi("http.port"), cg("db.url"), cg("db.profile"), cg("db.driver"), cg("db.properties.user"), cg("db.properties.password"))
26 | 
27 |   implicit val system = ActorSystem("dbActorSystem")
28 |   implicit val exec = system.dispatcher
29 |   implicit val materializer = ActorMaterializer()
30 | 
31 |   def main(args: Array[String]): Unit = {
32 |     val parser = new scopt.OptionParser[CliOption]("db-service") {
33 |       head("db-service", "0.x")
34 |       note("web services for read-only access to datafusion database")
35 |       opt[String]("host") action { (v, c) =>
36 |         c.copy(host = v)
37 |       } text (s"web service network interface/host/IP address, default ${defaultCliOption.host}")
38 |       opt[Int]("port") action { (v, c) =>
39 |         c.copy(port = v)
40 |       } text (s"web service TCP port, default ${defaultCliOption.port}")
41 |       opt[String]("dburl") action { (v, c) =>
42 |         c.copy(dburl = v)
43 |       } text (s"JDBC database URL, default ${defaultCliOption.dburl}")
44 |       opt[String]("profile") action { (v, c) =>
45 |         c.copy(profile = v)
46 |       } text (s"full class name of Slick profile, default ${defaultCliOption.profile}")
47 |       opt[String]("driver") action { (v, c) =>
48 |         c.copy(driver = v)
49 |       } text (s"full class name of JDBC driver, default ${defaultCliOption.driver}")
50 |       opt[String]("user") action { (v, c) =>
51 |         c.copy(user = v)
52 |       } text (s"database username, default ${defaultCliOption.user}")
53 |       opt[String]("password") action { (v, c) =>
54 |         c.copy(password = v)
55 |       } text (s"database user password, default ${defaultCliOption.password}")
56 |       help("help") text ("prints this usage text")
57 |     }
58 |     for (c <- parser.parse(args, defaultCliOption)) {
59 |       log.info(s"CliOption: $c")
60 |       val conf2 = conf.withValue("db.url", ConfigValueFactory.fromAnyRef(c.dburl)) // CliOption overrides
61 |         .withValue("db.driver", ConfigValueFactory.fromAnyRef(c.driver))
62 |         .withValue("db.properties.user", ConfigValueFactory.fromAnyRef(c.user))
63 |         .withValue("db.properties.password", ConfigValueFactory.fromAnyRef(c.password))
64 |       val dbService = new DbService(conf2)
65 |       val routes = cors() {
66 |         dbService.routes ~
67 |         swaggerService.routes
68 |       }
69 |       Http().bindAndHandle(routes, c.host, c.port)
70 |       log.info(s"""starting server at: http://${c.host}:${c.port}
71 | Test with:
72 |   curl --header 'Content-Type: application/json' http://${c.host}:${c.port}/api-docs/swagger.json
73 |   curl --header 'Content-Type: application/json' http://${c.host}:${c.port}/doc/1
74 | """)
75 |     }
76 |   }
77 | 
78 |   def swaggerService(implicit s: ActorSystem, m: ActorMaterializer) = new SwaggerHttpService with HasActorSystem {
79 |     override implicit val actorSystem = s
80 |     override implicit val materializer = m
81 |     override val apiTypes =
Seq(typeOf[DbService]) 82 | override def swaggerConfig = new io.swagger.models.Swagger().basePath(prependSlashIfNecessary(basePath)) // don't specify protocol://host basePath 83 | // override val host = s"${hst}:${prt}" // the url of your api, not swagger's json endpoint 84 | // override val basePath = "/" // the basePath for the API you are exposing 85 | override val info = new io.swagger.models.Info() // provides license and other description details 86 | override val apiDocsPath = "api-docs" // http://host:port/api-docs/swagger.json 87 | } 88 | 89 | } -------------------------------------------------------------------------------- /dataFusion-db-service/src/test/resources/logback-test.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | %d{yyyy-MM-dd HH:mm:ss.SSS} [%thread] %-5level %logger{36} - %msg%n 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | -------------------------------------------------------------------------------- /dataFusion-db/3rd-party-licenses.md: -------------------------------------------------------------------------------- 1 | # datafusion-db-licenses 2 | 3 | Category | License | Dependency | Notes 4 | --- | --- | --- | --- 5 | Apache | [Apache 2](http://www.apache.org/licenses/LICENSE-2.0.txt) | io.spray # spray-json_2.12 # 1.3.3 | 6 | Apache | [Apache 2.0 License](http://www.apache.org/licenses/LICENSE-2.0.html) | com.typesafe.scala-logging # scala-logging_2.12 # 3.5.0 | 7 | Apache | [Apache License, Version 2.0](http://www.apache.org/licenses/LICENSE-2.0) | com.typesafe # config # 1.3.1 | 8 | Apache | [The Apache Software License, Version 2.0](http://www.apache.org/licenses/LICENSE-2.0.txt) | com.zaxxer # HikariCP # 2.5.1 | 9 | BSD | [BSD 3-Clause](http://www.scala-lang.org/license.html) | org.scala-lang # scala-library # 2.12.2 | 10 | BSD | [BSD 3-Clause](http://www.scala-lang.org/license.html) | org.scala-lang # scala-reflect # 2.12.2 | 11 | BSD | [BSD-2-Clause](https://jdbc.postgresql.org/about/license.html) | org.postgresql # postgresql # 42.1.4 | 12 | BSD | [BSD-Style](http://www.opensource.org/licenses/bsd-license.php) | com.jsuereth # scala-arm_2.12 # 2.0 | 13 | BSD | [Two-clause BSD-style license](http://github.com/slick/slick/blob/master/LICENSE.txt) | com.typesafe.slick # slick-codegen_2.12 # 3.2.1 | 14 | BSD | [Two-clause BSD-style license](http://github.com/slick/slick/blob/master/LICENSE.txt) | com.typesafe.slick # slick-hikaricp_2.12 # 3.2.1 | 15 | BSD | [Two-clause BSD-style license](http://github.com/slick/slick/blob/master/LICENSE.txt) | com.typesafe.slick # slick_2.12 # 3.2.1 | 16 | CC0 | [CC0](http://creativecommons.org/publicdomain/zero/1.0/) | org.reactivestreams # reactive-streams # 1.0.0 | 17 | GPL | [GPL](https://www.gnu.org/licenses/gpl-3.0.en.html) | au.csiro.data61 # datafusion-common_2.12 # 1.1-SNAPSHOT | 18 | LGPL | [EPL + GNU Lesser General Public License](http://logback.qos.ch/license.html) | ch.qos.logback # logback-classic # 1.2.3 | 19 | LGPL | [EPL + GNU Lesser General Public License](http://logback.qos.ch/license.html) | ch.qos.logback # logback-core # 1.2.3 | 20 | MIT | [MIT License](http://www.opensource.org/licenses/mit-license.php) | com.github.scopt # scopt_2.12 # 3.7.0 | 21 | MIT | [MIT License](http://www.slf4j.org/license.html) | org.slf4j # slf4j-api # 1.7.25 | 22 | Mozilla | [MPL 2.0 or EPL 1.0](http://h2database.com/html/license.html) | com.h2database # h2 # 1.4.196 | 23 | 24 | 
-------------------------------------------------------------------------------- /dataFusion-db/README.md: -------------------------------------------------------------------------------- 1 | # dataFusion-db 2 | 3 | ## Introduction 4 | This project provides [Slick](http://slick.lightbend.com/) bindings for a database schema to store the same data as the [Document JSON format](../dataFusion-common#document-json-format). It will work with any Slick supported database including H2, Postgres and Oracle. 5 | Functions provided are: 6 | - Create Slick bindings (Tables.scala) from an existing database schema. This is recommended for major schema changes. 7 | - Drop the schema. 8 | - Create the schema. 9 | - Populate tables from a [Document JSON format](../dataFusion-common#document-json-format) file. 10 | 11 | Querying is provided by the [dataFusion-db-service](../dataFusion-db-service) web service. 12 | 13 | ## Build, Configuration and Running 14 | 15 | See the top level [README](../README.md). 16 | 17 | -------------------------------------------------------------------------------- /dataFusion-db/build.sbt: -------------------------------------------------------------------------------- 1 | name := "dataFusion-db" 2 | 3 | libraryDependencies ++= Seq( 4 | "slick", 5 | "slick-hikaricp", 6 | "slick-codegen" 7 | ).map("com.typesafe.slick" %% _ % "3.2.1") 8 | 9 | libraryDependencies ++= Seq( 10 | "org.postgresql" % "postgresql" % "42.1.4", 11 | "com.h2database" % "h2" % "1.4.196", 12 | "com.typesafe" % "config" % "1.3.1", 13 | "com.github.scopt" %% "scopt" % "3.7.0", 14 | "com.jsuereth" %% "scala-arm" % "2.0" 15 | 16 | ) 17 | 18 | com.github.retronym.SbtOneJar.oneJarSettings 19 | 20 | mainClass in Compile := Some("au.csiro.data61.dataFusion.db.Main") 21 | -------------------------------------------------------------------------------- /dataFusion-db/src/main/resources/application.conf: -------------------------------------------------------------------------------- 1 | db = { 2 | // H2 3 | url = "jdbc:h2:./datafusion" 4 | profile = "slick.jdbc.H2Profile" 5 | driver = "org.h2.Driver" 6 | 7 | // Postgres 8 | // url = "jdbc:postgresql:datafusion" 9 | // profile = "slick.jdbc.PostgresProfile" 10 | // driver = "org.postgresql.Driver" 11 | 12 | // Environment variables 13 | url = ${?DB_URL} 14 | profile = ${?DB_PROFILE} 15 | driver = ${?DB_DRIVER} 16 | 17 | connectionPool = "HikariCP" 18 | queueSize = 100 19 | keepAliveConnection = true 20 | 21 | properties = { 22 | user = "dfus" 23 | password = "dfus" 24 | 25 | user = ${?DB_USER} 26 | password = ${?DB_PASS} 27 | } 28 | } 29 | -------------------------------------------------------------------------------- /dataFusion-db/src/main/resources/logback.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | %d{yyyy-MM-dd HH:mm:ss.SSS} [%thread] %-5level %logger{36} - %msg%n 6 | 7 | 8 | 9 | 10 | db.log 11 | 12 | %d{yyyy-MM-dd HH:mm:ss.SSS} [%thread] %-5level %logger{36} - %msg%n 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | -------------------------------------------------------------------------------- /dataFusion-db/src/test/resources/logback-test.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | %d{yyyy-MM-dd HH:mm:ss.SSS} [%thread] %-5level %logger{36} - %msg%n 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | -------------------------------------------------------------------------------- 
/dataFusion-graph-service/README.md: -------------------------------------------------------------------------------- 1 | # dataFusion-graph-service 2 | 3 | ## Introduction 4 | 5 | [RESTful](https://en.wikipedia.org/wiki/Representational_state_transfer) web services providing access to a network graph of related entities. 6 | The global network is loaded from files specified in [configuration](../README.md#configuration) on startup. 7 | 8 | ## Build, Configuration, Running and Swagger Support 9 | 10 | See the top level [README](../README.md). 11 | 12 | -------------------------------------------------------------------------------- /dataFusion-graph-service/build.sbt: -------------------------------------------------------------------------------- 1 | name := "dataFusion-graph-service" 2 | 3 | libraryDependencies ++= Seq( 4 | "com.github.swagger-akka-http" %% "swagger-akka-http" % "0.9.1", 5 | "com.typesafe.akka" %% "akka-http-spray-json" % "10.0.7", 6 | "com.google.guava" % "guava" % "23.4-jre", 7 | "ch.megard" %% "akka-http-cors" % "0.2.1", 8 | "com.github.scopt" %% "scopt" % "3.5.0", 9 | // "com.jsuereth" %% "scala-arm" % "2.0", 10 | "org.scalatest" %% "scalatest" % "3.0.0" % "test" 11 | ) 12 | 13 | com.github.retronym.SbtOneJar.oneJarSettings 14 | 15 | mainClass in Compile := Some("au.csiro.data61.dataFusion.graph.service.Main") 16 | -------------------------------------------------------------------------------- /dataFusion-graph-service/src/main/resources/application.conf: -------------------------------------------------------------------------------- 1 | graph = { 2 | cacheSize = 20 3 | nodePath = "proximity-node.json" 4 | edgePath = "proximity-edge.json" 5 | 6 | cacheSize = ${?GRAPH_CACHE_SIZE} 7 | nodePath = ${?GRAPH_NODE_PATH} 8 | edgePath = ${?GRAPH_EDGE_PATH} 9 | } 10 | 11 | http = { 12 | host = "0.0.0.0" 13 | port = 8089 14 | 15 | host = ${?GRAPH_HTTP_HOST} 16 | port = ${?GRAPH_HTTP_PORT} 17 | } -------------------------------------------------------------------------------- /dataFusion-graph-service/src/main/resources/logback.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | graph-service.log 11 | 12 | %d{yyyy-MM-dd HH:mm:ss.SSS} [%thread] %-5level %logger{36} - %msg%n 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | -------------------------------------------------------------------------------- /dataFusion-graph-service/src/test/resources/edge.json: -------------------------------------------------------------------------------- 1 | {"source":1,"target":2,"typ":"D61GAZ","weights":{"collectionA":[1.0,1]}} 2 | {"source":1,"target":3,"typ":"D61GAZ","weights":{"collectionA":[1.0,1]}} 3 | {"source":2,"target":3,"typ":"D61GAZ","weights":{"collectionA":[1.0,1]}} 4 | {"source":3,"target":4,"typ":"D61GAZ","weights":{"collectionA":[1.0,1]}} 5 | {"source":2,"target":5,"typ":"D61GAZ","weights":{"collectionB":[1.0,1]}} 6 | {"source":3,"target":5,"typ":"D61GAZ","weights":{"collectionB":[1.0,1]}} 7 | {"source":1,"target":6,"typ":"D61GAZ","weights":{"collectionB":[1.0,1]}} 8 | {"source":2,"target":6,"typ":"D61GAZ","weights":{"collectionB":[1.0,1]}} 9 | {"source":4,"target":6,"typ":"D61GAZ","weights":{"collectionA":[1.0,1]}} 10 | {"source":5,"target":6,"typ":"D61GAZ","weights":{"collectionB":[1.0,1]}} 11 | -------------------------------------------------------------------------------- /dataFusion-graph-service/src/test/resources/logback-test.xml: 
-------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | %d{yyyy-MM-dd HH:mm:ss.SSS} [%thread] %-5level %logger{36} - %msg%n 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | -------------------------------------------------------------------------------- /dataFusion-graph-service/src/test/resources/node.json: -------------------------------------------------------------------------------- 1 | {"nodeId":1,"extRef":{"ids":[211,212],"name":"fred"},"typ":"PERSON","score":1} 2 | {"nodeId":2,"extRef":{"ids":[213,214],"name":"fred"},"typ":"ORGANIZATION","score":1} 3 | {"nodeId":3,"extRef":{"ids":[223,224],"name":"fred"},"typ":"PERSON","score":1} 4 | {"nodeId":4,"extRef":{"ids":[234,235],"name":"fred"},"typ":"PERSON","score":1} 5 | {"nodeId":5,"extRef":{"ids":[211,212],"name":"fred"},"typ":"PERSON2","score":1} 6 | {"nodeId":6,"extRef":{"ids":[223,224],"name":"fred"},"typ":"TO","score":1} 7 | -------------------------------------------------------------------------------- /dataFusion-graph-service/src/test/scala/au/csiro/data61/dataFusion/graph/service/MainTest.scala: -------------------------------------------------------------------------------- 1 | package au.csiro.data61.dataFusion.graph.service 2 | 3 | import org.scalatest.{ FlatSpec, Matchers } 4 | 5 | import com.typesafe.scalalogging.Logger 6 | 7 | import Main._ 8 | import scala.io.Source 9 | import scala.io.Codec 10 | 11 | class MainTest extends FlatSpec with Matchers { 12 | val log = Logger(getClass) 13 | implicit val codec = Codec.UTF8 14 | 15 | def getSource(resourcePath: String) = Source.fromInputStream(getClass.getClassLoader.getResourceAsStream(resourcePath)) 16 | val gs = new GraphService(0, getSource("node.json"), getSource("edge.json")) 17 | 18 | "graph" should "provide local network" in { 19 | val g = gs.graph(GraphQuery(true, 0.0, None, None, Some(224), 2, 20)) 20 | log.debug(s"g = $g") 21 | g.nodes.map(_.nodeId).toSet should be(Set(1, 2, 3, 4, 5, 6)) 22 | g.edges.map(e => (e.source, e.target)).toSet should be(Set((2,5), (3,4), (1,6), (3,5), (4,6), (2,6), (1,3), (2,3), (1,2), (5,6))) 23 | } 24 | 25 | it should "filter PERSON2|EMAIL nodes" in { 26 | val g = gs.graph(GraphQuery(false, 0.0, None, None, Some(224), 2, 20)) 27 | log.debug(s"g = $g") 28 | g.nodes.map(_.nodeId).toSet should be(Set(1, 2, 3, 4)) 29 | g.edges.map(e => (e.source, e.target)).toSet should be(Set((2,3), (3,4), (1,2), (1,3))) 30 | } 31 | 32 | } -------------------------------------------------------------------------------- /dataFusion-ner-service/README.md: -------------------------------------------------------------------------------- 1 | # dataFusion-ner-service 2 | 3 | ## Introduction 4 | 5 | This project provides [RESTful](https://en.wikipedia.org/wiki/Representational_state_transfer) web services based on dataFusion-ner. 6 | 7 | ## Build, Configuration and Running 8 | 9 | This is mostly covered by the top level [README](../README.md), however note the particular requirements of [dataFusion-ner](../dataFusion-ner). 
10 | -------------------------------------------------------------------------------- /dataFusion-ner-service/build.sbt: -------------------------------------------------------------------------------- 1 | name := "dataFusion-ner-service" 2 | 3 | libraryDependencies ++= Seq( 4 | "com.github.swagger-akka-http" %% "swagger-akka-http" % "0.9.1", 5 | "com.typesafe.akka" %% "akka-http-spray-json" % "10.0.7", 6 | "ch.megard" %% "akka-http-cors" % "0.2.1", 7 | 8 | "com.github.scopt" %% "scopt" % "3.7.0", 9 | "com.jsuereth" %% "scala-arm" % "2.0", 10 | "org.scalatest" %% "scalatest" % "3.0.4" % "test" 11 | ) 12 | 13 | com.github.retronym.SbtOneJar.oneJarSettings 14 | 15 | mainClass in Compile := Some("au.csiro.data61.dataFusion.ner.service.Main") 16 | -------------------------------------------------------------------------------- /dataFusion-ner-service/src/main/resources/application.conf: -------------------------------------------------------------------------------- 1 | http = { 2 | host = "0.0.0.0" 3 | port = 8086 4 | 5 | host = ${?NER_HTTP_HOST} 6 | port = ${?NER_HTTP_PORT} 7 | } -------------------------------------------------------------------------------- /dataFusion-ner-service/src/main/resources/logback.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | %d{yyyy-MM-dd HH:mm:ss.SSS} [%thread] %-5level %logger{36} - %msg%n 6 | 7 | 8 | 9 | 10 | ner-server.log 11 | 12 | %d{yyyy-MM-dd HH:mm:ss.SSS} [%thread] %-5level %logger{36} - %msg%n 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | -------------------------------------------------------------------------------- /dataFusion-ner-service/src/main/scala/au/csiro/data61/dataFusion/ner/service/Main.scala: -------------------------------------------------------------------------------- 1 | package au.csiro.data61.dataFusion.ner.service 2 | 3 | import scala.language.postfixOps 4 | import scala.reflect.runtime.universe.typeOf 5 | 6 | import com.github.swagger.akka.{ HasActorSystem, SwaggerHttpService } 7 | import com.typesafe.config.ConfigFactory 8 | import com.typesafe.scalalogging.Logger 9 | 10 | import akka.actor.ActorSystem 11 | import akka.http.scaladsl.Http 12 | import akka.http.scaladsl.marshallers.sprayjson.SprayJsonSupport._ 13 | import akka.http.scaladsl.server.Directives._ 14 | import akka.stream.ActorMaterializer 15 | import au.csiro.data61.dataFusion.common.Data.Doc 16 | import au.csiro.data61.dataFusion.common.Data.JsonProtocol.docFormat 17 | import au.csiro.data61.dataFusion.ner.Main.{ CliOption, Impl, defaultCliOption } 18 | import ch.megard.akka.http.cors.scaladsl.CorsDirectives.cors 19 | import io.swagger.annotations.{ Api, ApiOperation } 20 | import javax.ws.rs.{ Consumes, Path } 21 | import javax.ws.rs.core.MediaType 22 | import spray.json.DefaultJsonProtocol._ 23 | 24 | object Main { 25 | val log = Logger(getClass) 26 | 27 | case class Docs(docs: List[Doc]) 28 | 29 | object JsonProtocol { 30 | implicit val docCodec = jsonFormat1(Docs) 31 | } 32 | import JsonProtocol._ 33 | 34 | 35 | @Api(value = "ner", description = "ner service", produces = "application/json") 36 | @Path("") 37 | class NerService(impl: Impl) { 38 | 39 | @Path("ner") 40 | @ApiOperation(httpMethod = "POST", response = classOf[Doc], value = "input augmented with Named Entities") 41 | @Consumes(Array(MediaType.APPLICATION_JSON)) 42 | def ner(d: Doc): Doc = impl.langNer(d) 43 | 44 | def nerRoute = 45 | post { path("langNer") { entity(as[Doc]) { in => complete { 46 | ner(in) 47 | }}}} 48 | 49 | // 
---------------------------------------------------------- 50 | 51 | @Path("nerMulti") 52 | @ApiOperation(httpMethod = "POST", response = classOf[Docs], value = "input augmented with Named Entities") 53 | @Consumes(Array(MediaType.APPLICATION_JSON)) 54 | def nerMulti(d: Docs) = Docs(d.docs.map(impl.langNer)) 55 | 56 | def nerMultiRoute = 57 | post { path("langNerMulti") { entity(as[Docs]) { in => complete { 58 | nerMulti(in) 59 | }}}} 60 | 61 | // ---------------------------------------------------------- 62 | 63 | val routes = nerRoute ~ nerMultiRoute 64 | } 65 | 66 | def swaggerService(hst: String, prt: Int)(implicit s: ActorSystem, m: ActorMaterializer) = new SwaggerHttpService with HasActorSystem { 67 | override implicit val actorSystem = s 68 | override implicit val materializer = m 69 | override val apiTypes = Seq(typeOf[NerService]) 70 | override def swaggerConfig = new io.swagger.models.Swagger().basePath(prependSlashIfNecessary(basePath)) // don't specify protocol://host basePath 71 | // override val host = s"${hst}:${prt}" // the url of your api, not swagger's json endpoint 72 | // override val basePath = "/" // the basePath for the API you are exposing 73 | override val info = new com.github.swagger.akka.model.Info() // provides license and other description details 74 | override val apiDocsPath = "api-docs" // http://host:port/api-docs/swagger.json 75 | } 76 | 77 | def start(impl: Impl) = { 78 | val conf = ConfigFactory.load 79 | val host = conf.getString("http.host") 80 | val port = conf.getInt("http.port") 81 | 82 | implicit val system = ActorSystem("nerActorSystem") 83 | implicit val exec = system.dispatcher 84 | implicit val materializer = ActorMaterializer() 85 | 86 | val routes = cors() { 87 | new NerService(impl).routes ~ 88 | swaggerService(host, port).routes 89 | } 90 | 91 | Http().bindAndHandle(routes, host, port) 92 | log.info(s"""starting server at: http://${host}:${port} 93 | Test with: 94 | curl --header 'Content-Type: application/json' http://${host}:${port}/api-docs/swagger.json 95 | """) 96 | } 97 | 98 | def main(args: Array[String]): Unit = { 99 | val parser = new scopt.OptionParser[CliOption]("dataFusion-ner-service") { 100 | head("dataFusion-ner-service", "0.x") 101 | note("Named Entity Recognition web service.") 102 | opt[Boolean]('c', "corenlp") action { (v, c) => 103 | c.copy(corenlp = v) 104 | } text (s"Use CoreNLP (default ${defaultCliOption.corenlp})") 105 | opt[Boolean]('o', "opennlp") action { (v, c) => 106 | c.copy(opennlp = v) 107 | } text (s"Use OpenNLP (default ${defaultCliOption.opennlp})") 108 | opt[Boolean]('m', "mitie") action { (v, c) => 109 | c.copy(mitie = v) 110 | } text (s"Use MITIE (default ${defaultCliOption.mitie})") 111 | help("help") text ("prints this usage text") 112 | } 113 | 114 | for (c <- parser.parse(args, defaultCliOption)) { 115 | import scala.concurrent.ExecutionContext.Implicits.global // for Impl parallel initialization 116 | 117 | val impl = new Impl(c) 118 | start(impl) 119 | } 120 | } 121 | 122 | } -------------------------------------------------------------------------------- /dataFusion-ner/3rd-party-licenses.md: -------------------------------------------------------------------------------- 1 | # datafusion-ner-licenses 2 | 3 | Category | License | Dependency | Notes 4 | --- | --- | --- | --- 5 | Apache | [Apache 2](http://www.apache.org/licenses/LICENSE-2.0.txt) | de.jollyday # jollyday # 0.4.9 | 6 | Apache | [Apache 2](http://www.apache.org/licenses/LICENSE-2.0.txt) | io.spray # spray-json_2.12 # 1.3.3 | 7 | 
Apache | [Apache 2](http://www.apache.org/licenses/LICENSE-2.0.txt) | joda-time # joda-time # 2.9.4 | 8 | Apache | [Apache 2.0 License](http://www.apache.org/licenses/LICENSE-2.0.html) | com.typesafe.scala-logging # scala-logging_2.12 # 3.5.0 | 9 | Apache | [Apache License, Version 2.0](http://www.apache.org/licenses/LICENSE-2.0) | com.typesafe # config # 1.3.1 | 10 | Apache | [The Apache Software License, Version 2.0](http://www.apache.org/licenses/LICENSE-2.0.txt) | org.apache.commons # commons-lang3 # 3.3.1 | 11 | Apache | [The Apache Software License, Version 2.0](http://www.apache.org/licenses/LICENSE-2.0.txt) | org.apache.lucene # lucene-analyzers-common # 4.10.3 | 12 | Apache | [The Apache Software License, Version 2.0](http://www.apache.org/licenses/LICENSE-2.0.txt) | org.apache.lucene # lucene-core # 4.10.3 | 13 | Apache | [The Apache Software License, Version 2.0](http://www.apache.org/licenses/LICENSE-2.0.txt) | org.apache.lucene # lucene-queries # 4.10.3 | 14 | Apache | [The Apache Software License, Version 2.0](http://www.apache.org/licenses/LICENSE-2.0.txt) | org.apache.lucene # lucene-queryparser # 4.10.3 | 15 | Apache | [The Apache Software License, Version 2.0](http://www.apache.org/licenses/LICENSE-2.0.txt) | org.apache.lucene # lucene-sandbox # 4.10.3 | 16 | Apache | [The Apache Software License, Version 2.0](http://www.apache.org/licenses/LICENSE-2.0.txt) | org.apache.opennlp # opennlp-tools # 1.8.1 | 17 | Apache | [The Apache Software License, Version 2.0](http://www.apache.org/licenses/LICENSE-2.0.txt) | xalan # xalan # 2.7.0 | 18 | Apache | [The Apache Software License, Version 2.0](http://www.apache.org/licenses/LICENSE-2.0.txt) | xerces # xercesImpl # 2.8.0 | 19 | Apache | [The Apache Software License, Version 2.0](http://www.apache.org/licenses/LICENSE-2.0.txt) | xml-apis # xml-apis # 1.3.03 | 20 | Apache | [the Apache License, ASL Version 2.0](http://www.apache.org/licenses/LICENSE-2.0) | org.scalactic # scalactic_2.12 # 3.0.4 | 21 | Apache | [the Apache License, ASL Version 2.0](http://www.apache.org/licenses/LICENSE-2.0) | org.scalatest # scalatest_2.12 # 3.0.4 | 22 | BSD | [BSD 3-Clause](http://www.scala-lang.org/license.html) | org.scala-lang # scala-library # 2.12.2 | 23 | BSD | [BSD 3-Clause](http://www.scala-lang.org/license.html) | org.scala-lang # scala-reflect # 2.12.2 | 24 | BSD | [BSD 3-clause](http://opensource.org/licenses/BSD-3-Clause) | org.scala-lang.modules # scala-parser-combinators_2.12 # 1.0.4 | 25 | BSD | [BSD 3-clause](http://opensource.org/licenses/BSD-3-Clause) | org.scala-lang.modules # scala-xml_2.12 # 1.0.5 | 26 | BSD | [BSD-Style](http://www.opensource.org/licenses/bsd-license.php) | com.jsuereth # scala-arm_2.12 # 2.0 | 27 | BSD | [New BSD license](http://www.opensource.org/licenses/bsd-license.php) | com.google.protobuf # protobuf-java # 3.2.0 | 28 | GPL | [GNU General Public License Version 3](http://www.gnu.org/licenses/gpl-3.0.txt) | edu.stanford.nlp # stanford-corenlp # 3.9.1 | 29 | GPL | [GPL](https://www.gnu.org/licenses/gpl-3.0.en.html) | au.csiro.data61 # datafusion-common_2.12 # 1.1-SNAPSHOT | 30 | GPL | [GPL2 w/ CPE](https://glassfish.java.net/public/CDDL+GPL_1_1.html) | javax.xml.bind # jaxb-api # 2.2.7 | 31 | GPL with Classpath Extension | [CDDL + GPLv2 with classpath exception](https://glassfish.dev.java.net/nonav/public/CDDL+GPL.html) | javax.servlet # javax.servlet-api # 3.0.1 | 32 | GPL with Classpath Extension | [CDDL + GPLv2 with classpath exception](https://glassfish.dev.java.net/nonav/public/CDDL+GPL.html) | 
org.glassfish # javax.json # 1.0.4 | 33 | LGPL | [EPL + GNU Lesser General Public License](http://logback.qos.ch/license.html) | ch.qos.logback # logback-classic # 1.2.3 | 34 | LGPL | [EPL + GNU Lesser General Public License](http://logback.qos.ch/license.html) | ch.qos.logback # logback-core # 1.2.3 | 35 | LGPL | [LGPL](http://www.gnu.org/copyleft/lesser.html) | com.googlecode.efficient-java-matrix-library # ejml # 0.23 | 36 | LGPL | [The GNU Lesser General Public License, Version 2.1](http://www.gnu.org/licenses/lgpl-2.1.html) | com.io7m.xom # xom # 1.2.10 | 37 | MIT | [MIT License](http://www.opensource.org/licenses/mit-license.php) | com.github.scopt # scopt_2.12 # 3.7.0 | 38 | MIT | [MIT License](http://www.slf4j.org/license.html) | org.slf4j # slf4j-api # 1.7.25 | 39 | 40 | -------------------------------------------------------------------------------- /dataFusion-ner/MITIE-native/centos/libjavamitie.so: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/data61/dataFusion/4b766ae65254eed6c5a095f3bb8b6b42b9500a9c/dataFusion-ner/MITIE-native/centos/libjavamitie.so -------------------------------------------------------------------------------- /dataFusion-ner/MITIE-native/ubuntu/libjavamitie.so: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/data61/dataFusion/4b766ae65254eed6c5a095f3bb8b6b42b9500a9c/dataFusion-ner/MITIE-native/ubuntu/libjavamitie.so -------------------------------------------------------------------------------- /dataFusion-ner/README.md: -------------------------------------------------------------------------------- 1 | # dataFusion-ner 2 | 3 | ## Introduction 4 | 5 | This project provides a library and multi-threaded CLI (command line interface) for bulk processing. 6 | It performs [Named Entity Recognition](https://en.wikipedia.org/wiki/Named-entity_recognition) using [CoreNLP](http://stanfordnlp.github.io/CoreNLP/), [OpenNLP](http://opennlp.apache.org/) and [MITIE](https://github.com/mit-nlp/MITIE). 7 | 8 | ## Build, Configuration and Running 9 | 10 | This is mostly covered by the top level [README](../README.md), however MITIE is C++ code and has some particular requirements satisfied by the script `build-MITIE.sh`: 11 | 12 | 1. a platform independent java library `lib/javamitie.jar` 13 | 2. a platform dependent shared library `MITIE-native/{platform}/libjavamitie.so` 14 | 3. language dependent models e.g. `MITIE-models/english/ner_model.dat` 15 | 4. environment variables to access the above 16 | 17 | 1 and 2 (for both ubuntu and centos) are checked into the code repository (so if you use one of these hopefully you won't need to build MITIE except to use a newer version), however 3 (language models) are large, not in the code repository, and you will need to run the script to get them and to create the script `sh/setenv.{platform}` (for 4). 18 | 19 | Run `build-MITIE.sh` with no args to do as little as necessary, or `build-MITIE.sh --clean` to start from scratch and build the lastest MITIE. 20 | 21 | Configuration to run in Eclipse: 22 | 23 | Select `Build Path` > `Configure Build Path` > `Source` > `dataFusion-ner/src/main/scala` > `Native library location` 24 | and add the `MITIE-native/{platform}` directory. 25 | 26 | -------------------------------------------------------------------------------- /dataFusion-ner/build-MITIE.sh: -------------------------------------------------------------------------------- 1 | #! 
/bin/sh 2 | 3 | set -vex 4 | 5 | # run in dataFusion-ner dir 6 | DFUS_NER=$PWD 7 | 8 | # ubuntu has: ID=ubuntu but centos has: ID="centos" 9 | OS=`sed --regexp-extended --quiet 's/^ID="?([a-z]+)"?$/\1/p' /etc/os-release` 10 | 11 | NER=ner_model.dat 12 | EN=MITIE-models/english 13 | ES=MITIE-models/spanish 14 | 15 | # when these files are missing the script makes a fresh start, else it does as little as necessary 16 | [ "$1" = "--clean" ] && rm -rf MITIE MITIE-native/$OS/libjavamitie.so "$EN/$NER" ../sh/setenv.$OS # "$ES/$NER" 17 | 18 | # Build MITIE java jar and native shared library 19 | [ -d MITIE ] || git clone https://github.com/mit-nlp/MITIE 20 | [ -f MITIE-native/$OS/libjavamitie.so ] || { 21 | BUILD=MITIE/mitielib/java/build-$OS 22 | rm -rf $BUILD 23 | mkdir -p $BUILD 24 | cd $BUILD 25 | cmake .. 26 | cmake --build . --config Release --target install 27 | 28 | # Install MITIE libraries where this project expects them 29 | mkdir -p $DFUS_NER/lib $DFUS_NER/MITIE-native/$OS 30 | cp lib/javamitie.jar $DFUS_NER/lib 31 | cp lib/libjavamitie.so $DFUS_NER/MITIE-native/$OS 32 | 33 | cd $DFUS_NER 34 | } 35 | 36 | # Install English NER model 37 | [ -r "$EN/$NER" ] || { 38 | echo "Downloading English models ..." 39 | EN_BZ2=MITIE-models-v0.2.tar.bz2 40 | curl --location https://github.com/mit-nlp/MITIE/releases/download/v0.4/$EN_BZ2 > $EN_BZ2 41 | tar xvfj $EN_BZ2 $EN/$NER # only extract EN NER model 42 | rm $EN_BZ2 43 | } 44 | 45 | # Install Spanish NER model 46 | if false; then 47 | [ -r "$ES/$NER" ] || { 48 | echo "Downloading Spanish models ..." 49 | ES_ZIP=MITIE-models-v0.2-Spanish.zip 50 | curl --location https://github.com/mit-nlp/MITIE/releases/download/v0.4/$ES_ZIP > $ES_ZIP 51 | unzip $ES_ZIP $ES/$NER # only extract ES NER model 52 | rm $ES_ZIP 53 | } 54 | fi 55 | 56 | # create a file that can be sourced to set required environment variables 57 | [ -r "../sh/setenv.$OS" ] || { 58 | cat > ../sh/setenv.$OS < 2 | 3 | 4 | 5 | %d{yyyy-MM-dd HH:mm:ss.SSS} [%thread] %-5level %logger{36} - %msg%n 6 | 7 | 8 | 9 | 10 | ner.log 11 | 12 | %d{yyyy-MM-dd HH:mm:ss.SSS} [%thread] %-5level %logger{36} - %msg%n 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | -------------------------------------------------------------------------------- /dataFusion-ner/src/main/resources/opennlp-models-1.5/en-ner-date.bin: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/data61/dataFusion/4b766ae65254eed6c5a095f3bb8b6b42b9500a9c/dataFusion-ner/src/main/resources/opennlp-models-1.5/en-ner-date.bin -------------------------------------------------------------------------------- /dataFusion-ner/src/main/resources/opennlp-models-1.5/en-ner-location.bin: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/data61/dataFusion/4b766ae65254eed6c5a095f3bb8b6b42b9500a9c/dataFusion-ner/src/main/resources/opennlp-models-1.5/en-ner-location.bin -------------------------------------------------------------------------------- /dataFusion-ner/src/main/resources/opennlp-models-1.5/en-ner-money.bin: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/data61/dataFusion/4b766ae65254eed6c5a095f3bb8b6b42b9500a9c/dataFusion-ner/src/main/resources/opennlp-models-1.5/en-ner-money.bin -------------------------------------------------------------------------------- /dataFusion-ner/src/main/resources/opennlp-models-1.5/en-ner-organization.bin: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/data61/dataFusion/4b766ae65254eed6c5a095f3bb8b6b42b9500a9c/dataFusion-ner/src/main/resources/opennlp-models-1.5/en-ner-organization.bin -------------------------------------------------------------------------------- /dataFusion-ner/src/main/resources/opennlp-models-1.5/en-ner-percentage.bin: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/data61/dataFusion/4b766ae65254eed6c5a095f3bb8b6b42b9500a9c/dataFusion-ner/src/main/resources/opennlp-models-1.5/en-ner-percentage.bin -------------------------------------------------------------------------------- /dataFusion-ner/src/main/resources/opennlp-models-1.5/en-ner-person.bin: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/data61/dataFusion/4b766ae65254eed6c5a095f3bb8b6b42b9500a9c/dataFusion-ner/src/main/resources/opennlp-models-1.5/en-ner-person.bin -------------------------------------------------------------------------------- /dataFusion-ner/src/main/resources/opennlp-models-1.5/en-ner-time.bin: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/data61/dataFusion/4b766ae65254eed6c5a095f3bb8b6b42b9500a9c/dataFusion-ner/src/main/resources/opennlp-models-1.5/en-ner-time.bin -------------------------------------------------------------------------------- /dataFusion-ner/src/main/resources/opennlp-models-1.5/en-sent.bin: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/data61/dataFusion/4b766ae65254eed6c5a095f3bb8b6b42b9500a9c/dataFusion-ner/src/main/resources/opennlp-models-1.5/en-sent.bin -------------------------------------------------------------------------------- /dataFusion-ner/src/main/resources/opennlp-models-1.5/en-token.bin: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/data61/dataFusion/4b766ae65254eed6c5a095f3bb8b6b42b9500a9c/dataFusion-ner/src/main/resources/opennlp-models-1.5/en-token.bin -------------------------------------------------------------------------------- /dataFusion-ner/src/main/scala/au/csiro/data61/dataFusion/ner/MITIE.scala: -------------------------------------------------------------------------------- 1 | package au.csiro.data61.dataFusion.ner 2 | 3 | import java.io.{ File, FileNotFoundException } 4 | import java.nio.charset.Charset 5 | 6 | import scala.language.{ implicitConversions, reflectiveCalls } 7 | 8 | import com.typesafe.config.ConfigFactory 9 | import com.typesafe.scalalogging.Logger 10 | 11 | import edu.mit.ll.mitie.{ NamedEntityExtractor, global } 12 | 13 | import au.csiro.data61.dataFusion.common.Data.Ner 14 | import java.nio.charset.Charset 15 | import au.csiro.data61.dataFusion.common.Timer 16 | 17 | /** 18 | * MITIE https://github.com/mit-nlp/MITIE 19 | * is C++ code that can be compiled to use optimised BLAS implementations (or use its own). 20 | * The Java wrapper is not available in maven repos so this entails sbt unmanaged jar in lib 21 | * and loading a native shared library. 
22 | */ 23 | object MITIE { 24 | val log = Logger(getClass) 25 | val conf = ConfigFactory.load 26 | val utf8 = Charset.forName("UTF-8") 27 | 28 | log.info(s"loading MITIE native library") 29 | System.loadLibrary("javamitie") 30 | 31 | // the mitie.*Vector classes declare no common interface, so we resort to a "structural type" (i.e. duck type) 32 | implicit def toIter[T](v: { def get(i: Int): T; def size(): Long }) = new Iterator[T] { 33 | var i = 0 34 | override def hasNext = i < v.size 35 | override def next = { 36 | val n = v.get(i) 37 | i += 1 38 | n 39 | } 40 | } 41 | 42 | // // http://stackoverflow.com/questions/15038616/how-to-convert-between-character-and-byte-position-in-objective-c-c-c 43 | // // map a UTF-8 byte to it's width in UTF-16 ints: 44 | // // - leading byte of 1-3 byte UTF-8 chars -> 1 (these chars map to 1 UTF-16 int) 45 | // // - leading byte of 4 byte UTF-8 chars -> 2 (these chars map to 2 UTF-16 ints) 46 | // // - extension bytes -> 0 47 | // val utf16width = (0 until 256 map { 48 | // case i if Seq(0 to 0x7f, 0xc0 to 0xdf, 0xe0 to 0xef).exists(_.contains(i)) => 1 49 | // case i if (0xf0 to 0xf7).contains(i) => 2 50 | // case _ => 0 51 | // }).toArray 52 | // def javaOffset2(utf8Str: Array[Byte], from: Int, until: Int) = { 53 | // var x = 0 54 | // for (i <- from until until) { 55 | // x += utf16width(utf8Str(i) & 0xff) // prevent byte to int sign extension 56 | // } 57 | // x 58 | // } 59 | 60 | // Timing of this impl compared to above: 61 | // short strings: javaOffset 5.2 secs; javaOffset2 7.3 secs 62 | // long strings: javaOffset 2.891 secs; javaOffset2 2.892 secs 63 | // so there is no justification for the complexity of javaOffset2 64 | def javaOffset(utf8Str: Array[Byte], from: Int, until: Int) = new String(utf8Str, from, until - from, utf8).length 65 | 66 | class Nlp(path: String) { 67 | if (!new File(path).canRead) throw new FileNotFoundException(s"Can't read $path") 68 | log.info(s"loading MITIE model $path") 69 | 70 | // multi-threading test shows that NamedEntityExtractor is not thread-safe 71 | // MITIE comes with a generic non-thread-safe warning in: https://github.com/mit-nlp/MITIE/blob/master/mitielib/include/mitie.h 72 | val neExtractor = new ThreadLocal[NamedEntityExtractor] { 73 | override protected def initialValue = new NamedEntityExtractor(path) 74 | } 75 | val neTypes = neExtractor.get.getPossibleNerTags.toIndexedSeq 76 | log.debug(s"Nlp: posible ner types from $path are $neTypes") 77 | 78 | // val t = Timer() 79 | 80 | def ner(in: String) = { 81 | 82 | val inUtf8 = in.getBytes(utf8) // MITIE's offsets are relative to this 83 | // we get the NERs in order of increasing offset, so we can calculate the offsets incrementally 84 | var o8 = 0 85 | var o16 = 0 86 | def toJavaOffset(utf8Off: Int) = { 87 | if (utf8Off < o8) { 88 | log.debug(s"resetting o8, o16: utf8Off = $utf8Off, o8 = $o8, o16 = $o16") 89 | o8 = 0 90 | o16 = 0 91 | } 92 | o16 += javaOffset(inUtf8, o8, utf8Off) 93 | o8 = utf8Off 94 | o16 95 | } 96 | 97 | val words = global.tokenizeWithOffsets(in) // multi-threading test appears to show that this is thread-safe 98 | neExtractor.get.extractEntities(words).map { e => 99 | val offStrUtf8 = words.get(e.getStart).getIndex.toInt 100 | val offStr = toJavaOffset(offStrUtf8) 101 | val end = words.get(e.getEnd - 1) 102 | val offEnd = toJavaOffset(end.getIndex.toInt) + end.getToken.length 103 | Ner( 104 | e.getStart, e.getEnd, offStr, offEnd, 105 | e.getScore, in.substring(offStr, offEnd), 106 | neTypes(e.getTag), "MITIE", None 107 | ) 108 | 
}.toList 109 | } 110 | } 111 | 112 | object English { 113 | val nlp = new Nlp(conf.getString("mitie.englishNerModel")) 114 | } 115 | 116 | // object Spanish { 117 | // val nlp = new Nlp(conf.getString("mitie.spanishNerModel")) 118 | // } 119 | 120 | // def ner(lang: String, in: String): List[Ner] = 121 | // lang match { 122 | // case "es" => { 123 | // // log.debug("Spanish") 124 | // Spanish.nlp.ner(in) 125 | // } 126 | // case _ => { 127 | // // log.debug("English") 128 | // English.nlp.ner(in) 129 | // } 130 | // } 131 | def ner(lang: String, in: String): List[Ner] = English.nlp.ner(in) 132 | } 133 | -------------------------------------------------------------------------------- /dataFusion-ner/src/main/scala/au/csiro/data61/dataFusion/ner/OpenNLP.scala: -------------------------------------------------------------------------------- 1 | package au.csiro.data61.dataFusion.ner 2 | 3 | import java.io.InputStream 4 | import java.util.Properties 5 | 6 | import scala.collection.JavaConverters._ 7 | 8 | import com.typesafe.scalalogging.Logger 9 | 10 | import edu.stanford.nlp.ling.CoreAnnotations.{ SentencesAnnotation, TokensAnnotation } 11 | import edu.stanford.nlp.pipeline.Annotator.{ STANFORD_SSPLIT, STANFORD_TOKENIZE } 12 | import edu.stanford.nlp.pipeline.StanfordCoreNLP 13 | import opennlp.tools.namefind.{ NameFinderME, TokenNameFinderModel } 14 | import opennlp.tools.sentdetect.{ SentenceDetectorME, SentenceModel } 15 | import opennlp.tools.tokenize.{ TokenizerME, TokenizerModel } 16 | import resource.managed 17 | 18 | import au.csiro.data61.dataFusion.common.Data.Ner 19 | 20 | object OpenNLP { 21 | val log = Logger(getClass) 22 | 23 | def loadModel[M](path: String, ctor: InputStream => M) = { 24 | log.info(s"loading OpenNLP model $path") 25 | managed(getClass.getResourceAsStream(path)).map(ctor).tried.get 26 | } 27 | 28 | // *Model's are thread-safe 29 | object English { 30 | val sentence = loadModel("/opennlp-models-1.5/en-sent.bin", in => new SentenceModel(in)) 31 | val tokenizer = loadModel("/opennlp-models-1.5/en-token.bin", in => new TokenizerModel(in)) 32 | val ners = Seq("date", "location", "money", "organization", "percentage", "person", "time").map { typ => 33 | loadModel(s"/opennlp-models-1.5/en-ner-${typ}.bin", in => new TokenNameFinderModel(in)) 34 | } 35 | } 36 | 37 | // object Spanish { 38 | // /** Spanish sentence & tokenizer models used in training Spanish NameFinder models are not available, 39 | // * so use CoreNLP (just for this) and hope it is not too different! 
40 | // */ 41 | // val coreNLP = managed(getClass.getResourceAsStream("/StanfordCoreNLP-spanish.properties")).map { in => 42 | // val p = new Properties 43 | // p.load(in) 44 | // p.setProperty("annotators", Seq(STANFORD_TOKENIZE, STANFORD_SSPLIT).mkString(", ")) 45 | // CoreNLP.synchronized { new StanfordCoreNLP(p, true) } // synchronized else multi-threaded sbt test fails 46 | // }.tried.get 47 | // 48 | // val ners = Seq("location", "organization", "person", "misc").map { typ => 49 | // loadModel(s"/opennlp-models-1.5/es-ner-${typ}.bin", in => new TokenNameFinderModel(in)) 50 | // } 51 | // } 52 | 53 | /** 54 | * Not thread-safe 55 | */ 56 | class EnOpenNLP { 57 | // because *ME's are not thread-safe 58 | val sent = new SentenceDetectorME(English.sentence) 59 | val tok = new TokenizerME(English.tokenizer) 60 | val ners = English.ners.map(new NameFinderME(_)) 61 | 62 | def ner(in: String): List[Ner] = { 63 | var tokenIdx = 0; 64 | val r = for { 65 | sentencePos <- sent.sentPosDetect(in) 66 | sentence = in.substring(sentencePos.getStart, sentencePos.getEnd) 67 | pos = tok.tokenizePos(sentence) 68 | tIdx = tokenIdx 69 | _ = tokenIdx += pos.size // start of next sentence 70 | tokens = pos.map(s => sentence.substring(s.getStart, s.getEnd)) 71 | ner <- ners 72 | s <- ner.find(tokens) 73 | start = sentencePos.getStart + pos(s.getStart).getStart 74 | end = sentencePos.getStart + pos(s.getEnd - 1).getEnd 75 | } yield Ner(tIdx + s.getStart, tIdx + s.getEnd, start, end, s.getProb, in.substring(start, end), s.getType.toUpperCase, "OpenNLP", None) 76 | 77 | ners.foreach(_.clearAdaptiveData) 78 | r.toList 79 | } 80 | } 81 | val enOpenNLP = new ThreadLocal[EnOpenNLP] { 82 | override protected def initialValue = new EnOpenNLP 83 | } 84 | 85 | 86 | /** 87 | * Not thread-safe 88 | */ 89 | // class EsOpenNLP { 90 | // // because *ME are not thread-safe 91 | // val ners = Spanish.ners.map(new NameFinderME(_)) 92 | // 93 | // def ner(in: String): List[Ner] = { 94 | // var tokenIdx = 0; 95 | // val r = for { 96 | // sentence <- Spanish.coreNLP.process(in).get(classOf[SentencesAnnotation]).asScala 97 | // tokens = sentence.get(classOf[TokensAnnotation]).asScala.toArray 98 | // tIdx = tokenIdx 99 | // _ = tokenIdx += tokens.size // token index of start of next sentence 100 | // ner <- ners 101 | // s <- ner.find(tokens.map(_.originalText)) 102 | // start = tokens(s.getStart).beginPosition 103 | // end = tokens(s.getEnd - 1).endPosition 104 | // } yield Ner(tIdx + s.getStart, tIdx + s.getEnd, start, end, s.getProb, in.substring(start, end), s.getType.toUpperCase, "OpenNLP") 105 | // 106 | // ners.foreach(_.clearAdaptiveData) 107 | // r.toList 108 | // } 109 | // } 110 | // val esOpenNLP = new ThreadLocal[EsOpenNLP] { 111 | // override protected def initialValue = new EsOpenNLP 112 | // } 113 | 114 | /** thread-safe */ 115 | // def ner(lang: String, in: String): List[Ner] = 116 | // lang match { 117 | // case "es" => { 118 | // // log.debug("Spanish") 119 | // esOpenNLP.get.ner(in) 120 | // } 121 | // case _ => { 122 | // // log.debug("English") 123 | // enOpenNLP.get.ner(in) 124 | // } 125 | // } 126 | def ner(lang: String, in: String): List[Ner] = enOpenNLP.get.ner(in) 127 | } 128 | -------------------------------------------------------------------------------- /dataFusion-ner/src/main/scala/au/csiro/data61/dataFusion/ner/Split.scala: -------------------------------------------------------------------------------- 1 | package au.csiro.data61.dataFusion.ner 2 | 3 | import com.typesafe.scalalogging.Logger 4 | 5 
| object Split { 6 | private val log = Logger(getClass) 7 | 8 | val allAlpha = """\p{Alpha}{3,}""".r 9 | val hasVowel = """(?i)[aeiou]""".r.unanchored 10 | val hasConsonant = """(?i)[a-z&&[^aeiouy]]""".r.unanchored 11 | 12 | def wordLike(word: String) = 13 | (for { 14 | a <- allAlpha.unapplySeq(word) 15 | b <- hasVowel.unapplySeq(word) 16 | c <- hasConsonant.unapplySeq(word) 17 | } yield (a, b, c)).isDefined 18 | 19 | def containsWordLike(line: String) = line.split(" +") exists wordLike 20 | 21 | /** 22 | * CoreNLP doesn't terminate on long input, so split on lines where containsWordLike is false 23 | * (that includes blank lines so it splits on paragraphs), 24 | * but don't split again for the next splitmin lines. 25 | * Paragraphs longer than splitmax are split in segments of splitmax lines, without considering sentence breaks. 26 | * This is a bit drastic, but less so than non-terminating processing. 27 | * This handles spreadsheet data with potentially very long "paragraphs". 28 | */ 29 | def splitParagraphs(lines: IndexedSeq[String], splitmin: Int, splitmax: Int): Iterator[(Int, Int, String)] = { 30 | val splits = (for ((l, i) <- lines.zipWithIndex if !containsWordLike(l)) yield i).toList :+ lines.size 31 | log.debug(s"main: splits = $splits") 32 | val splitsFiltered = (splits.foldLeft((splitmin, 0, List(0))){ case ((maxi, prev, result), x) => 33 | log.debug(s"splitParagraphs: x = $x, maxi = $maxi") 34 | if (x <= maxi) { 35 | (maxi, x, result) // drop values within range of splitmin 36 | } else { 37 | val z = ((prev + splitmax) until x by splitmax).toList :+ x // if bigger than splitmax then split every splitmax 38 | (x + splitmin, x, z.reverse ++ result) 39 | } 40 | })._3.reverse 41 | log.debug(s"splitParagraphs: splitsFiltered = $splitsFiltered") 42 | if (splitsFiltered.size < 2) { 43 | Iterator.single((0, lines.size, lines.mkString("", "\n", "\n"))) 44 | } else { 45 | splitsFiltered.sliding(2).flatMap { 46 | case a :: b :: Nil if a < b => List((a, b, lines.slice(a, b).mkString("", "\n", "\n"))) // include trailing \n so concatenating splits gives original String 47 | case _ => List.empty 48 | } 49 | } 50 | } 51 | 52 | // def main(args: Array[String]): Unit = { 53 | // implicit val utf8 = io.Codec.UTF8 54 | // val lines = io.Source.fromFile("/home/bac003/sw/submissions/data/submissions/sub-043-part-1.txt").getLines.take(100).toIndexedSeq 55 | // val splits = split(lines, 5).toList 56 | // log.debug(s"main: splits = $splits") 57 | // } 58 | } -------------------------------------------------------------------------------- /dataFusion-ner/src/test/resources/logback-test.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | %d{yyyy-MM-dd HH:mm:ss.SSS} [%thread] %-5level %logger{36} - %msg%n 6 | 7 | 8 | 9 | 10 | ner-test.log 11 | 12 | %d{yyyy-MM-dd HH:mm:ss.SSS} [%thread] %-5level %logger{36} - %msg%n 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | -------------------------------------------------------------------------------- /dataFusion-ner/src/test/scala/au/csiro/data61/dataFusion/ner/CoreNLPTest.scala: -------------------------------------------------------------------------------- 1 | package au.csiro.data61.dataFusion.ner 2 | 3 | import org.scalatest.{ Finders, FlatSpec, Matchers } 4 | 5 | import com.typesafe.scalalogging.Logger 6 | 7 | import au.csiro.data61.dataFusion.common.Data.Ner 8 | import CoreNLP._ 9 | 10 | class CoreNLPTest extends FlatSpec with Matchers { 11 | val log = Logger(getClass) 12 | 13 | val en ="en" 14 | 
val enTxt = """The Clinton Engineer Works was the site of the Manhattan Project's World War II production facilities that provided the enriched uranium used in the bombing of Hiroshima in August 1945. 15 | 16 | Its X-10 Graphite Reactor produced the first samples of plutonium from a reactor. 17 | 18 | Located just south of the town of Clinton, Tennessee, it included the production facilities of the K-25, Y-12 and S-50 projects, various utilities, and the township of Oak Ridge. 19 | 20 | The Manhattan District Engineer, Kenneth Nichols, moved the Manhattan District headquarters there from Manhattan in August 1943. 21 | """ 22 | 23 | // val es = "es" 24 | // val esTxt = """Cristóbal Colón, Cristoforo Colombo en italiano o Christophorus Columbus en latín (Génova,n. 1 1 2 c. 1436-14513 -Valladolid, 20 de mayo de 1506), fue un navegante, cartógrafo, almirante, virrey y gobernador general de las Indias Occidentales al servicio de la Corona de Castilla. 25 | // 26 | //Es famoso por haber realizado el descubrimiento de América, el 12 de octubre de 1492, al llegar a la isla de Guanahani, actualmente en las Bahamas. 27 | // 28 | //Efectuó cuatro viajes a las Indias —denominación del continente americano hasta la publicación del Planisferio de Martín Waldseemüller en 1507— y aunque posiblemente no fue el primer explorador europeo de América, se le considera el descubridor de un nuevo continente —por eso llamado el Nuevo Mundo— para Europa, al ser el primero que trazó una ruta de ida y vuelta a través del océano Atlántico y dio a conocer la noticia. 29 | // 30 | //Este hecho impulsó decisivamente la expansión mundial de la civilización europea, y la conquista y colonización por varias de sus potencias del continente americano. 31 | //""" 32 | 33 | "CoreNLP NER" should "get English entities" in { 34 | val ners = nerSplitParagraphs(en, enTxt, 1, 1) // split into small chunks 35 | log.debug(s"ners = ${ners}") 36 | // 3.9.1 has many new tags over 3.8.0, e.g. CITY, STATE_OR_PROVINCE (rather than just LOCATION), TITLE for job title, CAUSE_OF_DEATH (for "war"), CRIMINAL_CHARGE for "bombing" 37 | assert(ners.contains(Ner(12, 15, 67, 79, 1.0, "World War II", "MISC", "CoreNLP", None))) 38 | assert(ners.contains(Ner(98, 100, 566, 577, 1.0, "August 1943", "DATE", "CoreNLP", None))) 39 | } 40 | 41 | // CoreNLP 3.8.0, 3.9.1 fail this test 42 | // For ATO project we built corenlp from latest in github on 2017-09-22 (while 3.8.0 was the current release) and this passed. 43 | // it should "handle no space between digits and mutiplier" in { 44 | // for (mult <- Seq("hundred", "thousand", "million", "billion", "trillion")) { 45 | // val text = "Henry bought Sally a new car for $3.75" + mult + " for her birthday." 
46 | // val ners = ner("en", text) 47 | // log.debug(s"text = $text, ners = ${ners}") 48 | // assert(ners.exists(_.typ == "MONEY")) 49 | // } 50 | // } 51 | 52 | // it should "get Spanish entities" in { 53 | // val ners = nerSplit(es, esTxt, 1) // split into small chunks 54 | // log.debug(s"ners = ${ners}") 55 | // assert(ners.contains(Ner(0, 2, 0, 15, 1.0, "Cristóbal Colón", "PERSON", "CoreNLP"))) 56 | // assert(ners.contains(Ner(141, 142, 737, 743, 1.0, "Europa", "LOCATION", "CoreNLP"))) 57 | // } 58 | // 59 | // it should "get Spanish entities in mutiple threads" in { 60 | // val expected = nerSplit(es, esTxt, 1) // split into small chunks 61 | // 62 | // val r = new Runnable { 63 | // override def run = { 64 | // val ners = ner(es, esTxt) // would split to 100 lines, but text is smaller than that, so no split 65 | // ners should be(expected) 66 | // } 67 | // } 68 | // val threads = Iterator.range(0, 8).map { _ => 69 | // val t = new Thread(r) 70 | // t.start 71 | // t 72 | // }.toList 73 | // threads.foreach(_.join) 74 | // } 75 | } -------------------------------------------------------------------------------- /dataFusion-ner/src/test/scala/au/csiro/data61/dataFusion/ner/OpenNLPTest.scala: -------------------------------------------------------------------------------- 1 | package au.csiro.data61.dataFusion.ner 2 | 3 | import org.scalatest.{ FlatSpec, Matchers } 4 | 5 | import com.typesafe.scalalogging.Logger 6 | 7 | import au.csiro.data61.dataFusion.common.Data.Ner 8 | import OpenNLP.ner 9 | 10 | class OpennlpNerTest extends FlatSpec with Matchers { 11 | val log = Logger(getClass) 12 | 13 | val en = "en" 14 | val enTxt = """The Clinton Engineer Works was the site of the Manhattan Project's World War II production facilities that provided the enriched uranium used in the bombing of Hiroshima in August 1945. Its X-10 Graphite Reactor produced the first samples of plutonium from a reactor. Located just south of the town of Clinton, Tennessee, it included the production facilities of the K-25, Y-12 and S-50 projects, various utilities, and the township of Oak Ridge. The Manhattan District Engineer, Kenneth Nichols, moved the Manhattan District headquarters there from Manhattan in August 1943. """ 15 | 16 | val es = "es" 17 | val esTxt = """Cristóbal Colón, Cristoforo Colombo en italiano o Christophorus Columbus en latín (Génova,n. 1 1 2 c. 1436-14513 -Valladolid, 20 de mayo de 1506), fue un navegante, cartógrafo, almirante, virrey y gobernador general de las Indias Occidentales al servicio de la Corona de Castilla. Es famoso por haber realizado el descubrimiento de América, el 12 de octubre de 1492, al llegar a la isla de Guanahani, actualmente en las Bahamas. 18 | Efectuó cuatro viajes a las Indias —denominación del continente americano hasta la publicación del Planisferio de Martín Waldseemüller en 1507— y aunque posiblemente no fue el primer explorador europeo de América, se le considera el descubridor de un nuevo continente —por eso llamado el Nuevo Mundo— para Europa, al ser el primero que trazó una ruta de ida y vuelta a través del océano Atlántico y dio a conocer la noticia. 
Este hecho impulsó decisivamente la expansión mundial de la civilización europea, y la conquista y colonización por varias de sus potencias del continente americano.""" 19 | 20 | "OpenNLP NER" should "get English entities" in { 21 | val ners = ner(en, enTxt) 22 | log.debug(s"ners = $ners") 23 | assert(ners.map(_.copy(score = 1.0)).contains(Ner(78, 80, 436, 445, 1.0, "Oak Ridge", "LOCATION", "OpenNLP", None))) 24 | } 25 | 26 | // it should "get Spanish entities" in { 27 | // val ners = ner(es, esTxt) 28 | // log.debug(s"ners = ${ners}") 29 | // assert(ners.map(_.copy(score = 1.0)).contains(Ner(80, 81, 390, 399, 1.0, "Guanahani", "LOCATION", "OpenNLP"))) 30 | // } 31 | // 32 | // it should "get Spanish entities in mutiple threads" in { 33 | // val expected = ner(es, esTxt) 34 | // 35 | // val r = new Runnable { 36 | // override def run = { 37 | // val ners = ner(es, esTxt) 38 | // ners should be(expected) 39 | // } 40 | // } 41 | // val threads = Iterator.range(0, 8).map { _ => 42 | // val t = new Thread(r) 43 | // t.start 44 | // t 45 | // }.toList 46 | // threads.foreach(_.join) 47 | // } 48 | } -------------------------------------------------------------------------------- /dataFusion-ner/src/test/scala/au/csiro/data61/dataFusion/ner/SplitTest.scala: -------------------------------------------------------------------------------- 1 | package au.csiro.data61.dataFusion.ner 2 | 3 | import org.scalatest.{ Finders, FlatSpec, Matchers } 4 | 5 | import com.typesafe.scalalogging.Logger 6 | import Split._ 7 | 8 | class SplitTest extends FlatSpec with Matchers { 9 | val log = Logger(getClass) 10 | val wrdLike = Seq("word", "Word", "zzza") 11 | val nonWrdLike = Seq("", "an", "zebra.", "sky", "aeiou") 12 | 13 | "wordLike" should "detect tokens looking like words" in { 14 | for (w <- wrdLike) assert(wordLike(w)) 15 | } 16 | 17 | it should "reject words < 3 chars, with non alpha chars, with no vowels or no consonants" in { 18 | for (w <- nonWrdLike) assert(!wordLike(w)) 19 | } 20 | 21 | "containsWordLike" should "detect lines containing a wordLike" in { 22 | for { 23 | i <- 0 to nonWrdLike.size // index where we'll insert the wrdLike 24 | w <- wrdLike 25 | } { 26 | val line = (nonWrdLike.take(i) ++ Seq(w) ++ nonWrdLike.drop(i)).mkString(" ") 27 | assert(containsWordLike(line)) 28 | } 29 | } 30 | 31 | it should "reject lines with no wordLike" in { 32 | for (line <- Seq("", nonWrdLike.mkString(" "))) assert(!containsWordLike(line)) 33 | } 34 | 35 | val longText = """ 36 | Fiction House apparently made the decision to launch Planet Stories 37 | so quickly that there was little time for Reiss to obtain new stories, 38 | so he worked with Julius Schwartz and other authors' agents to fill the 39 | first issue. The results were unremarkable, but Reiss was energetic, and 40 | was able to improve the quality of fiction in succeeding issues, though 41 | he occasionally apologized to the readers for printing weak material. 42 | The magazine was exclusively focused on interplanetary adventures, 43 | often taking place in primitive societies that would now be regarded as 44 | "sword and sorcery" settings, and was aimed at a young readership; the 45 | result was a mixture of what became known as space opera and planetary 46 | romances—melodramatic tales of action and adventure on alien planets 47 | and in interplanetary space. Planet relied on a few authors to 48 | provide the bulk of its fiction in the early years, with Nelson Bond 49 | providing eight lead stories, some of them novels. 
Fourteen more were 50 | written by Ray Cummings and Ross Rocklynne; and Leigh Brackett was also 51 | a regular contributor, with seventeen stories in total published over 52 | the lifetime of the magazine. 53 | 54 | The letter column in Planet was titled "The Vizigraph"; it was very 55 | active, with long letters from an engaged readership. It often printed 56 | letters from established writers, and from fans who would go on to become 57 | well known professionally: Damon Knight's letters are described by sf 58 | historian Mike Ashley as "legendary"; and Robert Silverberg commented 59 | in a letter in the Summer 1950 issue that Ray Bradbury "certainly gets 60 | some original ideas, if not good ones". The editors put a good 61 | deal of effort into keeping the letter column friendly and lively; 62 | contemporary writer and editor Robert Lowndes recalls that "Reiss was 63 | sincere and urbane; Wilbur [Peacock] enjoyed taking his coat off and 64 | being one of the crowd".""" 65 | 66 | "splitParagraphs" should "split paragraphs" in { 67 | val paras = splitParagraphs(longText.split("\n"), 3, 200).toList 68 | // log.debug(s"paras = $paras") 69 | paras.map(x => (x._1, x._2)) should be(List((0, 18), (18, 30))) 70 | } 71 | 72 | it should "further split long paragraphs" in { 73 | val paras = splitParagraphs(longText.split("\n"), 3, 8).toList 74 | // log.debug(s"paras = $paras") 75 | paras.map(x => (x._1, x._2)) should be(List((0, 8), (8, 16), (16, 18), (18, 26), (26, 30))) 76 | } 77 | } -------------------------------------------------------------------------------- /dataFusion-search-service/README.md: -------------------------------------------------------------------------------- 1 | # dataFusion-search-service 2 | 3 | ## Introduction 4 | 5 | This project provides [RESTful](https://en.wikipedia.org/wiki/Representational_state_transfer) web services based on [dataFusion-search](../dataFusion-search). 6 | 7 | ## Build, Configuration, Running and Swagger Support 8 | 9 | See the top level [README](../README.md). This will not run concurrently with the dataFusion-search CLI, unless they are configured to use different search indices, because Lucene takes an exclusive lock on its index. 
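
As with the other dataFusion services, a running instance can be sanity-checked via its Swagger description. This is a hypothetical check: the default port 8087 comes from this project's `application.conf` (overridable via `SEARCH_HTTP_PORT`), and the `/api-docs/swagger.json` path is assumed to follow the same convention as the other services.

```sh
# Hypothetical sanity check against a locally running search service (default port 8087).
curl http://localhost:8087/api-docs/swagger.json
```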
10 | 11 | -------------------------------------------------------------------------------- /dataFusion-search-service/build.sbt: -------------------------------------------------------------------------------- 1 | name := "dataFusion-search-service" 2 | 3 | libraryDependencies ++= Seq( 4 | "com.github.swagger-akka-http" %% "swagger-akka-http" % "0.9.1", 5 | "com.typesafe.akka" %% "akka-http-spray-json" % "10.0.7", 6 | "ch.megard" %% "akka-http-cors" % "0.2.1", 7 | "com.github.scopt" %% "scopt" % "3.5.0", 8 | "com.jsuereth" %% "scala-arm" % "2.0", 9 | "org.scalatest" %% "scalatest" % "3.0.0" % "test" 10 | ) 11 | 12 | com.github.retronym.SbtOneJar.oneJarSettings 13 | 14 | mainClass in Compile := Some("au.csiro.data61.dataFusion.search.service.Main") 15 | -------------------------------------------------------------------------------- /dataFusion-search-service/src/main/resources/application.conf: -------------------------------------------------------------------------------- 1 | http = { 2 | host = "0.0.0.0" 3 | port = 8087 4 | 5 | host = ${?SEARCH_HTTP_HOST} 6 | port = ${?SEARCH_HTTP_PORT} 7 | } -------------------------------------------------------------------------------- /dataFusion-search-service/src/main/resources/logback.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | search-service.log 11 | 12 | %d{yyyy-MM-dd HH:mm:ss.SSS} [%thread] %-5level %logger{36} - %msg%n 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | -------------------------------------------------------------------------------- /dataFusion-search-service/src/test/resources/logback-test.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | %d{yyyy-MM-dd HH:mm:ss.SSS} [%thread] %-5level %logger{36} - %msg%n 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | -------------------------------------------------------------------------------- /dataFusion-search/3rd-party-licenses.md: -------------------------------------------------------------------------------- 1 | # datafusion-search-licenses 2 | 3 | Category | License | Dependency | Notes 4 | --- | --- | --- | --- 5 | Apache | [Apache 2](http://www.apache.org/licenses/LICENSE-2.0.txt) | io.spray # spray-json_2.12 # 1.3.3 | 6 | Apache | [Apache 2.0 License](http://www.apache.org/licenses/LICENSE-2.0.html) | com.typesafe.scala-logging # scala-logging_2.12 # 3.5.0 | 7 | Apache | [Apache License, Version 2.0](http://www.apache.org/licenses/LICENSE-2.0) | com.typesafe # config # 1.3.1 | 8 | Apache | [Apache License, Version 2.0](http://www.apache.org/licenses/LICENSE-2.0.txt) | commons-io # commons-io # 2.5 | 9 | Apache | [The Apache Software License, Version 2.0](http://www.apache.org/licenses/LICENSE-2.0.txt) | com.google.guava # guava # 18.0 | 10 | Apache | [The Apache Software License, Version 2.0](http://www.apache.org/licenses/LICENSE-2.0.txt) | org.apache.lucene # lucene-analyzers-common # 7.0.1 | 11 | Apache | [The Apache Software License, Version 2.0](http://www.apache.org/licenses/LICENSE-2.0.txt) | org.apache.lucene # lucene-core # 7.0.1 | 12 | Apache | [The Apache Software License, Version 2.0](http://www.apache.org/licenses/LICENSE-2.0.txt) | org.apache.lucene # lucene-highlighter # 7.0.1 | 13 | Apache | [The Apache Software License, Version 2.0](http://www.apache.org/licenses/LICENSE-2.0.txt) | org.apache.lucene # lucene-join # 7.0.1 | 14 | Apache | [The Apache Software License, Version 
2.0](http://www.apache.org/licenses/LICENSE-2.0.txt) | org.apache.lucene # lucene-memory # 7.0.1 | 15 | Apache | [The Apache Software License, Version 2.0](http://www.apache.org/licenses/LICENSE-2.0.txt) | org.apache.lucene # lucene-queries # 7.0.1 | 16 | Apache | [The Apache Software License, Version 2.0](http://www.apache.org/licenses/LICENSE-2.0.txt) | org.apache.lucene # lucene-queryparser # 7.0.1 | 17 | Apache | [The Apache Software License, Version 2.0](http://www.apache.org/licenses/LICENSE-2.0.txt) | org.apache.lucene # lucene-sandbox # 7.0.1 | 18 | Apache | [the Apache License, ASL Version 2.0](http://www.apache.org/licenses/LICENSE-2.0) | org.scalactic # scalactic_2.12 # 3.0.0 | 19 | Apache | [the Apache License, ASL Version 2.0](http://www.apache.org/licenses/LICENSE-2.0) | org.scalatest # scalatest_2.12 # 3.0.0 | 20 | BSD | [BSD 3-Clause](http://www.scala-lang.org/license.html) | org.scala-lang # scala-library # 2.12.2 | 21 | BSD | [BSD 3-Clause](http://www.scala-lang.org/license.html) | org.scala-lang # scala-reflect # 2.12.2 | 22 | BSD | [BSD 3-clause](http://opensource.org/licenses/BSD-3-Clause) | org.scala-lang.modules # scala-parser-combinators_2.12 # 1.0.4 | 23 | BSD | [BSD 3-clause](http://opensource.org/licenses/BSD-3-Clause) | org.scala-lang.modules # scala-xml_2.12 # 1.0.5 | 24 | BSD | [BSD-Style](http://www.opensource.org/licenses/bsd-license.php) | com.jsuereth # scala-arm_2.12 # 2.0 | 25 | GPL | [GPL](https://www.gnu.org/licenses/gpl-3.0.en.html) | au.csiro.data61 # datafusion-common_2.12 # 1.1-SNAPSHOT | 26 | LGPL | [EPL + GNU Lesser General Public License](http://logback.qos.ch/license.html) | ch.qos.logback # logback-classic # 1.2.3 | 27 | LGPL | [EPL + GNU Lesser General Public License](http://logback.qos.ch/license.html) | ch.qos.logback # logback-core # 1.2.3 | 28 | MIT | [MIT License](http://www.opensource.org/licenses/mit-license.php) | com.github.scopt # scopt_2.12 # 3.5.0 | 29 | MIT | [MIT License](http://www.slf4j.org/license.html) | org.slf4j # slf4j-api # 1.7.25 | 30 | 31 | -------------------------------------------------------------------------------- /dataFusion-search/README.md: -------------------------------------------------------------------------------- 1 | # dataFusion-search 2 | 3 | ## Introduction 4 | This project provides: 5 | - a search library 6 | - an indexer 7 | - a multi-threaded CLI (command line interface) for high performance bulk searching for known entities 8 | - other specialised command line tools (see --help) 9 | 10 | Search results are at the level of embedded document (e.g. a main document with embIdx = -1 or a specific embedded document with embIdx >= 0). Please see [Search Result JSON format](../dataFusion-common#search-result-json-format) for details of the output. 11 | 12 | ## Indexing 13 | The `--index` CLI option creates the search index (at a location specified in [configuration](../README.md#configuration)). The input is in the [Document JSON format](../dataFusion-common#document-json-format) with the `content` and `embedded[].content` fields containing the text which is searched. The `meta` and `ner` data (again both main and embedded) is also separately indexed and can be searched using the [dataFusion-search-service](./dataFusion-search-service). 
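
A hypothetical bulk indexing run is sketched below. The environment variable names and default index locations come from this project's `application.conf`, and the indexer reads one Document JSON object per line on stdin; the jar name and launch command are only illustrative, as packaging and running are covered by the top level README.

```sh
# Override the default index locations (./docIndex, ./metaIndex, ./nerIndex).
export SEARCH_DOC_INDEX=/data/search/docIndex
export SEARCH_META_INDEX=/data/search/metaIndex
export SEARCH_NER_INDEX=/data/search/nerIndex

# Feed one Document JSON object per line on stdin (jar name shown is illustrative).
java -jar dataFusion-search-one-jar.jar --index < docs.json
```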
14 | 15 | ## Search Strategy 16 | ### Tokenization and Punctuation 17 | Lucene's default `StandardTokenizer` removes punctuation, but as some organizations use punctuation as significant parts of their name this project uses Lucene's `WhitespaceTokenizer` and `LowerCaseFilter` with a custom `TrailingPunctuationFilter` to remove trailing commas, full stops etc. for a search which is case insensitive, but sensitive to non-trailing punctuation. 18 | ### Synonyms 19 | Lucene's `SynonymGraphFilter` is used to map synonyms specified in a file `synonyms.txt` (the location is specified in [configuration](../README.md#configuration)), initially set to map "proprietary" to "pty" and "limited" to "ltd", but can be updated by the user. The synonym mapping should be consistent for indexing and searching. 20 | ### Organizations 21 | A search hit must match all tokens in the query with tokens in the same order. 22 | ### People 23 | A search hit must match all tokens in the query, but the tokens may appear in any order. 24 | A phrase search for unordered terms (e.g. for PERSON|PERSON2) produces spurious matches where all terms are matched but not with the correct number of occurrences e.g. “Aaron H Aaron” matches “Aaron H H”. Fetching the text to check the number of occurrences would negatively impact the performance of the search, so this check is deferred to `dataFusion-util --hits` processing where the text is already available. Consequently the Search Result JSON (hits.json) contains the spurious matches, but they are filtered out from gaz.json. 25 | ### Scoring 26 | A query is assigned an [IDF](https://en.wikipedia.org/wiki/Tf%E2%80%93idf#Inverse_document_frequency) score calculated using [Lucene’s formula](https://lucene.apache.org/core/7_1_0/core/org/apache/lucene/search/similarities/TFIDFSimilarity.html). If this score is below the threshold set by the `--minScore` CLI option (default 3.5) then the query is deemed to be insufficiently distiguishing and is skipped. 27 | ### Query Generation from CSV 28 | The `--searchCsv` option generates queries from CSV data. If a type field is 'BUS' the record represents an organization, otherwise it represent a person. 29 | - People's names are expected to be segmented into 3 fields for the person's family, first given and other names. 30 | - Where the 3 name fields for a person are non-blank a query is generated to search for all tokens in the name. The query and any resultant hits have `typ=PERSON`. 31 | - Where the first and family name fields are non-blank (whether or not the other names field is non-blank) a query is generated to search for all tokens in these two fields. The query and any resultant hits have `typ=PERSON2`. 32 | - Organization names are in a single field. Where this contains at least 2 tokems a query is generated to search for all tokens in the name. The query and any resultant hits have `typ=ORGANIZATION`. 33 | - A numeric id field is carried through from the CSV to the query and the results, to facilitate integration with other systems. 34 | - Queries for the same name and `typ` are combined into a single query with multiple id values in `ExtRef.ids[]`. 35 | 36 | ## Build, Configuration and Running 37 | 38 | See the top level [README](../README.md). 
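
As a concrete illustration of the Synonyms section above, a minimal `synonyms.txt` covering the two initial mappings could look like the sketch below. It uses the one-way mapping form of the Solr synonym syntax read by `SolrSynonymParser`; the file actually configured at the `search.synonyms` location may differ.

```
# map "proprietary" to "pty" and "limited" to "ltd" at both index and search time
proprietary => pty
limited => ltd
```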
39 | 40 | -------------------------------------------------------------------------------- /dataFusion-search/build.sbt: -------------------------------------------------------------------------------- 1 | name := "dataFusion-search" 2 | 3 | libraryDependencies ++= Seq( 4 | "lucene-core", 5 | "lucene-analyzers-common", 6 | "lucene-queryparser", 7 | "lucene-highlighter" 8 | ).map("org.apache.lucene" % _ % "7.0.1") 9 | 10 | libraryDependencies ++= Seq( 11 | "com.google.guava" % "guava" % "18.0", // not "23.0", for compatability with search-service dependencies 12 | "commons-io" % "commons-io" % "2.5", 13 | "com.typesafe" % "config" % "1.3.1", 14 | "com.github.scopt" %% "scopt" % "3.5.0", 15 | "com.jsuereth" %% "scala-arm" % "2.0", 16 | "org.scalatest" %% "scalatest" % "3.0.0" % "test" 17 | ) 18 | 19 | com.github.retronym.SbtOneJar.oneJarSettings 20 | 21 | mainClass in Compile := Some("au.csiro.data61.dataFusion.search.Main") 22 | -------------------------------------------------------------------------------- /dataFusion-search/src/main/resources/application.conf: -------------------------------------------------------------------------------- 1 | search = { 2 | // See: https://lucene.apache.org/core/6_6_0/analyzers-common/org/apache/lucene/analysis/synonym/SolrSynonymParser.html 3 | synonyms = "synonyms.txt" 4 | synonyms = ${?SEARCH_SYNONYMS} 5 | 6 | docIndex = "docIndex" 7 | docIndex = ${?SEARCH_DOC_INDEX} 8 | 9 | metaIndex = "metaIndex" 10 | metaIndex = ${?SEARCH_META_INDEX} 11 | 12 | nerIndex = "nerIndex" 13 | nerIndex = ${?SEARCH_NER_INDEX} 14 | } 15 | -------------------------------------------------------------------------------- /dataFusion-search/src/main/resources/logback.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | search.log 11 | 12 | %d{yyyy-MM-dd HH:mm:ss.SSS} [%thread] %-5level %logger{36} - %msg%n 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | -------------------------------------------------------------------------------- /dataFusion-search/src/main/scala/au/csiro/data61/dataFusion/search/DocFreq.scala: -------------------------------------------------------------------------------- 1 | package au.csiro.data61.dataFusion.search 2 | 3 | import java.io.OutputStreamWriter 4 | import java.nio.charset.Charset 5 | 6 | import scala.io.Source 7 | 8 | import org.apache.lucene.index.{ DirectoryReader, MultiFields } 9 | 10 | import com.google.common.hash.{ BloomFilter, Funnels } 11 | import com.typesafe.scalalogging.Logger 12 | 13 | import DataFusionLucene.{ F_CONTENT, analyzer, docIndex } 14 | import LuceneUtil.{ directory, termIter, tokenIter } 15 | import Main.CliOption 16 | import au.csiro.data61.dataFusion.common.Data.{ PosQuery, T_ORGANIZATION } 17 | import au.csiro.data61.dataFusion.common.Data.ExtRef 18 | import au.csiro.data61.dataFusion.common.Data.JsonProtocol.posQueryCodec 19 | import au.csiro.data61.dataFusion.common.Timer 20 | import resource.managed 21 | import spray.json.{ pimpAny, pimpString } 22 | 23 | object DocFreq { 24 | private val log = Logger(getClass) 25 | 26 | /** 27 | * Output docFreq,term 28 | */ 29 | def writeDocFreqs(c: CliOption) = { 30 | for { 31 | r <- managed(DirectoryReader.open(directory(docIndex))) 32 | ti <- termIter(MultiFields.getFields(r).terms(F_CONTENT)) 33 | } { 34 | println(s"${ti.docFreq},${ti.term.utf8ToString}") 35 | } 36 | } 37 | 38 | def loadTermFilter(expectedInsertions: Int) = { 39 | val timer = Timer() 40 | val termFilter = 
BloomFilter.create(Funnels.stringFunnel(Charset.forName("UTF-8")), expectedInsertions) 41 | var n = 0 42 | for { 43 | r <- managed(DirectoryReader.open(directory(docIndex))) 44 | ti <- termIter(MultiFields.getFields(r).terms(F_CONTENT)) // we could filter this: /^[A-Z](?:['A-Z-]*[A-Z])$/, but there are not too many without filtering 45 | } { 46 | termFilter put ti.term.utf8ToString 47 | n += 1 48 | } 49 | log.info(s"loadTermSet: $n terms loaded in ${timer.elapsedSecs} secs. Max expectedInsertions = $expectedInsertions") 50 | if (n > expectedInsertions) log.error(s"Exceeded expectedInsertions = $expectedInsertions") 51 | termFilter 52 | } 53 | 54 | /** 55 | * true iff termFilter mightContain all the tokens in query 56 | */ 57 | def containsAllTokens(termFilter: BloomFilter[CharSequence], query: String) = { 58 | val tokens = tokenIter(analyzer, F_CONTENT, query).toList 59 | log.debug(s"containsAllTokens: analyzed tokens = ${tokens.toList}") 60 | tokens forall termFilter.mightContain // if false the filter definitely does not contain the term 61 | } 62 | 63 | def filterQuery(c: CliOption) = { 64 | val termFilter = loadTermFilter(c.maxTerms) 65 | for (w <- managed(new OutputStreamWriter(System.out, "UTF-8"))) { 66 | for (line <- Source.fromInputStream(System.in, "UTF-8").getLines) { 67 | val q = line.parseJson.convertTo[PosQuery] 68 | if (containsAllTokens(termFilter, q.extRef.name)) { 69 | w.write(line) 70 | w.write('\n') 71 | } else log.debug(s"filterQuery: not all tokens in index") 72 | } 73 | } 74 | } 75 | 76 | /** 77 | * read NER results, filter, write queries 78 | */ 79 | def nerToQuery(c: CliOption) = { 80 | val rNonName = "[^A-Za-z.'-]".r 81 | val rBigSpace = " {2,}".r 82 | def clean(q: String) = { 83 | val q2 = rNonName.replaceAllIn(q, " ").trim 84 | rBigSpace.replaceAllIn(q2, " ") 85 | } 86 | 87 | val termFilter = loadTermFilter(c.maxTerms) 88 | for (w <- managed(new OutputStreamWriter(System.out, "UTF-8"))) { 89 | for (line <- Source.fromInputStream(System.in, "UTF-8").getLines) { 90 | val query = clean(line.parseJson.toString) 91 | if (query.length >= 6 && containsAllTokens(termFilter, query)) { 92 | val q = PosQuery(ExtRef(query, List.empty), T_ORGANIZATION) 93 | w.write(q.toJson.compactPrint) 94 | w.write('\n') 95 | } else log.debug(s"nerToQuery: shorter than 6 chars or not all tokens in index") 96 | } 97 | } 98 | } 99 | 100 | } -------------------------------------------------------------------------------- /dataFusion-search/src/main/scala/au/csiro/data61/dataFusion/search/Indexer.scala: -------------------------------------------------------------------------------- 1 | package au.csiro.data61.dataFusion.search 2 | 3 | import scala.io.{ Codec, Source } 4 | 5 | import org.apache.lucene.index.IndexWriter 6 | 7 | import com.typesafe.config.ConfigFactory 8 | import com.typesafe.scalalogging.Logger 9 | 10 | import DataFusionLucene.{ docIndex, metaIndex, nerIndex } 11 | import DataFusionLucene.DFIndexing.{ ldoc2doc, lmeta2doc, lner2doc, mkIndexer } 12 | import LuceneUtil.directory 13 | import Main.CliOption 14 | import au.csiro.data61.dataFusion.common.Data.{ Doc, EMB_IDX_MAIN, IdEmbIdx, LDoc, LMeta, LNer } 15 | import au.csiro.data61.dataFusion.common.Data.JsonProtocol.docFormat 16 | import au.csiro.data61.dataFusion.common.Parallel.doParallel 17 | import resource.managed 18 | import spray.json.pimpString 19 | 20 | object Indexer { 21 | private val log = Logger(getClass) 22 | implicit val codec = Codec.UTF8 23 | 24 | def indexer(docIndexer: IndexWriter, metaIndexer: IndexWriter, 
nerIndexer: IndexWriter)(d: Doc): Unit = { 25 | val idMain = IdEmbIdx(d.id, EMB_IDX_MAIN) 26 | docIndexer.addDocument(LDoc(idMain, d.content.getOrElse(""), d.path)) 27 | for { 28 | (k, v) <- d.meta 29 | } metaIndexer.addDocument(LMeta(idMain, k, v)) 30 | for { 31 | n <- d.ner 32 | } nerIndexer.addDocument(LNer(idMain, n.posStr, n.posEnd, n.offStr, n.offEnd, n.text, n.typ, n.impl)) 33 | 34 | for { 35 | (e, embIdx) <- d.embedded.zipWithIndex 36 | } { 37 | val idEmb = IdEmbIdx(d.id, embIdx) 38 | docIndexer.addDocument(LDoc(idEmb, e.content.getOrElse(""), d.path)) 39 | for { 40 | (k, v) <- e.meta 41 | } metaIndexer.addDocument(LMeta(idEmb, k, v)) 42 | for { 43 | n <- e.ner 44 | } nerIndexer.addDocument(LNer(idEmb, n.posStr, n.posEnd, n.offStr, n.offEnd, n.text, n.typ, n.impl)) 45 | } 46 | } 47 | 48 | /** 49 | * Reads JSON Doc's from stdin (one per line) and indexes them. 50 | */ 51 | def run(c: CliOption) = { 52 | val conf = ConfigFactory.load.getConfig("search") 53 | 54 | for { 55 | docIndexer <- managed(mkIndexer(directory(docIndex))) 56 | metaIndexer <- managed(mkIndexer(directory(metaIndex))) 57 | nerIndexer <- managed(mkIndexer(directory(nerIndex))) 58 | } { 59 | val index: Doc => Unit = indexer(docIndexer, metaIndexer, nerIndexer) 60 | 61 | var count = 0 62 | val in: Iterator[String] = Source.fromInputStream(System.in).getLines.map { json => 63 | count += 1 64 | if (count % 1000 == 0) log.info(s"run.in: Queued $count docs ...") 65 | json 66 | } 67 | def work(json: String): Boolean = { 68 | index(json.parseJson.convertTo[Doc]) 69 | true 70 | } 71 | def out(more: Boolean): Unit = () 72 | 73 | doParallel(in, work, out, "", false, c.numWorkers) 74 | log.info(s"run: complete. Indexed $count docs") 75 | } 76 | } 77 | 78 | } -------------------------------------------------------------------------------- /dataFusion-search/src/main/scala/au/csiro/data61/dataFusion/search/LuceneUtil.scala: -------------------------------------------------------------------------------- 1 | package au.csiro.data61.dataFusion.search 2 | 3 | import java.io.{ Closeable, File } 4 | 5 | import scala.util.Try 6 | 7 | import org.apache.lucene.analysis.{ Analyzer, TokenStream } 8 | import org.apache.lucene.analysis.tokenattributes.CharTermAttribute 9 | import org.apache.lucene.document.Document 10 | import org.apache.lucene.index.{ DirectoryReader, PostingsEnum, Terms, TermsEnum } 11 | import org.apache.lucene.search.{ IndexSearcher, Query, ScoreDoc } 12 | import org.apache.lucene.store.{ Directory, FSDirectory } 13 | 14 | import com.typesafe.scalalogging.Logger 15 | 16 | import au.csiro.data61.dataFusion.common.Timer 17 | import org.apache.lucene.analysis.TokenFilter 18 | 19 | 20 | 21 | /** 22 | * Generic Lucene indexing and searching. 
23 | * 24 | * simplified from: https://github.csiro.au/bac003/social-watch/blob/master/analytics/src/main/scala/org/t3as/socialWatch/analytics/LuceneUtil.scala 25 | */ 26 | object LuceneUtil { 27 | private val log = Logger(getClass) 28 | 29 | def tokenIter(ts: TokenStream): Iterator[String] = { 30 | ts.reset 31 | Iterator.continually { 32 | val more = ts.incrementToken 33 | if (!more) { 34 | ts.end 35 | ts.close 36 | // log.debug("tokenIter: TokenStream closed") 37 | } 38 | more 39 | }.takeWhile(identity).map(_ => ts.getAttribute(classOf[CharTermAttribute]).toString) 40 | } 41 | 42 | def tokenIter(analyzer: Analyzer, fieldName: String, text: String): Iterator[String] 43 | = tokenIter(analyzer.tokenStream(fieldName, text)) 44 | 45 | def directory(indexDir: File) = FSDirectory.open(indexDir.toPath) 46 | 47 | /** unsafe - returns the same TermsEnum but repositioned each iteration */ 48 | def termIter(terms: Terms): Iterator[TermsEnum] = { 49 | val ti = terms.iterator 50 | Iterator.continually(ti.next).takeWhile(_ != null).map(_ => ti) 51 | } 52 | 53 | /** unsafe - returns the same PostingsEnum but repositioned each iteration. Int value is position (index of term/word in field). */ 54 | def postIter(p: PostingsEnum): Iterator[(Int, PostingsEnum)] = { 55 | p.nextDoc 56 | Iterator.range(0, p.freq).map { _ => 57 | val pos = p.nextPosition 58 | (pos, p) 59 | } 60 | } 61 | 62 | /** 63 | * TokenFilter that removes all trailing chars after the last letter or digit. 64 | * Based on: org.apache.lucene.analysis.en.EnglishPossessiveFilter. 65 | */ 66 | class TrailingPunctuationFilter(in: TokenStream) extends TokenFilter(in) { 67 | val termAtt = addAttribute(classOf[CharTermAttribute]) 68 | 69 | override def incrementToken: Boolean = { 70 | if (!in.incrementToken()) { 71 | return false; 72 | } 73 | val buf = termAtt.buffer 74 | val len = termAtt.length 75 | 76 | val lastAlphaNum = { 77 | var last = -1 78 | var i = len - 1 79 | while (i >= 0 && last == -1) { 80 | if (Character.isLetterOrDigit(buf(i))) last = i 81 | i -= 1 82 | } 83 | last 84 | } 85 | 86 | if (lastAlphaNum != -1) termAtt.setLength(lastAlphaNum + 1) 87 | return true; 88 | } 89 | } 90 | 91 | class Searcher[Hit, Results]( 92 | directory: Directory, 93 | toHit: (ScoreDoc, Document) => Hit, // convert score and map of fields to Hit 94 | toResults: (Int, Float, Seq[Hit], Option[String]) => Results // convert totalHits, elapsedSecs, Seq[Hit], Option[error] to Results 95 | ) extends Closeable { 96 | val log = Logger(getClass) 97 | 98 | val searcher = open 99 | protected def open = new IndexSearcher(DirectoryReader.open(directory)) 100 | 101 | log.debug(s"Searcher: numDocs = ${searcher.getIndexReader.numDocs}") 102 | 103 | def search(q: Query, numHits: Int = 20) = { 104 | val timer = Timer() 105 | 106 | val result = for { 107 | topDocs <- Try { 108 | searcher.search(q, numHits) 109 | } 110 | hits <- Try { 111 | topDocs.scoreDocs map { scoreDoc => toHit(scoreDoc, searcher.doc(scoreDoc.doc)) } 112 | } 113 | } yield toResults(topDocs.totalHits.toInt, timer.elapsedSecs.toFloat, hits, None) 114 | 115 | result.recover { case e => toResults(0, timer.elapsedSecs.toFloat, List(), Some(e.getMessage)) }.get 116 | } 117 | 118 | def close = searcher.getIndexReader.close 119 | } 120 | 121 | } -------------------------------------------------------------------------------- /dataFusion-search/src/main/scala/au/csiro/data61/dataFusion/search/Main.scala: -------------------------------------------------------------------------------- 1 | package 
au.csiro.data61.dataFusion.search 2 | 3 | import java.io.File 4 | 5 | import scala.util.control.NonFatal 6 | 7 | import com.typesafe.scalalogging.Logger 8 | 9 | object Main { 10 | private val log = Logger(getClass) 11 | 12 | case class CliOption(output: File, index: Boolean, searchJson: Boolean, searchCsv: Boolean, csvDelim: Char, csvFields: Seq[String], csvPersonWith2Names: Boolean, minScore: Float, docFreq: Boolean, export: Boolean, filterQueryOnly: Boolean, filterQuery: Boolean, maxTerms: Int, nerToQuery: Boolean, slop: Int, numWorkers: Int) 13 | 14 | val defaultCliOption = CliOption(new File("hits.json"), false, false, false, '\t', Seq("STRCTRD_FMLY_NM", "STRCTRD_GVN_NM", "STRCTRD_OTHR_GVN_NM", "SEX_CD", "USTRCTRD_FULL_NM", "CLNT_INTRNL_ID"), true, 3.5f, false, false, false, true, 10000000, false, 0, Runtime.getRuntime.availableProcessors) 15 | 16 | val parser = new scopt.OptionParser[CliOption]("search") { 17 | head("search", "0.x") 18 | opt[File]("output") action { (v, c) => 19 | c.copy(output = v) 20 | } text (s"output JSON file, (default ${defaultCliOption.output.getPath})") 21 | opt[Unit]("index") action { (_, c) => 22 | c.copy(index = true, numWorkers = Math.min(12, c.numWorkers)) // slower with more than 12 workers, if you really want more put --index before --numWorkers 23 | } text (s"create Lucene indices from JSON input (default ${defaultCliOption.index})") 24 | opt[Unit]("searchJson") action { (_, c) => 25 | c.copy(searchJson = true, numWorkers = Math.min(25, c.numWorkers)) // slower with more than 25 workers, if you really want more put --searchJson before --numWorkers 26 | } text (s"search with JSON queries on stdin (default ${defaultCliOption.searchJson})") 27 | opt[Unit]("searchCsv") action { (_, c) => 28 | c.copy(searchCsv = true, numWorkers = Math.min(25, c.numWorkers)) // slower with more than 25 workers, if you really want more put --searchCsv before --numWorkers 29 | } text (s"search with CSV queries on stdin (default ${defaultCliOption.searchCsv})") 30 | opt[String]("csvDelim") action { (v, c) => 31 | c.copy(csvDelim = v.headOption.getOrElse(defaultCliOption.csvDelim)) 32 | } text (s"CSV field delimeter (default ${if (defaultCliOption.csvDelim == '\t') "tab" else defaultCliOption.csvDelim.toString})") 33 | opt[Seq[String]]("csvFields") action { (v, c) => 34 | c.copy(csvFields = v) 35 | } validate { v => 36 | if (v.size == 6) success 37 | else failure("6 field names are required") 38 | } text (s"CSV field names (6) for person's family, first given and other names, record type ('BUS' for organization or gender? 
for a person), business name, id (default ${defaultCliOption.csvFields.toList})") 39 | opt[Boolean]("csvPersonWith2Names") action { (v, c) => 40 | c.copy(csvPersonWith2Names = v) 41 | } text (s"CSV used to generate 2 name (omitting middle name) searches for people in addition to 3 name search (default ${defaultCliOption.csvPersonWith2Names})") 42 | opt[Double]("minScore") action { (v, c) => 43 | c.copy(minScore = v.toFloat) 44 | } text (s"minScore queries with a (IDF) score below this are skipped, (default ${defaultCliOption.minScore})") 45 | opt[Unit]("docFreq") action { (_, c) => 46 | c.copy(docFreq = true) 47 | } text (s"output term document frequencies from index as CSV (default ${defaultCliOption.docFreq})") 48 | opt[Unit]("export") action { (_, c) => 49 | c.copy(export = true) 50 | } text (s"output the stored JSON for each doc (default ${defaultCliOption.export})") 51 | opt[Unit]("filterQueryOnly") action { (_, c) => 52 | c.copy(filterQueryOnly = true) 53 | } text (s"filter Query JSON from stdin to stdout, outputing only lines with all query terms most likely in the index (default ${defaultCliOption.filterQueryOnly})") 54 | opt[Boolean]("filterQuery") action { (v, c) => 55 | c.copy(filterQuery = v) 56 | } text (s"search CLI skips search if any query term is definitely not in the index (default ${defaultCliOption.filterQuery})") 57 | opt[Int]("maxTerms") action { (v, c) => 58 | c.copy(maxTerms = v) 59 | } text (s"maxTerms for Bloom Filter used with filterQuery, (default ${defaultCliOption.maxTerms})") 60 | opt[Unit]("nerToQuery") action { (_, c) => 61 | c.copy(nerToQuery = true) 62 | } text (s"filter JSON names from stdin to stdout, outputing queries only for lines with all specified query terms in the index (default ${defaultCliOption.filterQuery})") 63 | opt[Int]("slop") action { (v, c) => 64 | c.copy(slop = v) 65 | } text (s"slop for posQuery, (default ${defaultCliOption.slop})") 66 | opt[Int]("numWorkers") action { (v, c) => 67 | c.copy(numWorkers = v) 68 | } text (s"numWorkers for CLI queries, (default ${defaultCliOption.numWorkers} the number of CPUs)") 69 | help("help") text ("prints this usage text") 70 | } 71 | 72 | def main(args: Array[String]): Unit = { 73 | try { 74 | parser.parse(args, defaultCliOption).foreach { c => 75 | log.info(s"main: cliOptions = $c") 76 | if (c.index) Indexer.run(c) 77 | else if (c.docFreq) DocFreq.writeDocFreqs(c) 78 | else if (c.filterQueryOnly) DocFreq.filterQuery(c) 79 | else if (c.nerToQuery) DocFreq.nerToQuery(c) 80 | else if (c.export) Search.cliExportDocIds(c) 81 | else if (c.searchJson || c.searchCsv) Search.cliPosDocSearch(c) 82 | else log.info("Nothing to do. 
Try --help") 83 | } 84 | } catch { 85 | case NonFatal(e) => log.error("main:", e) 86 | } 87 | } 88 | 89 | } -------------------------------------------------------------------------------- /dataFusion-search/src/test/resources/logback-test.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | %d{yyyy-MM-dd HH:mm:ss.SSS} [%thread] %-5level %logger{36} - %msg%n 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | -------------------------------------------------------------------------------- /dataFusion-search/src/test/scala/au/csiro/data61/dataFusion/search/DataFusionLuceneTest.scala: -------------------------------------------------------------------------------- 1 | package au.csiro.data61.dataFusion.search 2 | 3 | import org.apache.lucene.index.DirectoryReader 4 | import org.apache.lucene.search.IndexSearcher 5 | import org.apache.lucene.store.RAMDirectory 6 | import org.scalatest.{ FlatSpec, Matchers } 7 | 8 | import com.typesafe.scalalogging.Logger 9 | 10 | import DataFusionLucene.{ F_CONTENT, analyzer, synonymAnalyzer } 11 | import DataFusionLucene.DFIndexing.{ ldoc2doc, mkIndexer } 12 | import DataFusionLucene.DFSearching.PosDocSearch.searchSpans 13 | import LuceneUtil.tokenIter 14 | import au.csiro.data61.dataFusion.common.Data.{ ExtRef, IdEmbIdx, LDoc, PosQuery, T_ORGANIZATION, T_PERSON } 15 | 16 | class DataFusionLuceneTest extends FlatSpec with Matchers { 17 | val log = Logger(getClass) 18 | 19 | "SynonymAnalyzer" should "work" in { 20 | // depends on mapping: limited => ltd in synonyms.txt 21 | tokenIter(synonymAnalyzer, F_CONTENT, "AA AA Pty. Limited").mkString(" ") should be ("aa aa pty ltd") 22 | } 23 | 24 | val doc1 = "doc1: Sarah Jones\nAA AA Pty. Limited" 25 | val doc2 = "doc2: John Jones\nMs. AA\nMr. AA BB AA" 26 | val doc3 = "doc3: @ PTY. LIMITED is a subsidiary of $ PTY LIMITED" 27 | 28 | def mkTestSearcher = { 29 | val dir = new RAMDirectory 30 | val xer = mkIndexer(dir) 31 | for { 32 | (content, idx) <- Seq(doc1, doc2, doc3).zipWithIndex 33 | } xer.addDocument(LDoc(IdEmbIdx(idx, -1), content, "path")) 34 | xer.close 35 | new IndexSearcher(DirectoryReader.open(dir)) 36 | } 37 | 38 | "SpanQuery" should "provide positions" in { 39 | val searcher = mkTestSearcher 40 | log.debug(s"numDocs = ${searcher.getIndexReader.numDocs}") 41 | 42 | { 43 | val q = PosQuery(ExtRef("AA AA Proprietary Ltd.", List(1L)), T_ORGANIZATION) 44 | val x = searchSpans(searcher, 0, q, 0.0f) 45 | log.debug(s"SpanQuery: x = $x") 46 | x.stats.totalHits should be(1) 47 | x.hits.size should be(1) 48 | x.hits.head.posInfos.size should be(1) 49 | val pi = x.hits.head.posInfos.head 50 | doc1.substring(pi.offStr, pi.offEnd) should be ("AA AA Pty. Limited") 51 | } 52 | 53 | // TODO: this is known to fail, "@ PTY LTD" and "$ PTY LTD" are tokenized to "PTY LTD" 54 | // We could use WhitespaceTokenizer with LuceneUtil.TrailingPunctuationFilter to fix, 55 | // but the current StandardTokenizer might be addressing issues we don't know about so this might cause other issues. 
56 | { 57 | val q = PosQuery(ExtRef("$ Proprietary Ltd.", List(1L)), T_ORGANIZATION) 58 | val tokens = tokenIter(analyzer, F_CONTENT, q.extRef.name).toList 59 | log.debug(s"SpanQuery: tokens = $tokens") 60 | tokens.size should be(3) 61 | 62 | val x = searchSpans(searcher, 0, q, 0.0f) 63 | log.debug(s"SpanQuery: x = $x") 64 | x.stats.totalHits should be(1) 65 | x.hits.size should be(1) 66 | x.hits.head.posInfos.size should be(1) 67 | val pi = x.hits.head.posInfos.head 68 | doc3.substring(pi.offStr, pi.offEnd) should be ("$ PTY LIMITED") 69 | } 70 | 71 | { 72 | val q = PosQuery(ExtRef("Jones Sarah", List(2L)), T_PERSON) 73 | val x = searchSpans(searcher, 0, q, 0.0f) 74 | log.debug(s"SpanQuery: x = $x") 75 | x.stats.totalHits should be(1) 76 | x.hits.size should be(1) 77 | x.hits.head.posInfos.size should be(1) 78 | 79 | val pi = x.hits.head.posInfos.head 80 | doc1.substring(pi.offStr, pi.offEnd) should be ("Sarah Jones") 81 | } 82 | 83 | { 84 | val q = PosQuery(ExtRef("AA AA", List(1L)), T_PERSON) 85 | val x = searchSpans(searcher, 0, q, 0.0f) 86 | log.debug(s"SpanQuery: x = $x") 87 | x.stats.totalHits should be(1) 88 | x.hits.size should be(1) 89 | x.hits.head.posInfos.size should be(1) 90 | val pi = x.hits.head.posInfos.head 91 | doc1.substring(pi.offStr, pi.offEnd) should be ("AA AA") 92 | } 93 | 94 | { 95 | val q = PosQuery(ExtRef("AA AA BB", List(1L)), T_PERSON) 96 | val x = searchSpans(searcher, 0, q, 0.0f) 97 | log.debug(s"SpanQuery: x = $x") 98 | x.stats.totalHits should be(1) 99 | x.hits.size should be(1) 100 | x.hits.head.posInfos.size should be(1) 101 | val pi = x.hits.head.posInfos.head 102 | doc2.substring(pi.offStr, pi.offEnd) should be ("AA BB AA") 103 | } 104 | 105 | 106 | { 107 | val q = PosQuery(ExtRef("AA AA CC", List(1L)), T_PERSON) 108 | val x = searchSpans(searcher, 0, q, 0.0f) 109 | log.debug(s"SpanQuery: x = $x") 110 | x.stats.totalHits should be(0) 111 | } 112 | 113 | // // TODO: this is known to fail, single term search is not working 114 | // { 115 | // val q = PosQuery(ExtRef("John", List(1L)), T_PERSON) 116 | // val x = searchSpans(searcher, 0, q, 0.0f) 117 | // log.debug(s"SpanQuery: x = $x") 118 | // x.stats.totalHits should be(1) 119 | // x.hits.size should be(1) 120 | // x.hits.head.posInfos.size should be(1) 121 | // val pi = x.hits.head.posInfos.head 122 | // doc2.substring(pi.offStr, pi.offEnd) should be ("JOHN") 123 | // } 124 | } 125 | 126 | } -------------------------------------------------------------------------------- /dataFusion-search/src/test/scala/au/csiro/data61/dataFusion/search/JsonTest.scala: -------------------------------------------------------------------------------- 1 | package au.csiro.data61.dataFusion.search 2 | 3 | import org.scalatest.{ FlatSpec, Matchers } 4 | 5 | import com.typesafe.scalalogging.Logger 6 | 7 | import au.csiro.data61.dataFusion.common.Data.{ DHits, EMB_IDX_MAIN, IdEmbIdx, LDoc, Stats } 8 | import au.csiro.data61.dataFusion.common.Data.JsonProtocol.dHitsCodec 9 | import spray.json.{ pimpAny, pimpString } 10 | 11 | class JsonTest extends FlatSpec with Matchers { 12 | val log = Logger(getClass) 13 | 14 | val hits = DHits(Stats(1, 0.5f), List((12.3f, LDoc(IdEmbIdx(1, EMB_IDX_MAIN), "some content", "a/path"))), None) 15 | 16 | "DocHits" should "ser/deserialize" in { 17 | val json = hits.toJson.compactPrint 18 | log.debug(s"json = $json") 19 | val d2 = json.parseJson.convertTo[DHits] 20 | d2 should be(hits) 21 | } 22 | 23 | } -------------------------------------------------------------------------------- 
/dataFusion-search/src/test/scala/au/csiro/data61/dataFusion/search/SearchTest.scala: -------------------------------------------------------------------------------- 1 | package au.csiro.data61.dataFusion.search 2 | 3 | import org.scalatest.{ FlatSpec, Matchers } 4 | 5 | import com.typesafe.scalalogging.Logger 6 | 7 | import Main.defaultCliOption 8 | import Search.inCsv 9 | import au.csiro.data61.dataFusion.common.Data.{ ExtRef, PosQuery, T_ORGANIZATION, T_PERSON, T_PERSON2 } 10 | 11 | class SearchTest extends FlatSpec with Matchers { 12 | val log = Logger(getClass) 13 | 14 | "inCsv" should "parse CSV" in { 15 | val lines = Seq( 16 | "Clnt_Intrnl_Id|SEX_CD|STRCTRD_FMLY_NM|STRCTRD_GVN_NM|STRCTRD_OTHR_GVN_NM|USTRCTRD_FULL_NM", 17 | "1|M|BLOGGS|FREDERICK|A|", 18 | "2|BUS||||COSMIC HOLDINGS INCORPORATED", 19 | ) 20 | val qs = inCsv(defaultCliOption.copy(csvDelim = '|'), lines.iterator).toList 21 | log.debug(s"qs = $qs") 22 | val x1 = PosQuery(ExtRef("FREDERICK A BLOGGS", List(1L)), T_PERSON) 23 | val x2 = PosQuery(ExtRef("FREDERICK BLOGGS", List(1L)), T_PERSON2) 24 | val x3 = PosQuery(ExtRef("COSMIC HOLDINGS INCORPORATED", List(2L)), T_ORGANIZATION) 25 | qs.toSet should be(Set(x1, x2, x3)) // inCsv is parallelized so results not ordered 26 | } 27 | 28 | } -------------------------------------------------------------------------------- /dataFusion-search/synonyms.txt: -------------------------------------------------------------------------------- 1 | # Grammar: https://lucene.apache.org/core/6_6_0/analyzers-common/index.html?org/apache/lucene/analysis/synonym/SolrSynonymParser.html 2 | # e.g. i-pod, i pod => ipod 3 | 4 | proprietary => pty 5 | limited => ltd 6 | -------------------------------------------------------------------------------- /dataFusion-tika-service/README.md: -------------------------------------------------------------------------------- 1 | # dataFusion-tika-service 2 | 3 | ## Introduction 4 | 5 | This project provides a [RESTful](https://en.wikipedia.org/wiki/Representational_state_transfer) web service based on dataFusion-tika. 6 | 7 | ## Build, Configuration and Running 8 | 9 | See the top level [README](../README.md). 
10 | 11 | Example: 12 | 13 | # run web service 14 | java -jar target/scala-2.12/datafusion-tika_2.12-0.2-SNAPSHOT-one-jar.jar 15 | # get swagger description (useful when loaded into Swagger UI) 16 | curl http://localhost:9998/api-docs/swagger.json 17 | # process a file 18 | curl --upload-file src/test/resources/exampleData/PDF002.pdf http://localhost:9998/tika?path=PDF002.pdf 19 | -------------------------------------------------------------------------------- /dataFusion-tika-service/build.sbt: -------------------------------------------------------------------------------- 1 | name := "dataFusion-tika-service" 2 | 3 | // the one-jar classloader helpfully reports on conflicting classes (same package & name) from different jars 4 | // (including whether the byte-code differs) and this has been used to set the following exclusions: 5 | 6 | libraryDependencies ++= Seq( 7 | "com.github.swagger-akka-http" %% "swagger-akka-http" % "0.9.1" exclude("javax.ws.rs", "jsr311-api"), // replaced by javax.ws.rs-api 8 | "com.typesafe.akka" %% "akka-http-spray-json" % "10.0.7", 9 | "ch.megard" %% "akka-http-cors" % "0.2.1", 10 | 11 | "com.github.scopt" %% "scopt" % "3.7.0", 12 | "com.jsuereth" %% "scala-arm" % "2.0", 13 | "org.scalatest" %% "scalatest" % "3.0.4" % "test" 14 | ) 15 | 16 | com.github.retronym.SbtOneJar.oneJarSettings 17 | 18 | mainClass in Compile := Some("au.csiro.data61.dataFusion.tika.service.Main") 19 | -------------------------------------------------------------------------------- /dataFusion-tika-service/src/main/resources/application.conf: -------------------------------------------------------------------------------- 1 | http = { 2 | host = "0.0.0.0" 3 | port = 9998 4 | 5 | host = ${?TIKA_HTTP_HOST} 6 | port = ${?TIKA_HTTP_PORT} 7 | } -------------------------------------------------------------------------------- /dataFusion-tika-service/src/main/resources/logback.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | %d{yyyy-MM-dd HH:mm:ss.SSS} [%thread] %-5level %logger{36} - %msg%n 6 | 7 | 8 | 9 | 10 | tika-service.log 11 | 12 | %d{yyyy-MM-dd HH:mm:ss.SSS} [%thread] %-5level %logger{36} - %msg%n 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | -------------------------------------------------------------------------------- /dataFusion-tika-service/src/test/resources/logback-test.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | %d{yyyy-MM-dd HH:mm:ss.SSS} [%thread] %-5level %logger{36} - %msg%n 6 | 7 | 8 | 9 | 10 | tika-service-test.log 11 | 12 | %d{yyyy-MM-dd HH:mm:ss.SSS} [%thread] %-5level %logger{36} - %msg%n 13 | 14 | 15 | 16 | 17 | 18 | 23 | 24 | 25 | 26 | 27 | 28 | -------------------------------------------------------------------------------- /dataFusion-tika/README.md: -------------------------------------------------------------------------------- 1 | # dataFusion-tika 2 | 3 | ## Introduction 4 | 5 | This project provides a library and multi-threaded CLI (command line interface) for bulk processing. 
It provides: 6 | 7 | - access to [Apache Tika](https://tika.apache.org/) customized to OCR images embedded in PDFs (including TIFF, JPEG2000 and JBIG2, which are not handled by Tika out-of-the-box); 8 | - some cleaning and filtering of Tika metadata; 9 | - augmentation of the metadata with the language of the text (`language-code` and `language-prob`) and a score for how closely the text matches a simple model for English sentences `english-score`; and 10 | - results in the [Document JSON format](../dataFusion-common#document-json-format). 11 | 12 | ## Build, Configuration and Running 13 | 14 | See the top level [README](../README.md). 15 | 16 | Example: 17 | 18 | # CLI processing, with one file path per input line 19 | ls -1 src/test/resources/exampleData/PDF00{2,3}* | \ 20 | java -jar target/scala-2.12/datafusion-tika_2.12-0.2-SNAPSHOT-one-jar.jar 21 | -------------------------------------------------------------------------------- /dataFusion-tika/build.sbt: -------------------------------------------------------------------------------- 1 | name := "dataFusion-tika" 2 | 3 | // the one-jar classloader helpfully reports on conflicting classes (same package & name) from different jars 4 | // (including whether the byte-code differs) and this has been used to set the following exclusions: 5 | // jj2000 is older fork of jai-imageio-jpeg2000 6 | // 7 | // tika-parsers and sentiment-analysis-parser both contain org/apache/tika/parser/sentiment/analysis/SentimentParser 8 | // I guess the tika-parsers one is newer but still relies on other code in sentiment-analysis-parser? 9 | // We don't use it so exclude sentiment-analysis-parser to avoid the conflict. 10 | // 11 | // The junrar and jcip-annotations dependencies of tika-parsers have dubious licenses, so these are excluded. 
12 | // An alternative jcip-annotations is used (no alternative for unrar) 13 | 14 | 15 | libraryDependencies ++= Seq( 16 | "org.apache.tika" % "tika-parsers" % "1.16" exclude("edu.ucar", "jj2000") exclude("edu.usc.ir", "sentiment-analysis-parser") exclude("com.github.junrar", "junrar") exclude("net.jcip", "jcip-annotations"), 17 | "com.github.stephenc.jcip" % "jcip-annotations" % "1.0-1", 18 | "com.github.jai-imageio" % "jai-imageio-core" % "1.3.1", // add PDFBox support for TIFF 19 | "com.github.jai-imageio" % "jai-imageio-jpeg2000" % "1.3.0", // add PDFBox support for jpeg2000 20 | "com.levigo.jbig2" % "levigo-jbig2-imageio" % "2.0", // add PDFBox support for jbig2 21 | "org.xerial" % "sqlite-jdbc" % "3.19.3", // add to 'parse' sqlite files and embedded files 22 | "com.optimaize.languagedetector" % "language-detector" % "0.6", // tika-langdetect-1.15 dependency is 0.5, but we use language-detector directly, not via tika-langdetect 23 | "com.typesafe" % "config" % "1.3.1", 24 | "com.github.scopt" %% "scopt" % "3.7.0", 25 | "com.jsuereth" %% "scala-arm" % "2.0", 26 | "org.scalatest" %% "scalatest" % "3.0.4" % "test" 27 | ) 28 | 29 | com.github.retronym.SbtOneJar.oneJarSettings 30 | 31 | mainClass in Compile := Some("au.csiro.data61.dataFusion.tika.Main") 32 | -------------------------------------------------------------------------------- /dataFusion-tika/src/main/resources/META-INF/services/javax.imageio.spi.ImageReaderSpi: -------------------------------------------------------------------------------- 1 | # from jai-imageio-core for TIFF support 2 | 3 | #com.github.jaiimageio.impl.plugins.jpeg.CLibJPEGImageReaderSpi 4 | #com.github.jaiimageio.impl.plugins.png.CLibPNGImageReaderSpi 5 | #com.github.jaiimageio.impl.plugins.jpeg2000.J2KImageReaderSpi 6 | #com.github.jaiimageio.impl.plugins.jpeg2000.J2KImageReaderCodecLibSpi 7 | com.github.jaiimageio.impl.plugins.wbmp.WBMPImageReaderSpi 8 | com.github.jaiimageio.impl.plugins.bmp.BMPImageReaderSpi 9 | com.github.jaiimageio.impl.plugins.pcx.PCXImageReaderSpi 10 | com.github.jaiimageio.impl.plugins.pnm.PNMImageReaderSpi 11 | com.github.jaiimageio.impl.plugins.raw.RawImageReaderSpi 12 | com.github.jaiimageio.impl.plugins.tiff.TIFFImageReaderSpi 13 | 14 | 15 | # from jai-imageio-jpeg2000 for jpeg2000 support 16 | 17 | com.github.jaiimageio.jpeg2000.impl.J2KImageReaderSpi 18 | 19 | 20 | # from levigo-jbig2-imageio for jbig2 support 21 | 22 | com.levigo.jbig2.JBIG2ImageReaderSpi 23 | -------------------------------------------------------------------------------- /dataFusion-tika/src/main/resources/META-INF/services/javax.imageio.spi.ImageWriterSpi: -------------------------------------------------------------------------------- 1 | # from jai-imageio-core for TIFF support 2 | 3 | #com.github.jaiimageio.impl.plugins.jpeg.CLibJPEGImageWriterSpi 4 | #com.github.jaiimageio.impl.plugins.png.CLibPNGImageWriterSpi 5 | #com.github.jaiimageio.impl.plugins.jpeg2000.J2KImageWriterSpi 6 | #com.github.jaiimageio.impl.plugins.jpeg2000.J2KImageWriterCodecLibSpi 7 | com.github.jaiimageio.impl.plugins.wbmp.WBMPImageWriterSpi 8 | com.github.jaiimageio.impl.plugins.bmp.BMPImageWriterSpi 9 | com.github.jaiimageio.impl.plugins.gif.GIFImageWriterSpi 10 | com.github.jaiimageio.impl.plugins.pcx.PCXImageWriterSpi 11 | com.github.jaiimageio.impl.plugins.pnm.PNMImageWriterSpi 12 | com.github.jaiimageio.impl.plugins.raw.RawImageWriterSpi 13 | com.github.jaiimageio.impl.plugins.tiff.TIFFImageWriterSpi 14 | 15 | 16 | # from jai-imageio-jpeg2000 for jpeg2000 support 17 | 18 | 
com.github.jaiimageio.jpeg2000.impl.J2KImageWriterSpi 19 | 20 | # no jbig2 support 21 | -------------------------------------------------------------------------------- /dataFusion-tika/src/main/resources/application.conf: -------------------------------------------------------------------------------- 1 | tika { 2 | // switch to command line args 3 | // timeout = 600 4 | // timeout = ${?TIKA_TIMEOUT} 5 | } 6 | -------------------------------------------------------------------------------- /dataFusion-tika/src/main/resources/logback.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | %d{yyyy-MM-dd HH:mm:ss.SSS} [%thread] %-5level %logger{36} - %msg%n 6 | 7 | 8 | 9 | 10 | tika.log 11 | 12 | %d{yyyy-MM-dd HH:mm:ss.SSS} [%thread] %-5level %logger{36} - %msg%n 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | -------------------------------------------------------------------------------- /dataFusion-tika/src/main/resources/org/apache/tika/parser/ocr/TesseractOCRConfig.properties: -------------------------------------------------------------------------------- 1 | # Licensed to the Apache Software Foundation (ASF) under one or more 2 | # contributor license agreements. See the NOTICE file distributed with 3 | # this work for additional information regarding copyright ownership. 4 | # The ASF licenses this file to You under the Apache License, Version 2.0 5 | # (the "License"); you may not use this file except in compliance with 6 | # the License. You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | # Tesseract properties 17 | tesseractPath= 18 | language=eng 19 | pageSegMode=1 20 | maxFileSizeToOcr=2147483647 21 | minFileSizeToOcr=1024 22 | timeout=300 23 | #txt or hocr 24 | outputType=txt 25 | preserveInterwordSpacing=true 26 | 27 | # properties for image processing 28 | # to enable processing, set enableImageProcessing to 1 29 | enableImageProcessing=1 30 | ImageMagickPath= 31 | density=300 32 | depth=4 33 | colorspace=gray 34 | filter=triangle 35 | resize=200 -------------------------------------------------------------------------------- /dataFusion-tika/src/main/resources/org/apache/tika/parser/ocr/rotation.py: -------------------------------------------------------------------------------- 1 | """ 2 | Licensed to the Apache Software Foundation (ASF) under one or more 3 | contributor license agreements. See the NOTICE file distributed with 4 | this work for additional information regarding copyright ownership. 5 | The ASF licenses this file to You under the Apache License, Version 2.0 6 | (the "License"); you may not use this file except in compliance with 7 | the License. You may obtain a copy of the License at 8 | 9 | http://www.apache.org/licenses/LICENSE-2.0 10 | 11 | Unless required by applicable law or agreed to in writing, software 12 | distributed under the License is distributed on an "AS IS" BASIS, 13 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | See the License for the specific language governing permissions and 15 | limitations under the License. 
16 | """ 17 | 18 | from __future__ import division, print_function 19 | import numpy 20 | from skimage.transform import radon 21 | from PIL import Image 22 | from numpy import asarray, mean, array, blackman 23 | from numpy.fft import rfft 24 | import matplotlib 25 | matplotlib.use("Agg") 26 | import matplotlib.pyplot as plt 27 | from matplotlib.mlab import rms_flat 28 | 29 | import sys 30 | import getopt 31 | 32 | def main(argv): 33 | filename = '' 34 | 35 | if len(sys.argv) < 3: 36 | print('Usage: rotation.py -f ') 37 | sys.exit() 38 | try: 39 | opts, args = getopt.getopt(argv,"hf:",["file="]) 40 | except getopt.GetoptError: 41 | print('rotation.py -f ') 42 | sys.exit(2) 43 | for opt, arg in opts: 44 | if opt == '-h': 45 | print('Usage: rotation.py -f ') 46 | sys.exit() 47 | elif opt in ("-f", "--file"): 48 | filename = arg 49 | 50 | try: 51 | from parabolic import parabolic 52 | 53 | def argmax(x): 54 | return parabolic(x, numpy.argmax(x))[0] 55 | except ImportError: 56 | from numpy import argmax 57 | 58 | # Load file, converting to grayscale 59 | I = asarray(Image.open(filename).convert('L')) 60 | I = I - mean(I) # Demean; make the brightness extend above and below zero 61 | 62 | # Do the radon transform and display the result 63 | sinogram = radon(I) 64 | 65 | # Find the RMS value of each row and find "busiest" rotation, 66 | # where the transform is lined up perfectly with the alternating dark 67 | # text and white lines 68 | r = array([rms_flat(line) for line in sinogram.transpose()]) 69 | rotation = argmax(r) 70 | 71 | print('{:.2f}'.format(-(90-rotation))) 72 | 73 | if __name__ == "__main__": 74 | main(sys.argv[1:]) 75 | # print('{:.2f}'.format(0)) 76 | -------------------------------------------------------------------------------- /dataFusion-tika/src/main/resources/org/apache/tika/parser/pdf/PDFParser.properties: -------------------------------------------------------------------------------- 1 | # Licensed to the Apache Software Foundation (ASF) under one or more 2 | # contributor license agreements. See the NOTICE file distributed with 3 | # this work for additional information regarding copyright ownership. 4 | # The ASF licenses this file to You under the Apache License, Version 2.0 5 | # (the "License"); you may not use this file except in compliance with 6 | # the License. You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | enableAutoSpace true 17 | extractAnnotationText true 18 | sortByPosition false 19 | suppressDuplicateOverlappingText false 20 | extractAcroFormContent true 21 | extractInlineImages true 22 | extractUniqueInlineImagesOnly true 23 | checkExtractAccessPermission false 24 | allowExtractionForAccessibility true 25 | ifXFAExtractOnlyXFA false 26 | catchIntermediateIOExceptions true 27 | #options: no_ocr, ocr_only, ocr_and_text_extraction 28 | ocrStrategy no_ocr 29 | #dots per inch for the ocr rendering of the page image 30 | ocrDPI 300 31 | #if you request tif, make sure you have imageio jars on your classpath! 
32 | ocrImageFormatName png 33 | #options: argb, binary, gray, rgb 34 | ocrImageType gray 35 | #scale to use when rendering a page image for OCR 36 | ocrImageScale 2.0 37 | -------------------------------------------------------------------------------- /dataFusion-tika/src/main/scala/au/csiro/data61/dataFusion/tika/LangDetect.scala: -------------------------------------------------------------------------------- 1 | package au.csiro.data61.dataFusion.tika 2 | 3 | import com.optimaize.langdetect.{ LanguageDetector, LanguageDetectorBuilder } 4 | import com.optimaize.langdetect.ngram.NgramExtractors 5 | import com.optimaize.langdetect.profiles.LanguageProfileReader 6 | import com.optimaize.langdetect.text.CommonTextObjectFactories 7 | 8 | object LangDetect { 9 | case class Lang(lang: String, prob: Float) 10 | 11 | val languageProfiles = new LanguageProfileReader().readAllBuiltIn 12 | val languageDetector: LanguageDetector = LanguageDetectorBuilder.create(NgramExtractors.standard).withProfiles(languageProfiles).build 13 | val textObjectFactory = CommonTextObjectFactories.forDetectingOnLargeText 14 | 15 | def headOption[T](jl: java.util.List[T]): Option[T] = if (jl.isEmpty) None else Some(jl.get(0)) 16 | 17 | def lang(text: String): Option[Lang] = { 18 | headOption(languageDetector.getProbabilities(textObjectFactory.forText(text))) 19 | .map(l => Lang(l.getLocale.getLanguage, l.getProbability.toFloat)) 20 | } 21 | } 22 | 23 | -------------------------------------------------------------------------------- /dataFusion-tika/src/test/resources/exampleData/AAA.pptx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/data61/dataFusion/4b766ae65254eed6c5a095f3bb8b6b42b9500a9c/dataFusion-tika/src/test/resources/exampleData/AAA.pptx -------------------------------------------------------------------------------- /dataFusion-tika/src/test/resources/exampleData/Email001.msg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/data61/dataFusion/4b766ae65254eed6c5a095f3bb8b6b42b9500a9c/dataFusion-tika/src/test/resources/exampleData/Email001.msg -------------------------------------------------------------------------------- /dataFusion-tika/src/test/resources/exampleData/PDF001.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/data61/dataFusion/4b766ae65254eed6c5a095f3bb8b6b42b9500a9c/dataFusion-tika/src/test/resources/exampleData/PDF001.pdf -------------------------------------------------------------------------------- /dataFusion-tika/src/test/resources/exampleData/PDF002.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/data61/dataFusion/4b766ae65254eed6c5a095f3bb8b6b42b9500a9c/dataFusion-tika/src/test/resources/exampleData/PDF002.pdf -------------------------------------------------------------------------------- /dataFusion-tika/src/test/resources/exampleData/PDF003.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/data61/dataFusion/4b766ae65254eed6c5a095f3bb8b6b42b9500a9c/dataFusion-tika/src/test/resources/exampleData/PDF003.pdf -------------------------------------------------------------------------------- /dataFusion-tika/src/test/resources/exampleData/PDF004.pdf: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/data61/dataFusion/4b766ae65254eed6c5a095f3bb8b6b42b9500a9c/dataFusion-tika/src/test/resources/exampleData/PDF004.pdf -------------------------------------------------------------------------------- /dataFusion-tika/src/test/resources/exampleData/README.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/data61/dataFusion/4b766ae65254eed6c5a095f3bb8b6b42b9500a9c/dataFusion-tika/src/test/resources/exampleData/README.txt -------------------------------------------------------------------------------- /dataFusion-tika/src/test/resources/exampleData/TIF001.tif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/data61/dataFusion/4b766ae65254eed6c5a095f3bb8b6b42b9500a9c/dataFusion-tika/src/test/resources/exampleData/TIF001.tif -------------------------------------------------------------------------------- /dataFusion-tika/src/test/resources/exampleData/TIF002.tif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/data61/dataFusion/4b766ae65254eed6c5a095f3bb8b6b42b9500a9c/dataFusion-tika/src/test/resources/exampleData/TIF002.tif -------------------------------------------------------------------------------- /dataFusion-tika/src/test/resources/exampleData/TIF003.tif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/data61/dataFusion/4b766ae65254eed6c5a095f3bb8b6b42b9500a9c/dataFusion-tika/src/test/resources/exampleData/TIF003.tif -------------------------------------------------------------------------------- /dataFusion-tika/src/test/resources/exampleData/Thumbs.db: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/data61/dataFusion/4b766ae65254eed6c5a095f3bb8b6b42b9500a9c/dataFusion-tika/src/test/resources/exampleData/Thumbs.db -------------------------------------------------------------------------------- /dataFusion-tika/src/test/resources/exampleData/data-prob-2-12.XLS: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/data61/dataFusion/4b766ae65254eed6c5a095f3bb8b6b42b9500a9c/dataFusion-tika/src/test/resources/exampleData/data-prob-2-12.XLS -------------------------------------------------------------------------------- /dataFusion-tika/src/test/resources/exampleData/doc001.doc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/data61/dataFusion/4b766ae65254eed6c5a095f3bb8b6b42b9500a9c/dataFusion-tika/src/test/resources/exampleData/doc001.doc -------------------------------------------------------------------------------- /dataFusion-tika/src/test/resources/exampleData/doc002.doc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/data61/dataFusion/4b766ae65254eed6c5a095f3bb8b6b42b9500a9c/dataFusion-tika/src/test/resources/exampleData/doc002.doc -------------------------------------------------------------------------------- /dataFusion-tika/src/test/resources/exampleData/html001.html: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/data61/dataFusion/4b766ae65254eed6c5a095f3bb8b6b42b9500a9c/dataFusion-tika/src/test/resources/exampleData/html001.html 
-------------------------------------------------------------------------------- /dataFusion-tika/src/test/resources/exampleData/image001.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/data61/dataFusion/4b766ae65254eed6c5a095f3bb8b6b42b9500a9c/dataFusion-tika/src/test/resources/exampleData/image001.png -------------------------------------------------------------------------------- /dataFusion-tika/src/test/resources/exampleData/image002.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/data61/dataFusion/4b766ae65254eed6c5a095f3bb8b6b42b9500a9c/dataFusion-tika/src/test/resources/exampleData/image002.gif -------------------------------------------------------------------------------- /dataFusion-tika/src/test/resources/exampleData/image003.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/data61/dataFusion/4b766ae65254eed6c5a095f3bb8b6b42b9500a9c/dataFusion-tika/src/test/resources/exampleData/image003.jpeg -------------------------------------------------------------------------------- /dataFusion-tika/src/test/resources/exampleData/image004.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/data61/dataFusion/4b766ae65254eed6c5a095f3bb8b6b42b9500a9c/dataFusion-tika/src/test/resources/exampleData/image004.png -------------------------------------------------------------------------------- /dataFusion-tika/src/test/resources/exampleData/xls001.xls: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/data61/dataFusion/4b766ae65254eed6c5a095f3bb8b6b42b9500a9c/dataFusion-tika/src/test/resources/exampleData/xls001.xls -------------------------------------------------------------------------------- /dataFusion-tika/src/test/resources/logback-test.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | %d{yyyy-MM-dd HH:mm:ss.SSS} [%thread] %-5level %logger{36} - %msg%n 6 | 7 | 8 | 9 | 10 | tika-test.log 11 | 12 | %d{yyyy-MM-dd HH:mm:ss.SSS} [%thread] %-5level %logger{36} - %msg%n 13 | 14 | 15 | 16 | 17 | 18 | 23 | 24 | 25 | 26 | 27 | 28 | -------------------------------------------------------------------------------- /dataFusion-tika/src/test/scala/au/csiro/data61/dataFusion/tika/TikaTest.scala: -------------------------------------------------------------------------------- 1 | package au.csiro.data61.dataFusion.tika 2 | 3 | import org.scalatest.{ FlatSpec, Matchers } 4 | 5 | import com.typesafe.scalalogging.Logger 6 | 7 | class TikaTest extends FlatSpec with Matchers { 8 | private val log = Logger(getClass) 9 | val tikaUtil = new TikaUtil(Main.defaultCliOption) 10 | 11 | "Tika" should "extract 1 page of PDF" in { 12 | val path = "/exampleData/PDF002.pdf" // born digital, has logo image with no text 13 | val docIn = tikaUtil.tika(getClass.getResourceAsStream(path), path, 0L) 14 | // log.debug(s"docIn = ${docIn}") 15 | docIn.content.map(_.size).getOrElse(0) > 100 should be(true) // born digital text 16 | docIn.embedded.size should be(1) // has 1 embedded doc - the logo 17 | 18 | // log.debug(s"content = ${docIn.embedded(0).content}") 19 | // docIn.embedded(0).content.isDefined should be(false) // for which we get no text 20 | // we got content = None with tesseract3 but Some with tesseract4, so commented out 
this bit 21 | } 22 | 23 | it should "extract 5 pages of PDF" in { 24 | val path = "/exampleData/PDF003.pdf" // scanned doc 25 | val docIn = tikaUtil.tika(getClass.getResourceAsStream(path), path, 0L) 26 | // log.debug(s"docIn = ${docIn}") 27 | docIn.content.map(_.size).getOrElse(0) > 100 should be(true) // text OCR by scanner 28 | docIn.embedded.size should be(5) // 5 embedded page images 29 | docIn.embedded.foreach(_.content.map(_.size).getOrElse(0) > 100 should be(true)) // tesseract got text from each page 30 | } 31 | 32 | it should "extract from good Excel" in { 33 | val path = "/exampleData/xls001.xls" 34 | val d = tikaUtil.tika(getClass.getResourceAsStream(path), path, 0L) 35 | // log.debug(s"d = $d") 36 | d.content.get.contains("Principality of Liechtenstein") should be(true) 37 | d.meta.get("Content-Type") should be(Some("application/vnd.ms-excel")) 38 | } 39 | 40 | it should "convert good Excel to opendocument.spreadsheet (only when explicitly asked to) and extract" in { 41 | val path = "/exampleData/xls001.xls" 42 | val d = tikaUtil.convertAndParseDoc(getClass.getResourceAsStream(path), path, 0L) 43 | // log.debug(s"d = $d") 44 | d.content.get.contains("Principality of Liechtenstein") should be(true) 45 | d.meta.get("Content-Type") should be(Some("application/vnd.oasis.opendocument.spreadsheet")) 46 | } 47 | 48 | it should "convert bad Excel to opendocument.spreadsheet (when not explicitly asked to) and extract" in { 49 | // test Excel file is attachment from: https://bz.apache.org/bugzilla/show_bug.cgi?id=57104 50 | val path = "/exampleData/data-prob-2-12.XLS" 51 | val d = tikaUtil.tika(getClass.getResourceAsStream(path), path, 0L) 52 | // log.debug(s"d = $d") 53 | d.content.get.contains("562.03") should be(true) 54 | d.meta.get("Content-Type") should be(Some("application/vnd.oasis.opendocument.spreadsheet")) 55 | } 56 | 57 | } -------------------------------------------------------------------------------- /dataFusion-util/3rd-party-licenses.md: -------------------------------------------------------------------------------- 1 | # datafusion-util-licenses 2 | 3 | Category | License | Dependency | Notes 4 | --- | --- | --- | --- 5 | Apache | [Apache 2](http://www.apache.org/licenses/LICENSE-2.0.txt) | io.spray # spray-json_2.12 # 1.3.3 | 6 | Apache | [Apache 2.0 License](http://www.apache.org/licenses/LICENSE-2.0.html) | com.typesafe.scala-logging # scala-logging_2.12 # 3.5.0 | 7 | Apache | [Apache License, Version 2.0](http://www.apache.org/licenses/LICENSE-2.0) | com.typesafe # config # 1.3.1 | 8 | Apache | [Apache License, Version 2.0](http://www.apache.org/licenses/LICENSE-2.0.txt) | commons-io # commons-io # 2.5 | 9 | Apache | [The Apache Software License, Version 2.0](http://www.apache.org/licenses/LICENSE-2.0.txt) | com.google.guava # guava # 18.0 | 10 | Apache | [The Apache Software License, Version 2.0](http://www.apache.org/licenses/LICENSE-2.0.txt) | org.apache.lucene # lucene-analyzers-common # 7.0.1 | 11 | Apache | [The Apache Software License, Version 2.0](http://www.apache.org/licenses/LICENSE-2.0.txt) | org.apache.lucene # lucene-core # 7.0.1 | 12 | Apache | [The Apache Software License, Version 2.0](http://www.apache.org/licenses/LICENSE-2.0.txt) | org.apache.lucene # lucene-highlighter # 7.0.1 | 13 | Apache | [The Apache Software License, Version 2.0](http://www.apache.org/licenses/LICENSE-2.0.txt) | org.apache.lucene # lucene-join # 7.0.1 | 14 | Apache | [The Apache Software License, Version 2.0](http://www.apache.org/licenses/LICENSE-2.0.txt) | 
org.apache.lucene # lucene-memory # 7.0.1 | 15 | Apache | [The Apache Software License, Version 2.0](http://www.apache.org/licenses/LICENSE-2.0.txt) | org.apache.lucene # lucene-queries # 7.0.1 | 16 | Apache | [The Apache Software License, Version 2.0](http://www.apache.org/licenses/LICENSE-2.0.txt) | org.apache.lucene # lucene-queryparser # 7.0.1 | 17 | Apache | [The Apache Software License, Version 2.0](http://www.apache.org/licenses/LICENSE-2.0.txt) | org.apache.lucene # lucene-sandbox # 7.0.1 | 18 | Apache | [the Apache License, ASL Version 2.0](http://www.apache.org/licenses/LICENSE-2.0) | org.scalactic # scalactic_2.12 # 3.0.0 | 19 | Apache | [the Apache License, ASL Version 2.0](http://www.apache.org/licenses/LICENSE-2.0) | org.scalatest # scalatest_2.12 # 3.0.0 | 20 | BSD | [BSD 3-Clause](http://www.scala-lang.org/license.html) | org.scala-lang # scala-library # 2.12.2 | 21 | BSD | [BSD 3-Clause](http://www.scala-lang.org/license.html) | org.scala-lang # scala-reflect # 2.12.2 | 22 | BSD | [BSD 3-clause](http://opensource.org/licenses/BSD-3-Clause) | org.scala-lang.modules # scala-parser-combinators_2.12 # 1.0.4 | 23 | BSD | [BSD 3-clause](http://opensource.org/licenses/BSD-3-Clause) | org.scala-lang.modules # scala-xml_2.12 # 1.0.5 | 24 | BSD | [BSD-Style](http://www.opensource.org/licenses/bsd-license.php) | com.jsuereth # scala-arm_2.12 # 2.0 | 25 | GPL | [GPL](https://www.gnu.org/licenses/gpl-3.0.en.html) | au.csiro.data61 # datafusion-common_2.12 # 1.1-SNAPSHOT | 26 | GPL | [GPL](https://www.gnu.org/licenses/gpl-3.0.en.html) | au.csiro.data61 # datafusion-search_2.12 # 1.1-SNAPSHOT | 27 | LGPL | [EPL + GNU Lesser General Public License](http://logback.qos.ch/license.html) | ch.qos.logback # logback-classic # 1.2.3 | 28 | LGPL | [EPL + GNU Lesser General Public License](http://logback.qos.ch/license.html) | ch.qos.logback # logback-core # 1.2.3 | 29 | MIT | [MIT License](http://www.opensource.org/licenses/mit-license.php) | com.github.scopt # scopt_2.12 # 3.5.0 | 30 | MIT | [MIT License](http://www.slf4j.org/license.html) | org.slf4j # slf4j-api # 1.7.25 | 31 | 32 | -------------------------------------------------------------------------------- /dataFusion-util/README.md: -------------------------------------------------------------------------------- 1 | # dataFusion-util 2 | 3 | ## Introduction 4 | This project provides command line utilities for: 5 | - Filtering and merging [Search Result JSON format](../dataFusion-common#search-result-json-format) into the [Document JSON format](../dataFusion-common#document-json-format) (`--hits` CLI option). 6 | Search results for `typ=PERSON2` (using only first and family names) often overlap with `typ=PERSON` (using the full name). 7 | In this case the `typ=PERSON2` result is an inferior match and is filtered out. 8 | This processing also filters out the spurious matches described in [People](../dataFusion-search#people). 9 | - Parsing content for mentions of people in email headers and merging results into the [Document JSON format](../dataFusion-common#document-json-format) (`--email` CLI option). 10 | If the resulting `offStr` (see [NER Structure](../dataFusion-common#ner-structure)) matches that of a NER with `impl=D61GAZ` and `typ=PERSON|PERSON2` then the `score` and `extRef` are taken from that NER. 11 | Otherwise extRef is not set and score is computed using the Lucene's IDF formula if the `--emailIDF` option is true (default) else it's set to 1.0. 
12 | - Parsing content for age soon after a person's name and merging results into the [Document JSON format](../dataFusion-common#document-json-format) (`--age` CLI option). 13 | Age is recognized as a number from 18 - 99 inclusive, either: 14 | parenthesized immediately after a name (a NER with `impl=D61GAZ` and `typ=PERSON|PERSON2`) 15 | and not followed by further digits (to avoid telephone number area codes); 16 | or within 50 chars and following the word "age" or "aged" (only applied to the closest preceding person's name). 17 | The `extRef` is set from the NER representing the name and `score` is set to 1.0.
18 | - Network building from the [Document JSON format](../dataFusion-common#document-json-format) as detailed below (`--proximity` CLI option). 19 | - Reallocating the ids in a [Document JSON format](../dataFusion-common#document-json-format) file, 20 | which can be useful in the case of merging multiple partial tika runs where the joint ids would otherwise not be unique (`--resetId` CLI option). 21 | 22 | The CLI options `--hits`, `--email` and `--age` can be used jointly. 23 | 24 | ## Network Building 25 | ### Input 26 | Network building uses the following named entities (see [NER Structure](../dataFusion-common#ner-structure) for details): 27 | - `impl=D61GAZ` and `typ=PERSON|PERSON2|ORGANIZATION`; 28 | - `impl=D61EMAIL` and `typ=FROM|TO|CC|BCC` and no extRef (if extRef is set it is a duplicate of an `impl=D61GAZ` named entity). 29 | ### Node Generation 30 | Nodes generated from `impl=D61GAZ` named entities aggregate all named entity mentions with the same `typ` and `extRef`. 31 | 32 | Nodes generated from `impl=D61EMAIL` named entities aggregate all named entity mentions with the same `text` irrespective of `typ`. Node attributes are set: 33 | - `node.extRef.name=ner.text` 34 | - `node.extRef.ids=[]` 35 | - `node.typ=D61EMAIL` 36 | 37 | The mapping from `ner.typ=FROM|TO|CC|BCC` to `node.typ=D61EMAIL` prevents separate nodes appearing in the visualization for the same person depending on the particular email header, which would also diminish connection weights by spreading them across multiple edges. 38 | 39 | ### Collections 40 | Documents are grouped into collections. 41 | Documents in the filesystem are under (but not necessarily directly under) a directory that represents their collection. 42 | The CLI option `--collectionRe` specifies a [regex](https://en.wikipedia.org/wiki/Regular_expression) to extract the collection from a document's path. 43 | The default value for this option, `/collection/([^/]+)/`, is suitable if `collection` is the common parent directory for all collections. 44 | 45 | ### Edge Generation 46 | Parameters are the decay value (set by the `--decay` CLI option with default value 500 characters) and a cutoff which is `5 * decay`. 47 | 48 | (weight, count) for an edge representing co-occurrences of named entities n1 and n2 in collection c = 49 | sum over documents d in collection c 50 | sum over sub-documents e in d (main content and each embedded document) 51 | sum over pairs of instances of n1 & n2 in e, where dist = abs( n2.offStr - n1.offStr ) < cutoff 52 | weight = exp( - dist / decay ), count = 1 53 | 54 | ### Output 55 | The edges computed above (with count > 0) are written in [Edge JSON format](../dataFusion-common#node-and-edge-json-formats) to proximity-edge.json and the nodes referenced in these edges are written in [Node JSON format](../dataFusion-common#node-and-edge-json-formats) to proximity-node.json.
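The Edge Generation computation above can be illustrated with a short Scala sketch. This is illustrative only: `Mention` and `edgeContributions` are hypothetical names invented here, not types or functions from this code base, and the real implementation additionally handles collections and node identity as described above.

    import scala.math.{ abs, exp }

    // hypothetical simplified mention: character offset of the named entity (offStr)
    // and the id of the node it was aggregated into
    case class Mention(nodeId: Int, offStr: Int)

    /** (total weight, count) per node pair for one sub-document, following the formula above */
    def edgeContributions(mentions: Seq[Mention], decay: Double = 500.0): Map[(Int, Int), (Double, Int)] = {
      val cutoff = 5 * decay
      val contribs = for {
        Seq(m1, m2) <- mentions.combinations(2).toSeq
        if m1.nodeId != m2.nodeId                           // only pairs of distinct entities
        dist = abs(m2.offStr - m1.offStr).toDouble
        if dist < cutoff                                    // ignore distant co-occurrences
        key = if (m1.nodeId < m2.nodeId) (m1.nodeId, m2.nodeId) else (m2.nodeId, m1.nodeId)
      } yield (key, exp(-dist / decay))                     // weight decays with distance
      contribs.groupBy(_._1).map { case (key, ws) =>
        (key, (ws.map(_._2).sum, ws.size))                  // sum weights; count co-occurrences
      }
    }

Summing these per-sub-document maps over all documents in a collection gives the (weight, count) pairs that end up as edges in proximity-edge.json.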
56 | 57 | ## Build, Configuration and Running 58 | 59 | See the top level [README](../README.md). 60 | The score computation for the `--emailIDF` option requires term document frequencies from the Lucene index, which is located using the configuration from dataFusion-search. 61 | -------------------------------------------------------------------------------- /dataFusion-util/build.sbt: -------------------------------------------------------------------------------- 1 | name := "dataFusion-util" 2 | 3 | libraryDependencies ++= Seq( 4 | "com.github.scopt" %% "scopt" % "3.5.0", 5 | "com.jsuereth" %% "scala-arm" % "2.0", 6 | "org.scalatest" %% "scalatest" % "3.0.0" % "test" 7 | ) 8 | 9 | com.github.retronym.SbtOneJar.oneJarSettings 10 | 11 | mainClass in Compile := Some("au.csiro.data61.dataFusion.util.Main") 12 | -------------------------------------------------------------------------------- /dataFusion-util/src/main/resources/application.conf: -------------------------------------------------------------------------------- 1 | search = { 2 | // See: https://lucene.apache.org/core/6_6_0/analyzers-common/org/apache/lucene/analysis/synonym/SolrSynonymParser.html 3 | synonyms = "../dataFusion-search/synonyms.txt" 4 | synonyms = ${?SEARCH_SYNONYMS} 5 | } 6 | -------------------------------------------------------------------------------- /dataFusion-util/src/main/resources/logback.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | util.log 11 | 12 | %d{yyyy-MM-dd HH:mm:ss.SSS} [%thread] %-5level %logger{36} - %msg%n 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | -------------------------------------------------------------------------------- /dataFusion-util/src/main/scala/au/csiro/data61/dataFusion/util/Age.scala: -------------------------------------------------------------------------------- 1 | package au.csiro.data61.dataFusion.util 2 | 3 | import com.typesafe.scalalogging.Logger 4 | 5 | import au.csiro.data61.dataFusion.common.Data.{ Doc, GAZ, Ner, T_PERSON, T_PERSON2 } 6 | 7 | /** create AGE Ner's from GAZ PERSON{,2} Ners followed by a parenthesized number 18-99 */ 8 | object Age { 9 | private val log = Logger(getClass) 10 | 11 | val ageRe1 = """\s*\((\d{2})\)(?!\s*\d)""".r // look for " (dd)" after a name not followed by further digits (a phone number) 12 | val ageRe2 = """(.{0,50}\baged?) 
(\d{2})\b""".r // look for "aged dd" within 50 chars after a name (to allow for a title in beween) 13 | def wordCount(s: String) = s.split("\\s+").length 14 | def find(s: String) = ageRe1.findPrefixMatchOf(s).map((_, 1, 0)).orElse(ageRe2.findPrefixMatchOf(s).map(m => (m, 2, wordCount(m.group(1))))) 15 | 16 | def toNer(content: String, ner: List[Ner]): Iterator[Ner] = { 17 | val it = for { 18 | n <- ner.sortBy(_.offStr).iterator if n.impl == GAZ && (n.typ == T_PERSON || n.typ == T_PERSON2) 19 | (m, grp, posOffset) <- find(content.substring(n.offEnd)) if m.group(grp).toInt >= 18 // must be adult 20 | } yield Ner(n.posEnd + posOffset, n.posEnd + posOffset + 1, n.offEnd + m.start(grp), n.offEnd + m.end(grp), 1.0, m.group(grp), "AGE", "D61AGE", n.extRef) 21 | 22 | // if we have two close PERSONs followed by an AGE the above could associate the same AGE with both of them 23 | // with Ners sorted as above, we only want the last one 24 | val dummy = Ner(0, 0, 0, 0, 0.0, "text", "typ", "impl", None) 25 | (it ++ Iterator.single(dummy)).sliding(2).flatMap { 26 | case Seq(n1, n2) if n1.offStr != n2.offStr => Iterator.single(n1) 27 | case _ => Iterator.empty 28 | } 29 | } 30 | 31 | val augment: Doc => Doc = { d => 32 | val ner = d.ner ++ d.content.toList.flatMap(toNer(_, d.ner)) 33 | val embedded = d.embedded.map { e => 34 | val ner = e.ner ++ e.content.toList.flatMap(toNer(_, e.ner)) 35 | e.copy(ner = ner) 36 | } 37 | d.copy(ner = ner, embedded = embedded) 38 | } 39 | } 40 | -------------------------------------------------------------------------------- /dataFusion-util/src/main/scala/au/csiro/data61/dataFusion/util/Hits.scala: -------------------------------------------------------------------------------- 1 | package au.csiro.data61.dataFusion.util 2 | 3 | import java.io.InputStream 4 | 5 | import scala.io.Source 6 | 7 | import com.typesafe.scalalogging.Logger 8 | 9 | import au.csiro.data61.dataFusion.common.Data._ 10 | import au.csiro.data61.dataFusion.common.Data.{ LPosDoc, Ner, PHits, PosInfo } 11 | import au.csiro.data61.dataFusion.common.Data.JsonProtocol.pHitsCodec 12 | import spray.json.pimpString 13 | import scala.annotation.tailrec 14 | 15 | import au.csiro.data61.dataFusion.search.DataFusionLucene._ 16 | import au.csiro.data61.dataFusion.search.LuceneUtil._ 17 | import java.util.Comparator 18 | import java.util.Arrays 19 | 20 | /** 21 | * In dataFusion-search the phrase search for PERSON|PERSON2, with terms in any order, can make some incorrect matches. 22 | * This happens when the query contains repeated tokens e.g. "Aaron H Aaron" in which case text "H H Aaron" will match. 23 | * This is corrected here by checking that matches have the same term frequencies as the query. 24 | * Unfortunately this check cannot be done at search time in dataFusion-search because fetching the text at that point would 25 | * negatively impact performance (not an issue here because we already have the text). 26 | * 27 | * A dependency on dataFusion-search has been added so that we can use the same term tokenization as in the search. 
28 | */ 29 | object Hits { 30 | private val log = Logger(getClass) 31 | 32 | def hitIter(hIn: InputStream): Iterator[PHits] = Source.fromInputStream(hIn, "UTF-8").getLines.map(_.parseJson.convertTo[PHits]) 33 | 34 | /** idEmbIdx -> extRefId, score, typ, lposdoc */ 35 | type HitsMap = Map[IdEmbIdx, Seq[(ExtRef, Double, String, LPosDoc)]] 36 | 37 | def hitsMap(iter: Iterator[PHits]): HitsMap = 38 | iter.flatMap { x => 39 | x.hits.map(lposdoc => (x.extRef, x.score, x.typ, lposdoc)) 40 | }.toSeq.groupBy(_._4.idEmbIdx) 41 | 42 | def termFreq(t: String) = tokenIter(analyzer, F_CONTENT, t).toList.groupBy(identity).map { case (t, lst) => (t, lst.size) } 43 | 44 | /** 45 | * @return Some(termFreq) if the hits need to be checked against this query term freq (a PERSON|PERSON2 search with repeated terms) 46 | */ 47 | def qTermFreq(t: String, typ: String) = 48 | if (typ == T_ORGANIZATION) None // not needed for "terms in order" search 49 | else { 50 | val tf = termFreq(t) 51 | if (tf.values.exists(_ > 1)) Some(tf) 52 | else None // not needed if no duplicate terms 53 | } 54 | 55 | def toNer(text: String, pi: PosInfo, extRef: ExtRef, score: Double, typ: String) = 56 | Ner(pi.posStr, pi.posEnd, pi.offStr, pi.offEnd, score, text, typ, GAZ, Some(extRef)) 57 | 58 | /** 59 | * find if any PERSON NER overlaps with a PERSON2 60 | * A----------B n1 = PERSON 61 | * C--------D n2 = PERSON2 62 | * no overlap = D < A or B < C 63 | * overlap = D >= A and B >= C 64 | * A & C are offStr; B & D are offEnd - 1 because offEnd is exclusive (1 past the end) 65 | * So overlap = n2.offEnd - 1 >= n1.offStr && n1.offEnd - 1 >= n2.offStr 66 | * = n2.offEnd > n1.offStr && n1.offEnd > n2.offStr 67 | * 68 | * Sort n1 = PERSON on offEnd asc 69 | * Binary search to find first n1: n1.offEnd > n2.offStr (the bit after &&) 70 | * Scan n1's until n2.offEnd < n1.offStr for overlap 71 | */ 72 | val nerCmp = new Comparator[Ner] { 73 | override def compare(a: Ner, b:Ner) = a.offEnd - b.offEnd 74 | } 75 | 76 | def filterPer2(ners: Seq[Ner]): Seq[Ner] = { 77 | val per = { 78 | val a = ners.view.filter(n => n.impl == GAZ && n.typ == T_PERSON).toArray 79 | Arrays.sort(a, nerCmp) 80 | log.debug(s"filterPer2.per: ${a.toList}") 81 | a 82 | } 83 | 84 | def pred(n: Ner): Boolean = n.typ != T_PERSON2 || { 85 | val i = Arrays.binarySearch(per, n.copy(offEnd = n.offStr + 1), nerCmp) // find 1st per(j).offEnd > n.offStr (>= n.offStr + 1) 86 | val j = if (i >= 0) i else -(i + 1) 87 | val overlaps = j < per.length && per(j).offStr < n.offEnd // assume per(j)'s don't overlap so no need to scan 88 | log.debug(s"i = $i, j = $j, overlaps = $overlaps, n = $n") 89 | !overlaps 90 | } 91 | 92 | ners filter pred 93 | } 94 | 95 | def augment(hs: HitsMap): Doc => Doc = { d => 96 | 97 | def searchNers(content: Option[String], idEmbIdx: IdEmbIdx): Seq[Ner] = for { 98 | c <- content.toSeq 99 | hits <- hs.get(idEmbIdx).toSeq 100 | (extRefId, score, typ, lposdoc) <- hits 101 | qtf = qTermFreq(extRefId.name, typ) // query: term -> freq but only if it needs to be checked 102 | pi <- lposdoc.posInfos 103 | text = c.substring(pi.offStr, pi.offEnd) 104 | ok <- qtf.map(_ == termFreq(text)).orElse(Some(true)) if ok // skip if there's a term freq mismatch 105 | } yield toNer(text, pi, extRefId, score, typ) 106 | 107 | def newNers(content: Option[String], idEmbIdx: IdEmbIdx): Seq[Ner] = filterPer2(searchNers(content, idEmbIdx)) 108 | 109 | val ner = d.ner ++ newNers(d.content, IdEmbIdx(d.id, EMB_IDX_MAIN)) 110 | val embedded = d.embedded.zipWithIndex.map { case (e, embIdx) => 111 
| val ner = e.ner ++ newNers(e.content, IdEmbIdx(d.id, embIdx)) 112 | e.copy(ner = ner) 113 | } 114 | d.copy(ner = ner, embedded = embedded) 115 | } 116 | 117 | } -------------------------------------------------------------------------------- /dataFusion-util/src/main/scala/au/csiro/data61/dataFusion/util/Proximity.scala: -------------------------------------------------------------------------------- 1 | package au.csiro.data61.dataFusion.util 2 | 3 | import java.io.File 4 | import java.util.concurrent.ConcurrentHashMap 5 | import java.util.concurrent.atomic.AtomicInteger 6 | 7 | import scala.collection.JavaConverters.{ asScalaSetConverter, collectionAsScalaIterableConverter } 8 | import scala.io.Source 9 | 10 | import com.typesafe.scalalogging.Logger 11 | 12 | import Main.CliOption 13 | import au.csiro.data61.dataFusion.common.Data.{ Doc, EMAIL, Edge, ExtRef, GAZ } 14 | import au.csiro.data61.dataFusion.common.Data.{ Ner, Node, T_ORGANIZATION, T_PERSON, T_PERSON2, WeightMap } 15 | import au.csiro.data61.dataFusion.common.Data.JsonProtocol.{ docFormat, edgeFormat, nodeFormat } 16 | import au.csiro.data61.dataFusion.common.Parallel.doParallel 17 | import au.csiro.data61.dataFusion.common.Util.bufWriter 18 | import resource.managed 19 | import spray.json.{ pimpAny, pimpString } 20 | 21 | object Proximity { 22 | private val log = Logger(getClass) 23 | 24 | def fileWithSuffix(f: File, suffix: String) = new File(f.getPath + suffix) 25 | 26 | /** 27 | * GAZ NERs and EMAIL NERs with no extRef (possibly non-Australian - avoids duplicates with GAZ NERs). 28 | * Also map all EMAIL typ (FROM|TO|CC|BCC) to a single typ=EMAIL so we only get one node per person. 29 | */ 30 | def nerFilter(ner: List[Ner]): Iterator[Ner] = { 31 | // val emailNer = ner.filter(n => n.impl == EMAIL) 32 | // val offStr = emailNer.view.map(_.offStr).toSet 33 | // emailNer.iterator ++ ner.view.filter(n => n.impl == GAZ && (n.typ == T_PERSON || n.typ == T_PERSON2 || n.typ == T_ORGANIZATION) && !offStr.contains(n.offStr)) 34 | ner.iterator.filter(n => n.impl == GAZ || (n.impl == EMAIL && n.extRef.isEmpty)).map(n => if (n.impl == EMAIL) n.copy(typ = EMAIL) else n) 35 | } 36 | 37 | def doProximity(cliOption: CliOption) = { 38 | val prox = new Proximity(cliOption, nerFilter) 39 | 40 | val in = Source.fromInputStream(System.in, "UTF-8").getLines 41 | def work(json: String) = { 42 | prox.accDoc(json.parseJson.convertTo[Doc]) 43 | "more" 44 | } 45 | def out(s: String) = {} 46 | doParallel(in, work, out, "done", "done", cliOption.numWorkers) 47 | log.info("load complete") 48 | 49 | type JOB = () => String 50 | 51 | val job1: JOB = () => { 52 | for { 53 | o <- cliOption.output 54 | w <- managed(bufWriter(fileWithSuffix(o, "node.json"))) 55 | n <- prox.nodeMap.values.asScala 56 | } { 57 | w.write(n.toJson.compactPrint) 58 | w.write('\n') 59 | } 60 | "more" 61 | } 62 | 63 | val job2: JOB = () => { 64 | for { 65 | o <- cliOption.output 66 | w <- managed(bufWriter(fileWithSuffix(o, "edge.json"))) 67 | e <- prox.edgeMap.entrySet.asScala 68 | } { 69 | w.write(Edge(e.getKey._1, e.getKey._2, e.getValue, GAZ).toJson.compactPrint) 70 | w.write('\n') 71 | } 72 | "more" 73 | } 74 | 75 | val in2 = Iterator(job1, job2) 76 | def work2(job: JOB) = job() 77 | doParallel(in2, work2, out, () => "done", "done", Math.min(2, cliOption.numWorkers)) 78 | } 79 | 80 | case class NodeKey(name: String, typ: String) 81 | } 82 | 83 | /** thread-safe for concurrent accDoc's */ 84 | class Proximity(cliOption: CliOption, nerFilter: List[Ner]=> Iterator[Ner]) { 85 | 
import Proximity.NodeKey 86 | 87 | val nextId = new AtomicInteger(0) 88 | val nodeMap = new ConcurrentHashMap[NodeKey, Node]() 89 | 90 | // Scala's concurrent map TrieMap does not have anything like Java's ConcurrentHashMap.compute, which I think makes it rather useless! 91 | 92 | def accNode(k: NodeKey, score: Double, extRef: ExtRef): Int = 93 | nodeMap.computeIfAbsent(k, k => Node(nextId.getAndIncrement, extRef, score, k.typ)).nodeId 94 | 95 | val edgeMap = new ConcurrentHashMap[(Int, Int), WeightMap] 96 | 97 | def accEdge(source: Int, target: Int, collection: String, weight: Double): Unit = { 98 | val k = if (source < target) (source, target) else (target, source) 99 | edgeMap.compute(k, (k, v) => 100 | if (v == null) Map(collection -> (weight, 1)) withDefaultValue (0.0, 0) 101 | else { 102 | val (w0, c0) = v(collection) 103 | v + (collection -> (w0 + weight, c0 + 1)) 104 | } 105 | ) 106 | } 107 | 108 | val collectionRE = cliOption.collectionRe.r 109 | def collection(path: String) = collectionRE.findFirstMatchIn(path).map(_.group(1)).getOrElse("UNKNOWN") 110 | 111 | // used concurrently 112 | def accDoc(d: Doc): Unit = { 113 | val cutoff = (cliOption.decay * 5).toInt 114 | for { 115 | ners <- nerFilter(d.ner) +: d.embedded.view.map(e => nerFilter(e.ner)) 116 | v = ners.toIndexedSeq.sortBy(_.offStr) 117 | // _ = log.info(s"v.size = ${v.size}") 118 | i <- 0 until v.size - 1 // exclude last 119 | ni = v(i) 120 | extRefi = ni.extRef.getOrElse(ExtRef(ni.text, List.empty)) 121 | (j, dist) <- (i + 1 until v.size).view.map { j => (j, v(j).offStr - ni.offStr) }.takeWhile(_._2 < cutoff) 122 | nj = v(j) 123 | extRefj = nj.extRef.getOrElse(ExtRef(nj.text, List.empty)) 124 | } { 125 | // log.info(s"$i, $j -> $dist") 126 | val idi = accNode(NodeKey(extRefi.name, ni.typ), ni.score, extRefi) 127 | val idj = accNode(NodeKey(extRefj.name, nj.typ), nj.score, extRefj) 128 | if (idi != idj) accEdge(idi, idj, collection(d.path), Math.exp(-dist/cliOption.decay)) // additive weight (distance = 1/sum(weights)) 129 | } 130 | } 131 | 132 | } 133 | 134 | -------------------------------------------------------------------------------- /dataFusion-util/src/main/scala/au/csiro/data61/dataFusion/util/TmNer.scala: -------------------------------------------------------------------------------- 1 | package au.csiro.data61.dataFusion.util 2 | 3 | import spray.json.DefaultJsonProtocol._ 4 | import java.io.InputStream 5 | import scala.io.Source 6 | import spray.json._ 7 | import au.csiro.data61.dataFusion.common.Data._ 8 | 9 | /** 10 | * Merge Debbie's ner results. 11 | * Data provided as CSV files with Windows line endings and with our doc id in the filename but not in the data. 12 | * Steps to clean the data:
13 | *  - sed -i 's/\r//' *                                       # get rid of Windows \r
14 | *  - awk -f /data/neil/tmner.awk tmner/*.csv > tmner.json    # convert to JSON with id in the data
15 | *
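 * A hypothetical example line of tmner.json (field names follow case class Tmner below):
 *   {"id": 123, "typ": "PERSON", "offStr": 10, "offEnd": 15, "text": "Jones"}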
16 | * This code merges in the resulting JSON which has the structure of case class Tmner. 17 | */ 18 | object TmNer { 19 | case class Tmner(id: Long, typ: String, offStr: Int, offEnd: Int, text: String) 20 | 21 | implicit val tmnerFormat = jsonFormat5(Tmner) 22 | 23 | def tmnerIter(hIn: InputStream): Iterator[Tmner] = Source.fromInputStream(hIn, "UTF-8").getLines.map(_.parseJson.convertTo[Tmner]) 24 | 25 | type TMap = Map[Long, Seq[Tmner]] 26 | def tmnerMap(iter: Iterator[Tmner]): TMap = iter.toSeq.groupBy(_.id) 27 | 28 | def toNer(t: Tmner) = Ner(-1, -1, t.offStr, t.offEnd, 1.0f, t.text, t.typ, "TMNER", None) 29 | 30 | def augment(m: TMap): Doc => Doc = { d => 31 | m.get(d.id) match { 32 | case Some(s) => d.copy(ner = d.ner ++ s.map(toNer)) 33 | case None => d 34 | } 35 | } 36 | 37 | } -------------------------------------------------------------------------------- /dataFusion-util/src/test/resources/logback-test.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | %d{yyyy-MM-dd HH:mm:ss.SSS} [%thread] %-5level %logger{36} - %msg%n 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | -------------------------------------------------------------------------------- /dataFusion-util/src/test/scala/au/csiro/data61/dataFusion/util/AgeTest.scala: -------------------------------------------------------------------------------- 1 | package au.csiro.data61.dataFusion.util 2 | 3 | import org.scalatest.{ FlatSpec, Matchers } 4 | 5 | import com.typesafe.scalalogging.Logger 6 | 7 | import au.csiro.data61.dataFusion.common.Data._ 8 | import Age._ 9 | 10 | class AgeTest extends FlatSpec with Matchers { 11 | private val log = Logger(getClass) 12 | 13 | def mkNer(content: String, name: String, ids: List[Long]) = { 14 | val offStr = content.indexOf(name) 15 | assert(offStr != -1) 16 | val posStr = wordCount(content.substring(0, offStr)) 17 | Ner(posStr, posStr + wordCount(name), offStr, offStr + name.length, 0.0, name, T_PERSON2, GAZ, Some(ExtRef(name, ids))) 18 | } 19 | 20 | "Age.toNer" should "find parenthesized age" in { 21 | val c = "The newbie Jacinda Ardern (37) was selected." 22 | val n1 = mkNer(c, "Jacinda Ardern", List(101, 102)) 23 | val d = Doc(1, Some(c), Map.empty, "path", List(n1), List.empty) 24 | val d2 = augment(d) 25 | log.debug(s"d2 = $d2") 26 | d2.ner.size should be(2) 27 | val expected = Ner(n1.posEnd, n1.posEnd + 1, n1.offEnd + 2, n1.offEnd + 4, 1.0, "37", "AGE", "D61AGE", n1.extRef) 28 | assert(d2.ner.contains(expected)) 29 | } 30 | 31 | it should "not find parenthesized age after other text" in { 32 | val c = "The newbie Jacinda Ardern blah (37) was selected." 33 | val n1 = mkNer(c, "Jacinda Ardern", List(101, 102)) 34 | val d = Doc(1, Some(c), Map.empty, "path", List(n1), List.empty) 35 | val d2 = augment(d) 36 | log.debug(s"d2 = $d2") 37 | d2 should be(d) 38 | } 39 | 40 | it should "not find age in a phone number" in { 41 | val c = "The newbie Jacinda Ardern (65) 3214-3456 was selected." 42 | val n1 = mkNer(c, "Jacinda Ardern", List(101, 102)) 43 | val d = Doc(1, Some(c), Map.empty, "path", List(n1), List.empty) 44 | val d2 = augment(d) 45 | log.debug(s"d2 = $d2") 46 | d2 should be(d) 47 | } 48 | 49 | it should "find age in 'name, aged dd'" in { 50 | val c = "The newbie Jacinda Ardern, aged 37, was selected." 
51 | val n1 = mkNer(c, "Jacinda Ardern", List(101, 102)) 52 | val d = Doc(1, Some(c), Map.empty, "path", List(n1), List.empty) 53 | val d2 = augment(d) 54 | log.debug(s"d2 = $d2") 55 | d2.ner.size should be(2) 56 | val expected = Ner(n1.posEnd + 2, n1.posEnd + 3, n1.offEnd + 7, n1.offEnd + 9, 1.0, "37", "AGE", "D61AGE", n1.extRef) 57 | assert(d2.ner.contains(expected)) 58 | } 59 | 60 | it should "find age within 50 chars after name" in { 61 | val c = "Jacinda Ardern, future PM, aged 37, was selected." 62 | val n1 = mkNer(c, "Jacinda Ardern", List(101, 102)) 63 | val d = Doc(1, Some(c), Map.empty, "path", List(n1), List.empty) 64 | val d2 = augment(d) 65 | log.debug(s"d2 = $d2") 66 | d2.ner.size should be(2) 67 | val expected = Ner(n1.posEnd + 4, n1.posEnd + 5, n1.offEnd + 18, n1.offEnd + 20, 1.0, "37", "AGE", "D61AGE", n1.extRef) 68 | assert(d2.ner.contains(expected)) 69 | } 70 | 71 | it should "associate an age only with the last preceeding PERSON" in { 72 | val c = "Frederick Bloggs CEO and Jacinda Ardern, future PM, aged 37, were selected." 73 | val n1 = mkNer(c, "Jacinda Ardern", List(101, 102)) 74 | val n2 = mkNer(c, "Frederick Bloggs", List(201, 202)) 75 | val d = Doc(1, Some(c), Map.empty, "path", List(n1, n2), List.empty) 76 | val d2 = augment(d) 77 | log.debug(s"d2 = $d2") 78 | d2.ner.size should be(3) 79 | val expected = Ner(n1.posEnd + 4, n1.posEnd + 5, n1.offEnd + 18, n1.offEnd + 20, 1.0, "37", "AGE", "D61AGE", n1.extRef) 80 | assert(d2.ner.contains(expected)) 81 | } 82 | } -------------------------------------------------------------------------------- /dataFusion-util/src/test/scala/au/csiro/data61/dataFusion/util/EmailTest.scala: -------------------------------------------------------------------------------- 1 | package au.csiro.data61.dataFusion.util 2 | 3 | import org.scalatest.{ FlatSpec, Matchers } 4 | 5 | import com.typesafe.scalalogging.Logger 6 | 7 | import au.csiro.data61.dataFusion.common.Data._ 8 | 9 | class EmailTest extends FlatSpec with Matchers { 10 | private val log = Logger(getClass) 11 | 12 | val text = """ 13 | Some junk before 14 | the headers 15 | From: Ardern Jacinda (Wellington) 16 | 17 | To: Bloggs Frederick (Akaroa); Smith 18 | Michael (Ekatahuna); Walters Roger (Pink Floyd) 19 | 20 | Cc: Zealand New (Aotearoa) 21 | 22 | Bcc: Peters Winston (Wellington) 23 | 24 | Sent: Today 25 | 26 | Subject: Forming Government 27 | in an MMP System 28 | 29 | It's Labour ... and Prime Minister Jacinda Ardern. 30 | 31 | New Zealand First has crowned Ardern the next prime minister with its decision to back a Labour-led government, which will also need the Green Party to govern. 32 | 33 | Ardern will claim the top job after only two and a-half months as Labour leader - and follows her former mentor Helen Clark into the top job. 
34 | 35 | """ 36 | 37 | "Email.toNer" should "find names" in { 38 | val extRef = Some(ExtRef("Jacinda Ardern", List(1, 2))) 39 | val gazNer = List(Ner(7, 9, 36, 50, 1.0, "Jacinda Ardern", T_PERSON2, GAZ , extRef)) 40 | val ners = Email.toNer(Email.extRefNer(gazNer), _ => 1.0)(text).toList 41 | for (n <- ners) log.debug(s"ner = $n") 42 | val expected = Seq( 43 | Ner(7, 10, 36, 63, 1.0, "Ardern Jacinda (Wellington)", "FROM", "D61EMAIL" ,extRef), 44 | Ner(14, 17, 96, 121, 1.0, "Smith\nMichael (Ekatahuna)", "TO", "D61EMAIL", None) 45 | ) 46 | for (e <- expected) assert(ners.contains(e)) 47 | } 48 | 49 | } -------------------------------------------------------------------------------- /dataFusion-util/src/test/scala/au/csiro/data61/dataFusion/util/HitsTest.scala: -------------------------------------------------------------------------------- 1 | package au.csiro.data61.dataFusion.util 2 | 3 | import org.scalatest.{ FlatSpec, Matchers } 4 | 5 | import com.typesafe.scalalogging.Logger 6 | 7 | import au.csiro.data61.dataFusion.common.Data._ 8 | 9 | class HitsTest extends FlatSpec with Matchers { 10 | val log = Logger(getClass) 11 | 12 | val id = 31L 13 | val extRef = ExtRef("Jane", List(123L)) 14 | val score = 9.876f 15 | val typ = "PERSON" 16 | val path = "path" 17 | val content = "I saw SARAH ANNE JONES here!" 18 | 19 | // case class Embedded(content: Option[String], meta: Map[String, String], ner: List[Ner]) 20 | val emb = Embedded(Some(content), Map.empty, List.empty) 21 | 22 | // case class Doc(id: Long, content: Option[String], meta: Map[String, String], path: String, ner: List[Ner], embedded: List[Embedded]) 23 | val doc = Doc(id, Some(content), Map.empty, path, List.empty, List(emb)) 24 | 25 | // case class PosInfo(posStr: Int, posEnd: Int, offStr: Int, offEnd: Int) 26 | val pi = PosInfo(1, 4, doc.content.get.indexOf("SARAH"), doc.content.get.indexOf(" here!")) 27 | 28 | val expected = Ner(pi.posStr, pi.posEnd, pi.offStr, pi.offEnd, score, content.substring(pi.offStr, pi.offEnd), typ, GAZ, Some(extRef)) 29 | 30 | "augment" should "add hit to doc.ner" in { 31 | // case class LPosDoc(idEmbIdx: IdEmbIdx, posInfos: List[PosInfo]) 32 | val lPosDoc = LPosDoc(IdEmbIdx(id, EMB_IDX_MAIN), List(pi)) 33 | val hits = Seq(PHits(Stats(0, 0), List(lPosDoc), None, extRef, score, typ)) 34 | 35 | val augment: Doc => Doc = Hits.augment(Hits.hitsMap(hits.iterator)) 36 | val doc2 = augment(doc) 37 | log.debug(s"doc2 = $doc2") 38 | doc2.ner.size should be(1) 39 | doc2.ner(0) should be(expected) 40 | doc2.embedded.size should be(1) 41 | doc2.embedded(0).ner.size should be(0) 42 | } 43 | 44 | it should "add hit to doc.embedded.ner" in { 45 | // case class LPosDoc(idEmbIdx: IdEmbIdx, posInfos: List[PosInfo]) 46 | val lPosDoc = LPosDoc(IdEmbIdx(id, 0), List(pi)) 47 | val hits = Seq(PHits(Stats(0, 0), List(lPosDoc), None, extRef, 9.876f, "PERSON")) 48 | 49 | val augment: Doc => Doc = Hits.augment(Hits.hitsMap(hits.iterator)) 50 | val doc2 = augment(doc) 51 | log.debug(s"doc2 = $doc2") 52 | doc2.ner.size should be(0) 53 | doc2.embedded.size should be(1) 54 | doc2.embedded(0).ner.size should be(1) 55 | doc2.embedded(0).ner(0) should be(expected) 56 | } 57 | 58 | "termFreq" should "count terms" in { 59 | Hits.termFreq("Aaron H Aaron") should be(Map("aaron" -> 2, "h" -> 1)) 60 | Hits.qTermFreq("Aaron H Aaron", T_PERSON) should be(Some(Map("aaron" -> 2, "h" -> 1))) 61 | Hits.qTermFreq("Aaron H Aaron", T_ORGANIZATION) should be(None) 62 | Hits.qTermFreq("Aaron H Bloggs", T_PERSON) should be(None) 63 | } 64 | 65 | def 
mkNer(offStr: Int, offEnd: Int, typ: String) = Ner(0, 0, offStr, offEnd, 1.0, "text", typ, GAZ, None) 66 | 67 | "filterPer2" should "filter PERSON2 within PERSON" in { 68 | val p = (0 until 3).map(i => mkNer(10 * i, 10 * i + 6, T_PERSON)) 69 | val p2 = Seq(mkNer(2, 6, T_PERSON2), mkNer(20, 24, T_PERSON2), mkNer(26, 28, T_PERSON2)) 70 | val x = Hits.filterPer2(p ++ p2) 71 | x should be(p :+ p2(2)) // (0) & (1) are filtered out 72 | log.debug(s"x = $x") 73 | } 74 | 75 | } -------------------------------------------------------------------------------- /dataFusion-util/src/test/scala/au/csiro/data61/dataFusion/util/ProximityTest.scala: -------------------------------------------------------------------------------- 1 | package au.csiro.data61.dataFusion.util 2 | 3 | import org.scalatest.{ FlatSpec, Matchers } 4 | 5 | import com.typesafe.scalalogging.Logger 6 | 7 | import au.csiro.data61.dataFusion.common.Data.{ Doc, Embedded, ExtRef, GAZ, Ner, T_PERSON } 8 | import scala.collection.JavaConverters._ 9 | 10 | 11 | class ProximityTest extends FlatSpec with Matchers { 12 | val log = Logger(getClass) 13 | 14 | val offStr = 80 15 | val cli = Main.defaultCliOption 16 | val dist = (cli.decay/5).toInt 17 | val weight = Math.exp(-dist/cli.decay) 18 | 19 | // case class Ner(posStr: Int, posEnd: Int, offStr: Int, offEnd: Int, score: Double, text: String, typ: String, impl: String, extRefId: Option[List[Long]]) 20 | val ner1 = Ner(0, 0, offStr, 0, 1.0, "text", T_PERSON, GAZ, Some(ExtRef("Fred", List(1, 3)))) 21 | val ner2 = Ner(0, 0, offStr + dist, 0, 1.0, "text", T_PERSON, GAZ, Some(ExtRef("Jane", List(2, 4)))) 22 | val ners = List(ner1, ner2) 23 | 24 | 25 | "Proximity" should "find close ners" in { 26 | // case class Doc(id: Long, content: Option[String], meta: Map[String, String], path: String, ner: List[Ner], embedded: List[Embedded]) 27 | val doc = Doc(0, Some("text"), Map.empty, "path", ners, List.empty) 28 | val prox = new Proximity(cli, n => n.iterator) 29 | prox.accDoc(doc) 30 | for (x <- prox.nodeMap.values.asScala) log.info(s"$x") 31 | for (x <- prox.edgeMap.entrySet.asScala) log.info(s"$x") 32 | prox.nodeMap.size should be(2) 33 | prox.edgeMap.size should be(1) 34 | prox.edgeMap.get((0,1)) should be(Map("UNKNOWN" -> (weight, 1))) 35 | 36 | prox.accDoc(doc) 37 | for (x <- prox.nodeMap.values.asScala) log.info(s"$x") 38 | for (x <- prox.edgeMap.entrySet.asScala) log.info(s"$x") 39 | prox.nodeMap.size should be(2) 40 | prox.edgeMap.size should be(1) 41 | prox.edgeMap.get((0,1)) should be(Map("UNKNOWN" -> (2*weight, 2))) 42 | 43 | // case class Embedded(content: Option[String], meta: Map[String, String], ner: List[Ner]) 44 | val emb = Embedded(Some("text"), Map.empty, ners) 45 | val doc2 = doc.copy(embedded = List(emb)) 46 | prox.accDoc(doc2) 47 | for (x <- prox.nodeMap.values.asScala) log.info(s"$x") 48 | for (x <- prox.edgeMap.entrySet.asScala) log.info(s"$x") 49 | prox.nodeMap.size should be(2) 50 | prox.edgeMap.size should be(1) 51 | prox.edgeMap.get((0,1)) should be(Map("UNKNOWN" -> (4*weight, 4))) 52 | } 53 | 54 | } -------------------------------------------------------------------------------- /docker/.dockerignore: -------------------------------------------------------------------------------- 1 | Dockerfile-ubuntu 2 | Dockerfile-centos 3 | -------------------------------------------------------------------------------- /docker/Dockerfile-centos: -------------------------------------------------------------------------------- 1 | FROM centos:latest 2 | 3 | ENV LANGUAGE=en 4 | ENV 
LC_ALL=C 5 | ENV LANG=C 6 | 7 | # openblas and build tools are to build MITIE (used by dataFusion-ner) 8 | # graphviz is for dependency graphs generated as part of the sbt build 9 | # libreoffice, tesseract and ImageMagick are used by dataFusion-tika (and its unit tests) 10 | 11 | RUN yum -y groupinstall 'Development Tools' && \ 12 | yum -y install https://dl.fedoraproject.org/pub/epel/epel-release-latest-7.noarch.rpm && \ 13 | yum -y install openblas-devel cmake swig git java-1.8.0-openjdk-devel graphviz libreoffice tesseract ImageMagick 14 | 15 | RUN curl https://bintray.com/sbt/rpm/rpm > /etc/yum.repos.d/bintray-sbt-rpm.repo && \ 16 | yum -y install sbt 17 | 18 | ENTRYPOINT ["bash"] 19 | -------------------------------------------------------------------------------- /docker/Dockerfile-ubuntu: -------------------------------------------------------------------------------- 1 | FROM ubuntu:latest 2 | 3 | ENV LANGUAGE=en 4 | ENV LC_ALL=C 5 | ENV LANG=C 6 | ENV JAVA_HOME=/usr/lib/jvm/java-8-openjdk-amd64/jre 7 | 8 | # openblas and build tools are to build MITIE (used by dataFusion-ner) 9 | # graphviz is for dependency graphs generated as part of the sbt build 10 | # libreoffice, tesseract and imagemagick are used by dataFusion-tika (and its unit tests) 11 | 12 | RUN apt-get update && \ 13 | apt-get install -y gnupg curl unzip libopenblas-dev build-essential gfortran cmake swig git openjdk-8-jdk graphviz libreoffice tesseract-ocr tesseract-ocr-eng imagemagick && \ 14 | update-alternatives --set java /usr/lib/jvm/java-8-openjdk-amd64/jre/bin/java 15 | 16 | RUN echo "deb https://dl.bintray.com/sbt/debian /" > /etc/apt/sources.list.d/sbt.list && \ 17 | apt-key adv --keyserver hkp://keyserver.ubuntu.com:80 --recv 2EE0EA64E40A89B84B2DF73499E82A75642AC823 && \ 18 | apt-get update && \ 19 | apt-get install -y sbt 20 | 21 | ENTRYPOINT ["bash"] 22 | -------------------------------------------------------------------------------- /images/JSONFormatsUML.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/data61/dataFusion/4b766ae65254eed6c5a095f3bb8b6b42b9500a9c/images/JSONFormatsUML.png -------------------------------------------------------------------------------- /images/dataFusion.zargo: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/data61/dataFusion/4b766ae65254eed6c5a095f3bb8b6b42b9500a9c/images/dataFusion.zargo -------------------------------------------------------------------------------- /images/datafusion.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/data61/dataFusion/4b766ae65254eed6c5a095f3bb8b6b42b9500a9c/images/datafusion.png -------------------------------------------------------------------------------- /images/network.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/data61/dataFusion/4b766ae65254eed6c5a095f3bb8b6b42b9500a9c/images/network.png -------------------------------------------------------------------------------- /project/build.properties: -------------------------------------------------------------------------------- 1 | sbt.version=0.13.16 2 | -------------------------------------------------------------------------------- /project/plugins.sbt: -------------------------------------------------------------------------------- 1 | addSbtPlugin("io.spray" % "sbt-revolver" % "0.8.0") 2 | 3 | 
addSbtPlugin("com.typesafe.sbteclipse" % "sbteclipse-plugin" % "5.1.0") 4 | 5 | addSbtPlugin("org.scala-sbt.plugins" % "sbt-onejar" % "0.8") 6 | 7 | addSbtPlugin("net.virtual-void" % "sbt-dependency-graph" % "0.8.2") 8 | 9 | addSbtPlugin("com.typesafe.sbt" % "sbt-license-report" % "1.2.0") 10 | 11 | addSbtPlugin("com.github.gseitz" % "sbt-release" % "1.0.5") 12 | -------------------------------------------------------------------------------- /sh/dfus: -------------------------------------------------------------------------------- 1 | #! /bin/bash 2 | 3 | # error if these vars not set 4 | : ${DFUS_DIR:?must be set to the location of the dataFusion source tree cloned from from https://github.com/data61/dataFusion/. \"source sh/setenv\" from the dataFusion directory to set the environment.} 5 | : ${SCALA_VER:?must be set to the Scala version. \"source sh/setenv\" from the dataFusion directory to set the environment.} 6 | : ${DFUS_VER:?must be set to the dataFusion version. \"source sh/setenv\" from the dataFusion directory to set the environment.} 7 | 8 | while getopts ":hm:s:" opt; do 9 | case $opt in 10 | m) 11 | HEAP=-Xmx${OPTARG}G 12 | ;; 13 | s) 14 | STACK=-Xss${OPTARG}M 15 | ;; 16 | h) 17 | cat <&2 24 | exit -1 25 | ;; 26 | \?) 27 | echo "Invalid option: -$OPTARG" >&2 28 | exit -2 29 | ;; 30 | esac 31 | done 32 | 33 | eval cmd=\$$OPTIND 34 | shift $OPTIND 35 | # echo $cmd "$@" 36 | 37 | getJar() { 38 | echo ${DFUS_DIR}/dataFusion-${1}/target/scala-${SCALA_VER}/datafusion-${1}_${SCALA_VER}-${DFUS_VER}-one-jar.jar 39 | } 40 | 41 | java $HEAP $STACK -jar `getJar $cmd` "$@" 42 | -------------------------------------------------------------------------------- /sh/setenv.centos: -------------------------------------------------------------------------------- 1 | #! 
/not/to/be/execed 2 | 3 | # used by sh/dfus 4 | export DFUS_DIR=${PWD} 5 | export SCALA_VER=2.12 6 | export DFUS_VER=1.1-SNAPSHOT 7 | 8 | # needed by dataFusion-ner (including sbt tests) 9 | export LD_LIBRARY_PATH=${DFUS_DIR}/dataFusion-ner/MITIE-native/centos # directory containing libjavamitie.so 10 | export NER_MITIE_ENGLISH_MODEL=${DFUS_DIR}/dataFusion-ner/MITIE-models/english/ner_model.dat 11 | # export NER_MITIE_SPANISH_MODEL=${DFUS_DIR}/dataFusion-ner/MITIE-models/spanish/ner_model.dat 12 | 13 | SEARCH_DIR=${DFUS_DIR}/dataFusion-search 14 | export SEARCH_SYNONYMS=${SEARCH_DIR}/synonyms.txt 15 | export SEARCH_DOC_INDEX=${SEARCH_DIR}/docIndex 16 | export SEARCH_META_INDEX=${SEARCH_DIR}/metaIndex 17 | export SEARCH_NER_INDEX=${SEARCH_DIR}/nerIndex 18 | 19 | export PATH=${DFUS_DIR}/sh:/usr/sbin:/usr/bin:/sbin:/bin 20 | 21 | cat < -1 && !forceClose) { 35 | cn = cn.slice(0, closedIdx) 36 | } else { 37 | cn = cn + " closed" 38 | } 39 | el.parentNode.className = cn 40 | } 41 | 42 | let clickers = Array.prototype.slice.call(document.querySelectorAll(".graph-form h2")) 43 | 44 | clickers.forEach((clicker, idx, arr) => { 45 | clicker.onclick = function() { 46 | return function (el, idx) { 47 | let others = clickers.filter((el, currIdx, arr) => idx !== currIdx) 48 | others.forEach(clicker => toggleClosedClass(clicker, true)) 49 | toggleClosedClass(el, false) 50 | }(this, idx) 51 | } 52 | }) 53 | 54 | // Form actions 55 | document.querySelector("#fetchForm").onsubmit = evt => { 56 | evt.preventDefault() 57 | getGraph() 58 | } 59 | 60 | document.querySelector("#optsForm").onsubmit = evt => { 61 | evt.preventDefault() 62 | drawGraph() 63 | } 64 | -------------------------------------------------------------------------------- /ui/network/graph.css: -------------------------------------------------------------------------------- 1 | input[type=text] { 2 | width: 60px; 3 | } 4 | -------------------------------------------------------------------------------- /ui/network/index.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 15 | 16 |
17 | 18 |
19 | collections: 20 | 21 | 22 | 23 | 24 | 25 |
26 | 27 |
28 | distance range: 29 | 30 | 31 | 32 |
33 | node radius range: 34 | 35 | 36 | 37 |
38 | edge width range: 39 | 40 | 41 | 42 |
43 | 44 | 45 | 46 |
47 | 48 |
49 | 50 | 51 | 52 | 53 |
54 | 55 | 56 | -------------------------------------------------------------------------------- /version.sbt: -------------------------------------------------------------------------------- 1 | version in ThisBuild := "1.1-SNAPSHOT" 2 | --------------------------------------------------------------------------------