├── .gitignore ├── README.md ├── assembly.sbt ├── build.sbt ├── project ├── Dependencies.scala ├── assembly.sbt ├── build.properties └── plugins.sbt ├── sbt └── src └── main ├── resources └── reference.conf └── scala └── de └── knutwalker └── dbpedia ├── Import.scala ├── components ├── GraphComponent.scala ├── HandlerComponent.scala ├── ImporterComponent.scala ├── MetricsComponent.scala ├── ParserComponent.scala └── SettingsComponent.scala ├── disruptor ├── StatementEvent.scala ├── StatementEventHandler.scala └── StatementEventProducer.scala ├── impl ├── ConfigSettingsComponent.scala ├── DefaultHandlerComponent.scala ├── DefaultImporterComponent.scala ├── DefaultMetricsComponent.scala ├── DefaultParserComponent.scala ├── DisruptorImporterComponent.scala └── FastBatchGraphComponent.scala ├── util ├── Tx.scala └── itertools.scala └── wire.scala
/.gitignore: -------------------------------------------------------------------------------- 1 | *# 2 | *.iml 3 | *.ipr 4 | *.iws 5 | *.pyc 6 | *.tm.epoch 7 | *.vim 8 | */project/boot 9 | */project/build/target 10 | */project/project.target.config-classes 11 | *-shim.sbt 12 | *~ 13 | .#* 14 | .*.swp 15 | .DS_Store 16 | .cache 17 | .classpath 18 | .codefellow 19 | .ensime* 20 | .eprj 21 | .history 22 | .idea 23 | .manager 24 | .multi-jvm 25 | .project 26 | .scala_dependencies 27 | .scalastyle 28 | .settings 29 | .tags 30 | .tags_sorted_by_file 31 | .target 32 | .worksheet 33 | Makefile 34 | TAGS 35 | lib_managed 36 | logs 37 | project/boot/* 38 | project/plugins/project 39 | src_managed 40 | target 41 | tm*.lck 42 | tm*.log 43 | tm.out 44 | worker*.log 45 | /bin 46 | *.db 47 | sbt-launch.jar 48 | dbpedia-neo4j.jar 49 | --------------------------------------------------------------------------------
/README.md: -------------------------------------------------------------------------------- 1 | DBpedia Batch Importer for Neo4j 2 | ================================ 3 | 4 | Imports [DBpedia](http://dbpedia.org/) dumps into [Neo4j](http://www.neo4j.org/) 5 | 6 | This importer aims for fast import times; less restrictive (and thus less performant) implementations might follow. 7 | In particular, it can only perform a full load on an offline, ideally empty, database. 8 | Importing the same file in two executions might result in duplicated nodes or a corrupt database. 9 | 10 | 11 | ## Quickstart 12 | 13 | ```bash 14 | ./sbt "run dewiki-20140320-article-categories.nt.gz" 15 | ``` 16 | 17 | 18 | ## Usage 19 | 20 | ### Requirements 21 | 22 | - JDK7 (preferably the Oracle JDK) 23 | - (SBT and Scala 2.11) - these will be pulled in automatically if you do not have them installed 24 | 25 | 26 | _1_. Build the project 27 | 28 | ```bash 29 | ./sbt assembly 30 | ``` 31 | 32 | _2_. Run the jar against one or more dump files. Always import all files in a single run; otherwise the database will contain incorrect data 33 | 34 | ```bash 35 | ./dbpedia-neo4j.jar ./dewiki-20140320-article-categories.nt.gz 36 | ``` 37 | 38 | _3_. Specify run options 39 | For example, you can change the Neo4j database directory with the `dbpedia.db-dir` system property and give the JVM more memory to run with: 40 | 41 | ```bash 42 | java -server -Xms1g -Xmx8g -Ddbpedia.db-dir=import.db -jar dbpedia-neo4j.jar {article_categories,category_labels,instance_types,labels,skos_categories}_de.nt.bz2 43 | ``` 44 | 45 | 46 | ## Input files 47 | 48 | The importer can work with regular `nt` or `nq` files, or decompress `gz` or `bz2` archives on the fly (plain-text files are the fastest, of course).
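For illustration, here is a minimal sketch of how such on-the-fly decompression can be detected. It mirrors the magic-byte check that `DefaultParserComponent` (see below) performs via commons-compress; the helper name `openMaybeCompressed` is made up for this example:

```scala
import java.io.{ BufferedInputStream, FileInputStream, InputStream, PushbackInputStream }
import org.apache.commons.compress.compressors.bzip2.BZip2CompressorInputStream
import org.apache.commons.compress.compressors.gzip.GzipCompressorInputStream

// Peek at the first bytes and pick a decompressor based on the magic number.
// The `true` flag makes both decompressors handle concatenated streams,
// such as those produced by pigz or pbzip2.
def openMaybeCompressed(fileName: String): InputStream = {
  val in  = new PushbackInputStream(new BufferedInputStream(new FileInputStream(fileName)), 3)
  val buf = new Array[Byte](3)
  val n   = in.read(buf)
  if (n > 0) in.unread(buf, 0, n)
  if (GzipCompressorInputStream.matches(buf, n)) new GzipCompressorInputStream(in, true)
  else if (BZip2CompressorInputStream.matches(buf, n)) new BZip2CompressorInputStream(in, true)
  else in // plain nt/nq text
}
```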
49 | It helps a lot if the files are ordered by subject, so that all statements for one particular subject come in one block; otherwise the data might be incorrect (especially in regard to the `rdf:type` and `rdfs:label` handling). 50 | 51 | 52 | ## Data model 53 | 54 | Generally, a triple will be stored as `(s)-[p]->(o)` with the following additions: 55 | - the subject gets the label `:Resource` or `:BNode` 56 | - the object gets the label `:Resource`, `:BNode`, or `:Literal` 57 | - the relationship gets a type that is equivalent to the predicate's URI 58 | - `:Resource`s have a `uri` property and `:Literal`s a `value` property 59 | 60 | On top of that, the following changes are implemented: 61 | - relationships of `rdf:type` will be discarded and the particular object URI will be added as a label to the subject 62 | - relationships of `rdfs:label` or `skos:prefLabel` will be discarded and the particular object literal will be added as a `value` property to the subject (when there is more than one such label, currently only one of them actually gets stored) 63 | 64 | That means you can use the `uri` property as a unique identifier and the `value` property as a human-readable display value (which also makes for great node captions in the Neo4j browser UI). 65 | 66 | The importer can create a schema index for later use; see [Configuration](#configuration). 67 | 68 | 69 | ## Configuration 70 | 71 | The importer uses [Typesafe Config](https://github.com/typesafehub/config#overview) for configuration; for details on how to write the config, please see their documentation. 72 | 73 | The following keys are used; see [reference.conf](src/main/resources/reference.conf) for details: 74 | - `dbpedia.db-dir` 75 | - `dbpedia.tx-size` 76 | - `dbpedia.approx-resources` 77 | - `dbpedia.deferred-index` 78 | 79 | The most important setting is probably `db-dir`, the location of the database directory for Neo4j. 80 | To be precise, this must match your `org.neo4j.server.database.location` property from `conf/neo4j-server.properties`. 81 | 82 | If you set `dbpedia.deferred-index` to true, the importer will create schema indices on `:Resource(uri)` and `:Literal(value)`. 83 | These indices will not (and cannot) be used during the import itself; they only spare you from having to remember to add them yourself later on. 84 | 85 | 86 | ## Under the hood 87 | 88 | The importer uses the [batch API](http://docs.neo4j.org/chunked/2.0.2/batchinsert.html) with all its implications, such as using only a single thread and no transactions. 89 | The importer maintains an in-memory map of URI <-> node ID mappings (the node cache), which prevents multiple resource nodes for the same URI. 90 | This cache is ephemeral and will be destroyed when the importer terminates. No index, lookup, or cache that actually goes to the database is used. 91 | This is also the reason that you have to do everything in one go. 92 | 93 | 94 | ## Metrics 95 | 96 | The importer logs several runtime metrics through the excellent [metrics library](http://metrics.codahale.com/). 97 | Meters for inserted nodes and relationships are reported during the process; a full report is given at the end.
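All of the timings below come from named timers that wrap the individual operations. As a minimal sketch of the pattern (taken from `DefaultMetricsComponent` further down; the standalone registry setup around it is assumed here for illustration):

```scala
import java.util.concurrent.Callable
import com.codahale.metrics.MetricRegistry

val registry = new MetricRegistry

// A timer doubles as a meter, which is where the count/rate and the
// percentile columns in the report below come from.
def time[A](name: String)(f: ⇒ A): A =
  registry.timer(name).time(new Callable[A] { def call(): A = f })

time("create-rel") {
  // e.g. inserter.createRelationship(...)
}
```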
98 | Here's an example output: 99 | 100 | [nodes]: count=2800168 rate=17652.78/s 101 | [rels]: count=5953013 rate=37528.85/s 102 | [triples]: count=11975105 rate=75493.05/s 103 | [create-rel]: count=5953013 rate=37807.85/s [0.00ms, 0.01ms] ~0.00ms ±0.00ms p95=0.00ms p99=0.01ms p999=0.01ms 104 | [create-resource]: count=2800168 rate=17781.77/s [0.00ms, 0.09ms] ~0.00ms ±0.00ms p95=0.01ms p99=0.01ms p999=0.09ms 105 | [graph-tx]: count=498 rate=3.15/s [83.61ms, 2586.21ms] ~281.81ms ±271.38ms p95=681.58ms p99=1828.87ms p999=2586.21ms 106 | [import]: count=1 rate=0.01/s [151272.85ms, 151272.85ms] ~151272.85ms ±0.00ms p95=151272.85ms p99=151272.85ms p999=151272.85ms 107 | [lookup-resource]: count=10930877 rate=69411.97/s [0.00ms, 0.01ms] ~0.00ms ±0.00ms p95=0.00ms p99=0.00ms p999=0.01ms 108 | [shutdown]: count=1 rate=0.14/s [7243.15ms, 7243.15ms] ~7243.15ms ±0.00ms p95=7243.15ms p99=7243.15ms p999=7243.15ms 109 | [subject-nodes]: count=4977864 rate=31606.02/s [0.00ms, 0.81ms] ~0.02ms ±0.04ms p95=0.04ms p99=0.15ms p999=0.80ms 110 | [update-resource]: count=2328089 rate=20607.95/s [0.01ms, 0.75ms] ~0.03ms ±0.06ms p95=0.03ms p99=0.33ms p999=0.75ms 111 | 112 | - `rels` and `nodes` count whenever a relationship or a node, respectively, is added to the database 113 | - `triples` counts how many triples were processed during the import, even the ones that did not result in new nodes or relationships 114 | - `create-rel` measures how long it takes to create a new relationship. The count should be equivalent to `rels` 115 | - `create-resource` measures how long it takes to create a resource node 116 | - `create-literal` measures how long it takes to create a literal node 117 | - `create-bnode` measures how long it takes to create a blank node. These last three should add up to `nodes` 118 | - `subject-nodes` measures how long it takes to handle all statements for a particular subject 119 | - `lookup-resource` measures the lookup time in the cache to determine whether or not a resource already exists in the graph 120 | - `lookup-bnode` measures the lookup time in the cache to determine whether or not a blank node already exists in the graph 121 | - `update-resource` measures how long it takes to update labels and properties of an already existing resource 122 | - `graph-tx` measures the time of one "transaction", i.e. one batch of `tx-size` distinct subjects 123 | - `import` measures how long the actual import process takes 124 | - `shutdown` measures how long the shutdown process after the import takes. This includes flushing to disk and potentially creating schema indices.
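To make the "Under the hood" section above more concrete: the node cache is nothing but an in-memory map from URI to node ID. A minimal sketch of the lookup-or-create pattern, using the same HPPC primitive map as `FastBatchGraphComponent` (the surrounding helper is made up for illustration):

```scala
import com.carrotsearch.hppc.ObjectLongOpenHashMap

// pre-sized with dbpedia.approx-resources; missing keys are encoded as -1L
// so lookups on the hot path avoid any Option/boxing overhead
val resources = new ObjectLongOpenHashMap[String](3000000)

def getOrCreate(uri: String)(create: ⇒ Long): Long = {
  val cached = resources.getOrDefault(uri, -1L)
  if (cached != -1L) cached
  else {
    val id = create // e.g. inserter.createNode(properties, labels)
    resources.put(uri, id)
    id
  }
}
```

Since this cache lives only on the heap and dies with the process, all files have to be imported in the same run, as stated above.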
125 | 126 | ## Credits 127 | 128 | Credit goes to [@zazi](https://github.com/zazi) and [@sojoner](https://github.com/sojoner) for outlining the data model definition and possibly creating the need for this importer. 129 | --------------------------------------------------------------------------------
/assembly.sbt: -------------------------------------------------------------------------------- 1 | import AssemblyKeys._ 2 | 3 | assemblySettings 4 | 5 | jarName in assembly := s"${name.value}.jar" 6 | 7 | outputPath in assembly := baseDirectory.value / (jarName in assembly).value 8 | 9 | mainClass in assembly := Some("de.knutwalker.dbpedia.Import") 10 | 11 | assemblyOption in assembly ~= { _.copy(prependShellScript = Some(Seq( 12 | "#!/usr/bin/env sh", 13 | """JAVA_OPTS="-server -d64 -Xms4G -Xmx4G -XX:NewRatio=5 -XX:+UseConcMarkSweepGC -XX:+UseParNewGC -XX:ParallelCMSThreads=4 -XX:+CMSParallelRemarkEnabled -XX:+CMSIncrementalMode -XX:+CMSIncrementalPacing -XX:CMSIncrementalDutyCycle=10 -XX:CMSFullGCsBeforeCompaction=1 ${JAVA_OPTS}"""", 14 | """IMPORT_OPTS="${IMPORT_OPTS:-}"""", 15 | """exec java $JAVA_OPTS $IMPORT_OPTS -jar "$0" "$@""""))) } 16 | 17 | mergeStrategy in assembly <<= (mergeStrategy in assembly) { (old) => { 18 | case x @ PathList("META-INF", xs @ _*) => 19 | (xs map (_.toLowerCase)) match { 20 | case ("changes.txt" :: Nil) | ("licenses.txt" :: Nil) => MergeStrategy.rename 21 | case _ => old(x) 22 | } 23 | case "CHANGES.txt" | "LICENSE.txt" => MergeStrategy.rename 24 | case nt if nt.endsWith(".gz") => MergeStrategy.discard 25 | case nt if nt.endsWith(".bz2") => MergeStrategy.discard 26 | case nt if nt.endsWith(".nt") => MergeStrategy.discard 27 | case x => old(x) 28 | }} 29 | --------------------------------------------------------------------------------
/build.sbt: -------------------------------------------------------------------------------- 1 | import scalariform.formatter.preferences._ 2 | 3 | name := """dbpedia-neo4j""" 4 | 5 | version := "1.0" 6 | 7 | scalaVersion := Version.scala 8 | 9 | resolvers ++= List( 10 | "Local Maven Repository" at "file://"+Path.userHome.absolutePath+"/.m2/repository", 11 | "NxParser" at "https://mvnrepository.com/artifact/org.semanticweb.yars/nxparser" 12 | ) 13 | 14 | scalacOptions ++= Seq( 15 | "-feature", 16 | "-unchecked", 17 | "-deprecation", 18 | "-Xlint", 19 | "-Ywarn-dead-code", 20 | "-target:jvm-1.7", 21 | "-encoding", "UTF-8" 22 | ) 23 | 24 | libraryDependencies ++= Dependencies.dbpedia 25 | 26 | net.virtualvoid.sbt.graph.Plugin.graphSettings 27 | 28 | Revolver.settings 29 | 30 | javaOptions in Revolver.reStart += "-Xmx8g" 31 | 32 | scalariformSettings 33 | 34 | ScalariformKeys.preferences := ScalariformKeys.preferences.value 35 | .setPreference(AlignSingleLineCaseStatements, true) 36 | .setPreference(DoubleIndentClassDeclaration, true) 37 | .setPreference(RewriteArrowSymbols, true) 38 | --------------------------------------------------------------------------------
/project/Dependencies.scala: -------------------------------------------------------------------------------- 1 | import sbt._ 2 | 3 | object Version { 4 | val scala = "2.11.0" 5 | val compress = "1.8" 6 | val config = "1.2.0" 7 | val disruptor = "3.2.1" 8 | val hppc = "0.6.0" 9 | val logback = "1.1.2" 10 | val metrics = "3.0.2" 11 | val neo4j = "2.0.3" 12 | val nxParser = "1.2.10" 13 | val slf4j = "1.7.7" 14 | } 15 | 16 | object Library { 17 | val neo4jExcludes = List( 18 |
ExclusionRule("org.neo4j", "neo4j-cypher-compiler-2.0"), 19 | ExclusionRule("org.neo4j", "neo4j-cypher-compiler-3.1"), 20 | ExclusionRule("org.neo4j", "neo4j-cypher"), 21 | ExclusionRule("org.neo4j", "neo4j-udc"), 22 | ExclusionRule("org.neo4j", "neo4j-graph-algo"), 23 | ExclusionRule("org.neo4j", "neo4j-graph-matching"), 24 | ExclusionRule("org.scala-lang", "scala-library") 25 | ) 26 | 27 | val compress = "org.apache.commons" % "commons-compress" % Version.compress 28 | val config = "com.typesafe" % "config" % Version.config 29 | val disruptor = "com.lmax" % "disruptor" % Version.disruptor 30 | val hppc = "com.carrotsearch" % "hppc" % Version.hppc 31 | val logback = "ch.qos.logback" % "logback-classic" % Version.logback exclude("org.slf4j", "slf4j-api") 32 | val metrics = "com.codahale.metrics" % "metrics-core" % Version.metrics exclude("org.slf4j", "slf4j-api") 33 | val neo4j = "org.neo4j" % "neo4j" % Version.neo4j excludeAll (neo4jExcludes: _*) 34 | val nxParser = "org.semanticweb.yars" % "nxparser" % Version.nxParser 35 | val slf4j = "org.slf4j" % "slf4j-api" % Version.slf4j 36 | } 37 | 38 | object Dependencies { 39 | 40 | import Library._ 41 | 42 | val dbpedia = List( 43 | compress, 44 | config, 45 | disruptor, 46 | hppc, 47 | logback, 48 | metrics, 49 | neo4j, 50 | nxParser, 51 | slf4j 52 | ) 53 | } 54 | -------------------------------------------------------------------------------- /project/assembly.sbt: -------------------------------------------------------------------------------- 1 | addSbtPlugin("com.eed3si9n" % "sbt-assembly" % "0.11.2") 2 | -------------------------------------------------------------------------------- /project/build.properties: -------------------------------------------------------------------------------- 1 | sbt.version=0.13.2 2 | -------------------------------------------------------------------------------- /project/plugins.sbt: -------------------------------------------------------------------------------- 1 | addSbtPlugin("net.virtual-void" % "sbt-dependency-graph" % "0.7.4") 2 | 3 | addSbtPlugin("io.spray" % "sbt-revolver" % "0.7.2") 4 | 5 | addSbtPlugin("com.typesafe.sbt" % "sbt-scalariform" % "1.3.0") 6 | -------------------------------------------------------------------------------- /sbt: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | sbtver=0.13.2 4 | sbtjar=sbt-launch.jar 5 | sbtsha128=d3237161dc38afd796d9e84ff202f8418cff98e2 6 | 7 | sbtrepo=http://repo.typesafe.com/typesafe/ivy-releases/org.scala-sbt/sbt-launch 8 | 9 | if [ ! -f $sbtjar ]; then 10 | echo "downloading $sbtjar" 1>&2 11 | if ! curl --location --silent --fail --remote-name $sbtrepo/$sbtver/$sbtjar; then 12 | exit 1 13 | fi 14 | fi 15 | 16 | checksum=`openssl dgst -sha1 $sbtjar | awk '{ print $2 }'` 17 | if [ "$checksum" != $sbtsha128 ]; then 18 | echo "bad $sbtjar. delete $sbtjar and run $0 again." 19 | exit 1 20 | fi 21 | 22 | [ -f ~/.sbtconfig ] && . 
~/.sbtconfig 23 | 24 | java -ea \ 25 | $SBT_OPTS \ 26 | $JAVA_OPTS \ 27 | -Djava.net.preferIPv4Stack=true \ 28 | -XX:+AggressiveOpts \ 29 | -XX:+UseParNewGC \ 30 | -XX:+UseConcMarkSweepGC \ 31 | -XX:+CMSParallelRemarkEnabled \ 32 | -XX:+CMSClassUnloadingEnabled \ 33 | -XX:MaxPermSize=1024m \ 34 | -XX:SurvivorRatio=128 \ 35 | -XX:MaxTenuringThreshold=0 \ 36 | -Xss8M \ 37 | -Xms512M \ 38 | -Xmx1G \ 39 | -server \ 40 | -jar $sbtjar "$@" 41 | --------------------------------------------------------------------------------
/src/main/resources/reference.conf: -------------------------------------------------------------------------------- 1 | dbpedia { 2 | 3 | # The location of the database directory for Neo4j 4 | # does not necessarily need to end in .db 5 | db-dir = "graph.db" 6 | 7 | # The size of one "transaction"; for the batch importer this is just a processing batch, not a real transaction 8 | # The size does not denote the number of statements in one transaction 9 | # but the number of distinct subjects 10 | tx-size = 10000 11 | 12 | # an estimate (the more accurate, the better) of how many resources 13 | # will be created during the process 14 | approx-resources = 3000000 15 | 16 | # if true, create a schema index during the shutdown phase 17 | # this will be costly at the end, but has no effect on the actual import performance 18 | deferred-index = false 19 | } 20 | --------------------------------------------------------------------------------
/src/main/scala/de/knutwalker/dbpedia/Import.scala: -------------------------------------------------------------------------------- 1 | package de.knutwalker.dbpedia 2 | 3 | 4 | object Import extends ParallelBatchImportComponent 5 | object SerialImport extends SerialBatchImportComponent 6 | --------------------------------------------------------------------------------
/src/main/scala/de/knutwalker/dbpedia/components/GraphComponent.scala: -------------------------------------------------------------------------------- 1 | package de.knutwalker.dbpedia.components 2 | 3 | import org.neo4j.graphdb.{ DynamicLabel, Label, RelationshipType } 4 | import scala.collection.immutable.Seq 5 | import scala.util.Try 6 | 7 | trait GraphComponent { 8 | this: SettingsComponent with MetricsComponent ⇒ 9 | 10 | type NodeType 11 | 12 | def graph: Graph 13 | 14 | object Labels { 15 | val subj: Label = DynamicLabel.label("Subject") 16 | val pred: Label = DynamicLabel.label("Predicate") 17 | val obj: Label = DynamicLabel.label("Object") 18 | val bNode: Label = DynamicLabel.label("BNode") 19 | val resource: Label = DynamicLabel.label("Resource") 20 | val literal: Label = DynamicLabel.label("Literal") 21 | val bNodeType: Label = DynamicLabel.label("BNodeType") 22 | val resourceType: Label = DynamicLabel.label("ResourceType") 23 | } 24 | 25 | object Properties { 26 | val uri = "uri" 27 | val value = "value" 28 | val nodeType = "nodeType" 29 | val dataType = "dataType" 30 | } 31 | 32 | trait Graph { 33 | 34 | def DB_PATH = settings.graphDbDir 35 | 36 | private def timeGetBNode(subject: String) = metrics.time("lookup-bnode") { 37 | getBNode(subject) 38 | } 39 | 40 | private def timeCreateBNode(subject: String, labels: Seq[Label], dynamicLabels: Seq[String]) = metrics.time("create-bnode") { 41 | val r = createBNode(subject, labels, dynamicLabels) 42 | metrics.nodeAdded() 43 | r 44 | } 45 | 46 | private def timeGetResource(uri: String) = metrics.time("lookup-resource") { 47 | getResourceNode(uri) 48 | } 49 | 50 | private def timeCreateResource(uri: String, values: Seq[String], labels: Seq[Label],
dynamicLabels: Seq[String]) = metrics.time("create-resource") { 51 | val r = createResourceNode(uri, values, labels, dynamicLabels) 52 | metrics.nodeAdded() 53 | r 54 | } 55 | 56 | private def timeUpdateResource(id: NodeType, uri: String, values: Seq[String], labels: Seq[Label], dynamicLabels: Seq[String]) = metrics.time("update-resource") { 57 | updateResourceNode(id, uri, values, labels, dynamicLabels) 58 | } 59 | 60 | private def timeCreateLiteral(literal: String, labels: Seq[String]) = metrics.time("create-literal") { 61 | val r = createLiteralNode(literal, labels) 62 | metrics.nodeAdded() 63 | r 64 | } 65 | 66 | private def timeCreateRelationship(src: NodeType, dest: NodeType, rel: String, relType: RelationshipType) = metrics.time("create-rel") { 67 | val r = createRelationship(src, dest, rel, relType) 68 | metrics.relAdded() 69 | r 70 | } 71 | 72 | final def getOrCreateBNode(subject: String, labels: Seq[Label] = Nil, dynamicLabels: Seq[String] = Nil) = 73 | timeGetBNode(subject).getOrElse(timeCreateBNode(subject, labels, dynamicLabels)) 74 | 75 | final def getOrCreateResource(uri: String, values: Seq[String], labels: Seq[Label] = Nil, dynamicLabels: Seq[String] = Nil) = 76 | timeGetResource(uri).getOrElse(timeCreateResource(uri, values, labels, dynamicLabels)) 77 | 78 | final def getAndUpdateResource(uri: String, values: Seq[String], labels: Seq[Label], dynamicLabels: Seq[String]): NodeType = { 79 | timeGetResource(uri) match { 80 | case Some(id) ⇒ timeUpdateResource(id, uri, values, labels, dynamicLabels) 81 | case None ⇒ timeCreateResource(uri, values, labels, dynamicLabels) 82 | } 83 | } 84 | 85 | final def createLiteral(literal: String, labels: Seq[String]) = timeCreateLiteral(literal, labels) 86 | 87 | final def createRel(src: NodeType, dest: NodeType, rel: String, relType: RelationshipType) = 88 | timeCreateRelationship(src, dest, rel, relType) 89 | 90 | protected def getBNode(subject: String): Option[NodeType] 91 | 92 | protected def createBNode(subject: String, labels: Seq[Label], dynamicLabels: Seq[String]): NodeType 93 | 94 | protected def getResourceNode(uri: String): Option[NodeType] 95 | 96 | protected def createResourceNode(uri: String, values: Seq[String], labels: Seq[Label], dynamicLabels: Seq[String]): NodeType 97 | 98 | protected def updateResourceNode(id: NodeType, uri: String, values: Seq[String], labels: Seq[Label], dynamicLabels: Seq[String]): NodeType 99 | 100 | protected def createLiteralNode(literal: String, labels: Seq[String]): NodeType 101 | 102 | protected def createRelationship(src: NodeType, dest: NodeType, rel: String, relType: RelationshipType): Unit 103 | 104 | def withinTx[A](body: ⇒ A): Try[A] 105 | 106 | def subjectAdded(): Unit 107 | 108 | def shutdown(): Unit 109 | } 110 | } 111 | -------------------------------------------------------------------------------- /src/main/scala/de/knutwalker/dbpedia/components/HandlerComponent.scala: -------------------------------------------------------------------------------- 1 | package de.knutwalker.dbpedia.components 2 | 3 | import org.semanticweb.yars.nx.Node 4 | 5 | trait HandlerComponent { 6 | import ParserComponent._ 7 | 8 | def handler: Handler 9 | 10 | trait Handler { 11 | 12 | def handleStatements(subject: Node, nodes: Statements): Unit 13 | 14 | def apply(nodesBatches: Iterator[Seq[Statements]]): Unit 15 | 16 | def shutdown(): Unit 17 | } 18 | } 19 | -------------------------------------------------------------------------------- 
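As a usage illustration for the `Graph` API defined above: a hypothetical component (not part of this repository) that persists one resource-to-resource triple as `(s)-[p]->(o)` could look like this:

```scala
package de.knutwalker.dbpedia.components

import org.neo4j.graphdb.DynamicRelationshipType

// Hypothetical sketch only: store a single triple via the Graph API,
// following the data model described in the README.
trait TripleExampleComponent {
  this: GraphComponent ⇒

  def storeResourceTriple(s: String, p: String, o: String): Unit = {
    val src = graph.getOrCreateResource(s, Nil, Labels.resource :: Nil)
    val dst = graph.getOrCreateResource(o, Nil, Labels.resource :: Nil)
    graph.createRel(src, dst, p, DynamicRelationshipType.withName(p))
  }
}
```

The real `DefaultHandlerComponent` below does essentially this, but additionally caches the `RelationshipType` instances and reroutes `rdf:type` and label statements as described in the README.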
/src/main/scala/de/knutwalker/dbpedia/components/ImporterComponent.scala: -------------------------------------------------------------------------------- 1 | package de.knutwalker.dbpedia.components 2 | 3 | trait ImporterComponent { 4 | this: MetricsComponent with ParserComponent with HandlerComponent with SettingsComponent ⇒ 5 | 6 | def importer: Importer 7 | 8 | trait Importer { 9 | 10 | def apply(fileNames: Array[String], txSize: Int, p: Parser, h: Handler): Unit 11 | 12 | def apply(fileNames: Array[String]): Unit = { 13 | val p = parser 14 | val h = handler 15 | val txSize = settings.txSize 16 | 17 | sys.addShutdownHook(h.shutdown()) 18 | 19 | val elapsed = metrics.start() 20 | 21 | apply(fileNames, txSize, p, h) 22 | 23 | elapsed() 24 | 25 | h.shutdown() 26 | p.shutdown() 27 | 28 | metrics.reportAll() 29 | metrics.shutdown() 30 | } 31 | } 32 | } 33 | -------------------------------------------------------------------------------- /src/main/scala/de/knutwalker/dbpedia/components/MetricsComponent.scala: -------------------------------------------------------------------------------- 1 | package de.knutwalker.dbpedia.components 2 | 3 | trait MetricsComponent { 4 | 5 | def metrics: Metrics 6 | 7 | trait Metrics { 8 | 9 | def tripleAdded(n: Long = 1): Unit 10 | 11 | def nodeAdded(): Unit 12 | 13 | def relAdded(): Unit 14 | 15 | def nodeUpdated(): Unit 16 | 17 | def time[A](name: String)(f: ⇒ A): A 18 | 19 | def start(): () ⇒ Long 20 | 21 | def report(): Unit 22 | 23 | def reportAll(): Unit 24 | 25 | def shutdown(): Unit 26 | } 27 | } 28 | -------------------------------------------------------------------------------- /src/main/scala/de/knutwalker/dbpedia/components/ParserComponent.scala: -------------------------------------------------------------------------------- 1 | package de.knutwalker.dbpedia.components 2 | 3 | import org.semanticweb.yars.nx.Node 4 | import scala.collection.immutable.Seq 5 | 6 | trait ParserComponent { 7 | import ParserComponent._ 8 | 9 | def parser: Parser 10 | 11 | trait Parser { 12 | 13 | def apply(fileName: String): Iterator[Statement] 14 | 15 | def shutdown(): Unit 16 | } 17 | } 18 | 19 | object ParserComponent { 20 | type Statement = Array[Node] 21 | type Statements = Seq[Statement] 22 | } 23 | -------------------------------------------------------------------------------- /src/main/scala/de/knutwalker/dbpedia/components/SettingsComponent.scala: -------------------------------------------------------------------------------- 1 | package de.knutwalker.dbpedia.components 2 | 3 | trait SettingsComponent { 4 | 5 | def settings: Settings 6 | 7 | case class Settings(graphDbDir: String, txSize: Int, approximatedResources: Int, createDeferredIndices: Boolean) 8 | } 9 | -------------------------------------------------------------------------------- /src/main/scala/de/knutwalker/dbpedia/disruptor/StatementEvent.scala: -------------------------------------------------------------------------------- 1 | package de.knutwalker.dbpedia.disruptor 2 | 3 | import com.lmax.disruptor.EventFactory 4 | import de.knutwalker.dbpedia.components.ParserComponent._ 5 | import org.semanticweb.yars.nx.{ BNode, Node } 6 | 7 | class StatementEvent(var subject: Node, var statements: Statements) { 8 | override def toString = subject.toString 9 | } 10 | 11 | object StatementEvent extends EventFactory[StatementEvent] { 12 | private[this] final val empty: Node = new BNode("") 13 | 14 | def apply(subject: Node, statements: Statements) = 15 | new StatementEvent(subject, statements) 16 | 17 | 
def newInstance() = apply(empty, Nil) 18 | } 19 | --------------------------------------------------------------------------------
/src/main/scala/de/knutwalker/dbpedia/disruptor/StatementEventHandler.scala: -------------------------------------------------------------------------------- 1 | package de.knutwalker.dbpedia.disruptor 2 | 3 | import com.lmax.disruptor.EventHandler 4 | import de.knutwalker.dbpedia.components.HandlerComponent 5 | 6 | class StatementEventHandler(handler: HandlerComponent#Handler) extends EventHandler[StatementEvent] { 7 | def onEvent(event: StatementEvent, sequence: Long, endOfBatch: Boolean) = { 8 | handler.handleStatements(event.subject, event.statements) 9 | } 10 | } 11 | 12 | object StatementEventHandler { 13 | def apply[T <: HandlerComponent](handler: T#Handler) = new StatementEventHandler(handler) 14 | } 15 | --------------------------------------------------------------------------------
/src/main/scala/de/knutwalker/dbpedia/disruptor/StatementEventProducer.scala: -------------------------------------------------------------------------------- 1 | package de.knutwalker.dbpedia.disruptor 2 | 3 | import com.lmax.disruptor.{ EventTranslatorTwoArg, RingBuffer } 4 | import de.knutwalker.dbpedia.components.ParserComponent._ 5 | import org.semanticweb.yars.nx.Node 6 | 7 | class StatementEventProducer(ringBuffer: RingBuffer[StatementEvent]) { 8 | import StatementEventProducer.translator 9 | 10 | def apply(subject: Node, statements: Statements) = ringBuffer.publishEvent(translator, subject, statements) 11 | } 12 | 13 | object StatementEventProducer { 14 | val translator = new EventTranslatorTwoArg[StatementEvent, Node, Statements] { 15 | def translateTo(event: StatementEvent, sequence: Long, arg0: Node, arg1: Statements) = { 16 | event.subject = arg0 17 | event.statements = arg1 18 | } 19 | } 20 | } 21 | --------------------------------------------------------------------------------
/src/main/scala/de/knutwalker/dbpedia/impl/ConfigSettingsComponent.scala: -------------------------------------------------------------------------------- 1 | package de.knutwalker.dbpedia.impl 2 | 3 | import com.typesafe.config.ConfigFactory 4 | import de.knutwalker.dbpedia.components.SettingsComponent 5 | 6 | trait ConfigSettingsComponent extends SettingsComponent { 7 | 8 | val settings: Settings = fromConfig() 9 | 10 | private def fromConfig() = { 11 | val config = { 12 | val c = ConfigFactory.load() 13 | c.checkValid(ConfigFactory.defaultReference(), "dbpedia") 14 | c getConfig "dbpedia" 15 | } 16 | 17 | Settings( 18 | config getString "db-dir", 19 | config getInt "tx-size", 20 | config getInt "approx-resources", 21 | config getBoolean "deferred-index" 22 | ) 23 | } 24 | } 25 | --------------------------------------------------------------------------------
/src/main/scala/de/knutwalker/dbpedia/impl/DefaultHandlerComponent.scala: -------------------------------------------------------------------------------- 1 | package de.knutwalker.dbpedia.impl 2 | 3 | import de.knutwalker.dbpedia.components.{ SettingsComponent, ParserComponent, MetricsComponent, GraphComponent, HandlerComponent } 4 | import org.neo4j.graphdb.{ DynamicRelationshipType, RelationshipType } 5 | import org.semanticweb.yars.nx.namespace.{ SKOS, RDFS, RDF } 6 | import org.semanticweb.yars.nx.{ Literal, Node, Resource, BNode } 7 | import scala.collection.{Seq => GSeq} 8 | import scala.collection.immutable.Seq 9 | import scala.collection.mutable 10 | import scala.util.{ Success, Failure, Try } 11 | 12 | trait DefaultHandlerComponent extends
HandlerComponent { 13 | this: GraphComponent with MetricsComponent with SettingsComponent ⇒ 14 | 15 | import ParserComponent._ 16 | 17 | val handler: Handler = new DefaultHandler 18 | 19 | private final class DefaultHandler extends Handler { 20 | 21 | private val PREFLABEL = new Resource(SKOS.NS + "prefLabel") 22 | 23 | private val rels = new mutable.AnyRefMap[String, RelationshipType](100) 24 | 25 | private def relTypeFor(name: String): RelationshipType = { 26 | rels.getOrElseUpdate(name, DynamicRelationshipType.withName(name)) 27 | } 28 | 29 | def isLabel(p: Node) = p == RDFS.LABEL || p == PREFLABEL 30 | 31 | private def handleSubject(subject: Resource, labels: Seq[String], values: Seq[String]): NodeType = { 32 | graph.getAndUpdateResource(subject.toString, values, Labels.resource :: Nil, labels) 33 | } 34 | 35 | private def handleResource(resource: Resource): NodeType = { 36 | graph.getOrCreateResource(resource.toString, Nil, Labels.resource :: Nil, Nil) 37 | } 38 | 39 | private def createBNode(subject: BNode): NodeType = { 40 | graph.getOrCreateBNode(subject.toString, Labels.bNode :: Nil) 41 | } 42 | 43 | private def handleBNode(subject: BNode): NodeType = { 44 | graph.getOrCreateBNode(subject.toString, Labels.bNode :: Nil) 45 | } 46 | 47 | private def handleSubject(subject: Node, labels: Seq[String], values: Seq[String]): NodeType = subject match { 48 | case x: BNode ⇒ createBNode(x) 49 | case x: Resource ⇒ handleSubject(x, labels, values) 50 | } 51 | 52 | private def handleObject(obj: Literal): NodeType = { 53 | val value = obj.toString 54 | val tpe = Option(obj.getDatatype).toList.map(_.toString) 55 | 56 | graph.createLiteral(value, tpe) 57 | } 58 | 59 | private def handleObject(obj: Node): NodeType = obj match { 60 | case x: Literal ⇒ handleObject(x) 61 | case x: BNode ⇒ handleBNode(x) 62 | case x: Resource ⇒ handleResource(x) 63 | } 64 | 65 | private def handlePredicate(subj: NodeType, obj: NodeType, pred: String): Unit = { 66 | val relType = relTypeFor(pred) 67 | graph.createRel(subj, obj, pred, relType) 68 | } 69 | 70 | private def handleStatement(subjectNode: NodeType, nodes: Statement): Unit = { 71 | val predicate = nodes(1) 72 | val obj = nodes(2) 73 | 74 | val predicateName = predicate.toString 75 | val objNode = handleObject(obj) 76 | 77 | handlePredicate(subjectNode, objNode, predicateName) 78 | } 79 | 80 | private def handleStatements(subject: Node, labels: Seq[String], values: Seq[String], nodes: Statements): Unit = { 81 | val subjectNode = handleSubject(subject, labels, values) 82 | nodes.foreach(handleStatement(subjectNode, _)) 83 | } 84 | 85 | def handleStatements(subject: Node, nodes: Statements): Unit = metrics.time("subject-nodes") { 86 | val (rdfTypes, allStatements) = nodes.partition(_(1) == RDF.TYPE) 87 | val (labels, statements) = allStatements.partition(n ⇒ isLabel(n(1))) 88 | 89 | val rdfLabels = rdfTypes.map(_(2).toString) 90 | val schemaLabels = labels.map(_(2).toString) 91 | 92 | handleStatements(subject, rdfLabels, schemaLabels, statements) 93 | 94 | metrics.tripleAdded(nodes.length) 95 | 96 | graph.subjectAdded() 97 | } 98 | 99 | private def handleStatements(nodes: Statements): Unit = { 100 | val subject = nodes.head(0) 101 | handleStatements(subject, nodes) 102 | } 103 | 104 | private def handleBatch(batch: GSeq[Statements]): Try[Unit] = metrics.time("graph-tx") { 105 | graph.withinTx { 106 | batch.foreach(handleStatements) 107 | } 108 | } 109 | 110 | def apply(nodesBatches: Iterator[GSeq[Statements]]): Unit = { 111 | val it = nodesBatches 112 | while 
(it.hasNext) { 113 | handleBatch(it.next()) match { 114 | case Failure(e) ⇒ e.printStackTrace() 115 | case Success(x) ⇒ // metrics.report() 116 | } 117 | } 118 | } 119 | 120 | def shutdown(): Unit = { 121 | metrics.time("shutdown") { 122 | Try(graph.shutdown()) 123 | } 124 | } 125 | } 126 | } 127 | --------------------------------------------------------------------------------
/src/main/scala/de/knutwalker/dbpedia/impl/DefaultImporterComponent.scala: -------------------------------------------------------------------------------- 1 | package de.knutwalker.dbpedia.impl 2 | 3 | import de.knutwalker.dbpedia.components._ 4 | import de.knutwalker.dbpedia.util.itertools 5 | 6 | trait DefaultImporterComponent extends ImporterComponent { 7 | this: MetricsComponent with ParserComponent with HandlerComponent with SettingsComponent ⇒ 8 | 9 | val importer: Importer = new DefaultImporter 10 | 11 | private final class DefaultImporter extends Importer { 12 | 13 | def apply(fileNames: Array[String], txSize: Int, p: Parser, h: Handler) = { 14 | 15 | val statements = fileNames.toIterator.flatMap(p.apply) 16 | 17 | val grouped = itertools.groupIter(statements)(_(0)).grouped(txSize) 18 | 19 | h(grouped) 20 | } 21 | } 22 | } 23 | --------------------------------------------------------------------------------
/src/main/scala/de/knutwalker/dbpedia/impl/DefaultMetricsComponent.scala: -------------------------------------------------------------------------------- 1 | package de.knutwalker.dbpedia.impl 2 | 3 | import com.codahale.metrics._ 4 | import de.knutwalker.dbpedia.components.MetricsComponent 5 | import java.util 6 | import java.util.Map.Entry 7 | import java.util.concurrent.{ TimeUnit, Callable } 8 | import org.slf4j.{ Logger, LoggerFactory } 9 | 10 | trait DefaultMetricsComponent extends MetricsComponent { 11 | 12 | val metrics: Metrics = CHMetrics 13 | 14 | private object CHMetrics extends Metrics { 15 | 16 | private final val reg = new MetricRegistry 17 | private final val reporter = { 18 | val r = MetricReporter(reg) 19 | r.start(1, TimeUnit.SECONDS) 20 | r 21 | } 22 | 23 | private final val triples = reg.meter("triples") 24 | private final val nodes = reg.meter("nodes") 25 | private final val rels = reg.meter("rels") 26 | private final val updates = reg.meter("updates") 27 | private final val importer = reg.timer("import") 28 | 29 | def time[A](name: String)(f: ⇒ A): A = { 30 | val timer = reg.timer(name) 31 | timer.time(new Callable[A] { 32 | def call() = f 33 | }) 34 | } 35 | 36 | def tripleAdded(n: Long) = triples.mark(n) 37 | 38 | def relAdded() = rels.mark() 39 | 40 | def nodeAdded() = nodes.mark() 41 | 42 | def nodeUpdated() = updates.mark() 43 | 44 | def start() = { 45 | val ctx = importer.time() 46 | () ⇒ ctx.stop() 47 | } 48 | 49 | def report() = reporter.report() 50 | 51 | def reportAll() = reporter.reportAll() 52 | 53 | def shutdown() = reporter.stop() 54 | } 55 | 56 | object MetricReporter { 57 | 58 | private final val durationUnit = TimeUnit.MILLISECONDS 59 | private final val durationFactor = 1.0 / durationUnit.toNanos(1) 60 | 61 | private final val rateUnit = TimeUnit.SECONDS 62 | private final val rateFactor = rateUnit.toSeconds(1) 63 | 64 | private final def duration(duration: Double) = f"${duration * durationFactor}%.2fms" 65 | 66 | private final def rate(rate: Double) = f"${rate * rateFactor}%.2f/s" 67 | 68 | def apply(registry: MetricRegistry, 69 | logger: Logger = LoggerFactory.getLogger("metrics"), 70 | filter: MetricFilter = MetricFilter.ALL): MetricReporter = { 71 | new
MetricReporter(registry, logger, rateUnit, durationUnit, filter) 72 | } 73 | 74 | private sealed trait MetricLogger[T <: Metric] { 75 | def text(metric: T): String 76 | } 77 | 78 | private object CounterLogger extends MetricLogger[Counter] { 79 | def text(metric: Counter) = s"count=${metric.getCount}" 80 | } 81 | 82 | private object GaugeLogger extends MetricLogger[Gauge[_]] { 83 | def text(metric: Gauge[_]) = s"value=${metric.getValue}" 84 | } 85 | 86 | private object HistorgramLogger extends MetricLogger[Histogram] { 87 | def text(metric: Histogram) = { 88 | val s = metric.getSnapshot 89 | s"count=${metric.getCount} min=${s.getMin} max=${s.getMax} mean=${s.getMean} p95=${s.get95thPercentile()} p99=${s.get99thPercentile()} p999=${s.get999thPercentile()}" 90 | } 91 | } 92 | 93 | private object MeterLogger extends MetricLogger[Meter] { 94 | def text(metric: Meter) = s"count=${metric.getCount} rate=${rate(metric.getMeanRate)}" 95 | } 96 | 97 | private object TimerLogger extends MetricLogger[Timer] { 98 | def text(metric: Timer) = { 99 | val s = metric.getSnapshot 100 | s"count=${metric.getCount} rate=${rate(metric.getMeanRate)} [${duration(s.getMin)}, ${duration(s.getMax)}] ~${duration(s.getMean)} ±${duration(s.getStdDev)} p95=${duration(s.get95thPercentile)} p99=${duration(s.get99thPercentile)} p999=${duration(s.get999thPercentile)}" 101 | } 102 | } 103 | 104 | private implicit val counterLogger = CounterLogger 105 | private implicit val gaugeLogger = GaugeLogger 106 | private implicit val histogramLogger = HistorgramLogger 107 | private implicit val meterLogger = MeterLogger 108 | private implicit val timerLogger = TimerLogger 109 | } 110 | 111 | class MetricReporter private (registry: MetricRegistry, 112 | logger: Logger, 113 | rateUnit: TimeUnit, 114 | durationUnit: TimeUnit, 115 | filter: MetricFilter) 116 | extends ScheduledReporter(registry, "dbpedia-reporter", filter, rateUnit, durationUnit) { 117 | 118 | import MetricReporter._ 119 | 120 | private final def empty[T <: Metric]: util.SortedMap[String, T] = { 121 | new util.TreeMap[String, T] 122 | } 123 | 124 | private final val gauges = empty[Gauge[_]] 125 | private final val counters = empty[Counter] 126 | private final val histograms = empty[Histogram] 127 | private final val timers = empty[Timer] 128 | private final val meters: util.SortedMap[String, Meter] = { 129 | val meters = new util.TreeMap[String, Meter] 130 | meters.put("nodes", registry.meter("nodes")) 131 | meters.put("rels", registry.meter("rels")) 132 | meters.put("updates", registry.meter("updates")) 133 | meters 134 | } 135 | 136 | def reportAll() = { 137 | report(registry.getGauges(filter), 138 | registry.getCounters(filter), 139 | registry.getHistograms(filter), 140 | registry.getMeters(filter), 141 | registry.getTimers(filter)) 142 | } 143 | 144 | override def report() = { 145 | report(gauges, counters, histograms, meters, timers) 146 | } 147 | 148 | def report(gauges: util.SortedMap[String, Gauge[_]], 149 | counters: util.SortedMap[String, Counter], 150 | histograms: util.SortedMap[String, Histogram], 151 | meters: util.SortedMap[String, Meter], 152 | timers: util.SortedMap[String, Timer]) = { 153 | import scala.collection.JavaConversions._ 154 | 155 | gauges.entrySet().foreach(logMetric[Gauge[_]]) 156 | counters.entrySet().foreach(logMetric[Counter]) 157 | histograms.entrySet().foreach(logMetric[Histogram]) 158 | meters.entrySet().foreach(logMetric[Meter]) 159 | timers.entrySet().foreach(logMetric[Timer]) 160 | } 161 | 162 | private def logMetric[M <: 
Metric](entry: Entry[String, M])(implicit ev: MetricLogger[M]): Unit = { 163 | logger.info(s"[{}]: ${ev.text(entry.getValue)}", entry.getKey) 164 | } 165 | } 166 | } 167 | -------------------------------------------------------------------------------- /src/main/scala/de/knutwalker/dbpedia/impl/DefaultParserComponent.scala: -------------------------------------------------------------------------------- 1 | package de.knutwalker.dbpedia.impl 2 | 3 | import de.knutwalker.dbpedia.components.ParserComponent 4 | import de.knutwalker.dbpedia.components.ParserComponent.Statement 5 | import java.io.{ Closeable, EOFException, PushbackInputStream, FileInputStream, BufferedInputStream, InputStream, File } 6 | import org.apache.commons.compress.compressors.bzip2.BZip2CompressorInputStream 7 | import org.apache.commons.compress.compressors.gzip.GzipCompressorInputStream 8 | import org.semanticweb.yars.nx.parser.NxParser 9 | import org.slf4j.LoggerFactory 10 | import scala.collection.convert.DecorateAsScala 11 | 12 | trait DefaultParserComponent extends ParserComponent { 13 | 14 | val parser: Parser = new DeflateParser 15 | 16 | private final class DeflateParser extends Parser with DecorateAsScala { 17 | 18 | private var streams: List[Closeable] = Nil 19 | 20 | private def addCloseable(is: InputStream): InputStream = { 21 | streams ::= is 22 | is 23 | } 24 | 25 | val logger = LoggerFactory.getLogger(classOf[ParserComponent]) 26 | 27 | def toFile(fileName: String): File = new File(fileName) 28 | 29 | def openInputStream(file: File): InputStream = new BufferedInputStream(new FileInputStream(file)) 30 | 31 | def loadResource(fileName: String): Option[InputStream] = Option(getClass.getResourceAsStream(fileName)) 32 | 33 | def open(fileName: String): Option[InputStream] = 34 | Some(toFile(fileName)). 35 | filter(_.exists()). 36 | map(openInputStream). 37 | orElse(loadResource(fileName). 38 | orElse(loadResource(s"/$fileName"))). 
39 | map(addCloseable) 40 | 41 | def peekBytes(stream: PushbackInputStream, n: Int): Array[Byte] = { 42 | val buf = new Array[Byte](n) 43 | val bytesRead = stream.read(buf) 44 | if (bytesRead == -1) throw new EOFException 45 | stream.unread(buf, 0, bytesRead) 46 | 47 | buf 48 | } 49 | 50 | /** using commons for concatenated stream, such as produced by pigz */ 51 | def gzip(stream: PushbackInputStream): Option[InputStream] = { 52 | 53 | val buf = peekBytes(stream, 2) 54 | if (GzipCompressorInputStream.matches(buf, 2)) { 55 | logger.info("using gzip encoding") 56 | Some(new GzipCompressorInputStream(stream, true)) 57 | } else None 58 | } 59 | 60 | /** using commons for concatenated stream, such as produced by pbzip2 */ 61 | def bzip2(stream: PushbackInputStream): Option[InputStream] = { 62 | 63 | val buf = peekBytes(stream, 3) 64 | if (BZip2CompressorInputStream.matches(buf, 3)) { 65 | logger.info("using bzip2 encoding") 66 | Some(new BZip2CompressorInputStream(stream, true)) 67 | } else None 68 | } 69 | 70 | def deflate(stream: InputStream): Option[InputStream] = { 71 | val pb = new PushbackInputStream(stream, 3) 72 | gzip(pb) orElse bzip2(pb) orElse Some(pb) map addCloseable 73 | } 74 | 75 | def nxParser(stream: InputStream) = new NxParser(stream) 76 | 77 | def readFile(fileName: String): Option[NxParser] = 78 | open(fileName).flatMap(deflate).map(nxParser) 79 | 80 | def nxIter(parser: NxParser): Iterator[Statement] = 81 | asScalaIteratorConverter(parser).asScala 82 | 83 | def apply(fileName: String): Iterator[Statement] = 84 | readFile(fileName).map(nxIter).getOrElse(Iterator.empty) 85 | 86 | def shutdown() = streams.foreach(_.close()) 87 | } 88 | } 89 | -------------------------------------------------------------------------------- /src/main/scala/de/knutwalker/dbpedia/impl/DisruptorImporterComponent.scala: -------------------------------------------------------------------------------- 1 | package de.knutwalker.dbpedia.impl 2 | 3 | import com.lmax.disruptor.BusySpinWaitStrategy 4 | import com.lmax.disruptor.dsl.{ ProducerType, Disruptor } 5 | import de.knutwalker.dbpedia.components.{ SettingsComponent, HandlerComponent, ParserComponent, MetricsComponent, ImporterComponent } 6 | import de.knutwalker.dbpedia.disruptor.{ StatementEvent, StatementEventProducer, StatementEventHandler } 7 | import de.knutwalker.dbpedia.util.itertools 8 | import java.util.concurrent.Executors 9 | import org.neo4j.helpers.NamedThreadFactory 10 | 11 | trait DisruptorImporterComponent extends ImporterComponent { 12 | this: MetricsComponent with ParserComponent with HandlerComponent with SettingsComponent ⇒ 13 | 14 | val importer: Importer = new DisruptorImporter 15 | 16 | private final class DisruptorImporter extends Importer { 17 | 18 | private val threadFactory = new NamedThreadFactory("disruptor") 19 | val waitStrategy = new BusySpinWaitStrategy 20 | // TODO: estimate 21 | private val bufferSize = 1 << 18 22 | 23 | def apply(fileNames: Array[String], txSize: Int, p: Parser, h: Handler) = { 24 | 25 | val executor = Executors.newSingleThreadExecutor(threadFactory) 26 | val disruptor = new Disruptor( 27 | StatementEvent, bufferSize, executor, ProducerType.SINGLE, waitStrategy) 28 | 29 | val eventHandler = StatementEventHandler(h) 30 | disruptor.handleEventsWith(eventHandler) 31 | disruptor.start() 32 | 33 | val ringBuffer = disruptor.getRingBuffer 34 | val producer = new StatementEventProducer(ringBuffer) 35 | 36 | val statements = fileNames.toIterator.flatMap(p.apply) 37 | val grouped = 
itertools.groupIter(statements)(_(0)) 38 | 39 | grouped.foreach { 40 | allStatements ⇒ 41 | val subject = allStatements.head(0) 42 | producer(subject, allStatements) 43 | } 44 | 45 | metrics.reportAll() 46 | 47 | disruptor.shutdown() 48 | executor.shutdown() 49 | } 50 | } 51 | } 52 | --------------------------------------------------------------------------------
/src/main/scala/de/knutwalker/dbpedia/impl/FastBatchGraphComponent.scala: -------------------------------------------------------------------------------- 1 | package de.knutwalker.dbpedia.impl 2 | 3 | import com.carrotsearch.hppc.{ ObjectLongMap, ObjectLongOpenHashMap } 4 | import de.knutwalker.dbpedia.components.{ MetricsComponent, SettingsComponent, GraphComponent } 5 | import java.util 6 | import java.io.File 7 | import org.neo4j.graphdb.{ DynamicLabel, Label, RelationshipType } 8 | import org.neo4j.helpers.collection.MapUtil 9 | import org.neo4j.unsafe.batchinsert.BatchInserters 10 | import scala.collection.JavaConverters._ 11 | import scala.collection.immutable.Seq 12 | import scala.collection.mutable 13 | import scala.util.Try 14 | 15 | trait FastBatchGraphComponent extends GraphComponent { 16 | this: SettingsComponent with MetricsComponent ⇒ 17 | 18 | type NodeType = Long 19 | 20 | lazy val graph: Graph = new FastBatchGraph 21 | 22 | private final class FastBatchGraph extends Graph { 23 | 24 | private val megs: Double = 1000 * 1000 25 | 26 | private def mem(n: Int) = f"${n / megs}%.0fM" 27 | 28 | private def inserterConfig = { 29 | val res = settings.approximatedResources 30 | 31 | // TODO: allow for fine grained settings 32 | val relsPerNode = 3 33 | val propsPerNode = 4 34 | 35 | // as per http://docs.neo4j.org/chunked/stable/configuration-caches.html 36 | val bytesPerNode = 14 37 | val bytesPerRel = 33 38 | val bytesPerProp = 42 39 | val bytesPerStringProp = 128 // might be totally off 40 | 41 | val nodes = res 42 | val relationships = nodes * relsPerNode 43 | val properties = nodes * propsPerNode 44 | val stringProperties = properties 45 | 46 | val nodesMem = mem(nodes * bytesPerNode) 47 | val relsMem = mem(relationships * bytesPerRel) 48 | val propsMem = mem(properties * bytesPerProp) 49 | val stringPropsMem = mem(stringProperties * bytesPerStringProp) 50 | 51 | MapUtil.stringMap( 52 | // TODO: make cache_type configurable 53 | "cache_type", "none", 54 | "use_memory_mapped_buffers", "true", 55 | "neostore.nodestore.db.mapped_memory", nodesMem, 56 | "neostore.relationshipstore.db.mapped_memory", relsMem, 57 | "neostore.propertystore.db.mapped_memory", propsMem, 58 | "neostore.propertystore.db.strings.mapped_memory", stringPropsMem, 59 | "neostore.propertystore.db.arrays.mapped_memory", "0M", 60 | "neostore.propertystore.db.index.keys.mapped_memory", "5M", 61 | "neostore.propertystore.db.index.mapped_memory", "5M" 62 | ) 63 | } 64 | 65 | private val inserter = { 66 | val config = inserterConfig 67 | BatchInserters.inserter(DB_PATH, config) 68 | } 69 | 70 | if (settings.createDeferredIndices) { 71 | inserter.createDeferredSchemaIndex(Labels.resource).on(Properties.uri).create() 72 | inserter.createDeferredSchemaIndex(Labels.literal).on(Properties.value).create() 73 | } 74 | 75 | private val labels = new mutable.AnyRefMap[String, Label](32) 76 | 77 | private val resources: ObjectLongMap[String] = new ObjectLongOpenHashMap(settings.approximatedResources) 78 | private val bnodes: ObjectLongMap[String] = new ObjectLongOpenHashMap(settings.txSize) 79 | 80 | private def getLabel(label: String): Label = { 81 |
labels.getOrElseUpdate(label, DynamicLabel.label(label)) 82 | } 83 | 84 | private def addValues(p: util.Map[String, AnyRef], vs: Seq[String]) = { 85 | vs.foreach(v ⇒ p.put(Properties.value, v)) 86 | } 87 | 88 | private def makeLabels(dynamicLabels: Seq[String]): Seq[Label] = dynamicLabels.map(getLabel) 89 | 90 | private def get(cache: ObjectLongMap[String], key: String): Option[Long] = { 91 | val n = cache.getOrDefault(key, -1) 92 | if (n == -1) None else Some(n) 93 | } 94 | 95 | private def set(cache: ObjectLongMap[String], key: String, properties: util.Map[String, AnyRef], labels: Seq[Label]): NodeType = { 96 | val n = inserter.createNode(properties, labels: _*) 97 | cache.put(key, n) 98 | n 99 | } 100 | 101 | private def props(k: String, v: AnyRef): java.util.Map[String, AnyRef] = { 102 | val p = new java.util.HashMap[String, AnyRef](1) 103 | p.put(k, v) 104 | p 105 | } 106 | 107 | protected def getBNode(subject: String) = get(bnodes, subject) 108 | 109 | protected def createBNode(subject: String, labels: Seq[Label], dynamicLabels: Seq[String]) = { 110 | val ls = labels ++ makeLabels(dynamicLabels) 111 | set(bnodes, subject, null, ls) 112 | } 113 | 114 | protected def getResourceNode(uri: String) = get(resources, uri) 115 | 116 | protected def createResourceNode(uri: String, values: Seq[String], labels: Seq[Label], dynamicLabels: Seq[String]) = { 117 | val p = props(Properties.uri, uri) 118 | addValues(p, values) 119 | val ls = labels ++ makeLabels(dynamicLabels) 120 | set(resources, uri, p, ls) 121 | } 122 | 123 | def updateResourceNode(id: Long, uri: String, values: Seq[String], labels: Seq[Label], dynamicLabels: Seq[String]) = { 124 | 125 | val labelsBefore = inserter.getNodeLabels(id).asScala.toList 126 | val newLabels = labels ++ makeLabels(dynamicLabels) 127 | 128 | inserter.setNodeLabels(id, (labelsBefore ++ newLabels).distinct: _*) 129 | 130 | val propsBefore = inserter.getNodeProperties(id) 131 | addValues(propsBefore, values) 132 | inserter.setNodeProperties(id, propsBefore) 133 | 134 | metrics.nodeUpdated() 135 | 136 | id 137 | } 138 | 139 | def createLiteralNode(literal: String, labels: Seq[String]) = { 140 | val p = props(Properties.value, literal) 141 | val ls = Labels.literal +: makeLabels(labels) 142 | val n = inserter.createNode(p, ls: _*) 143 | n 144 | } 145 | 146 | def createRelationship(src: Long, dest: Long, rel: String, relType: RelationshipType): Unit = { 147 | val p = props(Properties.uri, rel) 148 | inserter.createRelationship(src, dest, relType, p) 149 | } 150 | 151 | def subjectAdded() = () 152 | 153 | def withinTx[A](body: ⇒ A) = Try(body) 154 | 155 | def shutdown() = inserter.shutdown() 156 | } 157 | } 158 | -------------------------------------------------------------------------------- /src/main/scala/de/knutwalker/dbpedia/util/Tx.scala: -------------------------------------------------------------------------------- 1 | package de.knutwalker.dbpedia.util 2 | 3 | import org.neo4j.graphdb.GraphDatabaseService 4 | import scala.util.{ Failure, Success, Try } 5 | 6 | final class Tx(gdb: GraphDatabaseService) { 7 | 8 | def apply[T](body: GraphDatabaseService ⇒ T): Try[T] = { 9 | val tx = gdb.beginTx 10 | try { 11 | val ret = body(gdb) 12 | tx.success() 13 | Success(ret) 14 | } catch { 15 | case t: Throwable ⇒ Failure(t) 16 | } finally { 17 | tx.close() 18 | } 19 | } 20 | 21 | def map[T](body: GraphDatabaseService ⇒ T): Try[T] = apply(body) 22 | 23 | def flatMap[T](body: GraphDatabaseService ⇒ T): Try[T] = apply(body) 24 | 25 | def foreach[T](body: 
GraphDatabaseService ⇒ T): Unit = apply(body) 26 | } 27 | -------------------------------------------------------------------------------- /src/main/scala/de/knutwalker/dbpedia/util/itertools.scala: -------------------------------------------------------------------------------- 1 | package de.knutwalker.dbpedia.util 2 | 3 | import scala.collection.immutable.Seq 4 | import scala.collection.mutable.ListBuffer 5 | 6 | object itertools { 7 | def groupIter[T, K](iter: Iterator[T])(key: T ⇒ K): Iterator[Seq[T]] = new Iterator[Seq[T]] { 8 | val it = iter.buffered 9 | 10 | def hasNext = it.hasNext 11 | 12 | def next() = { 13 | val first = it.next() 14 | 15 | if (!it.hasNext) first :: Nil 16 | else { 17 | val firstKey = key(first) 18 | 19 | val buf = new ListBuffer[T] 20 | buf += first 21 | 22 | var nextKey = key(it.head) 23 | while (it.hasNext && firstKey == nextKey) { 24 | buf += it.next() 25 | 26 | if (it.hasNext) nextKey = key(it.head) 27 | } 28 | 29 | buf.result() 30 | } 31 | } 32 | } 33 | } 34 | -------------------------------------------------------------------------------- /src/main/scala/de/knutwalker/dbpedia/wire.scala: -------------------------------------------------------------------------------- 1 | package de.knutwalker.dbpedia 2 | 3 | import de.knutwalker.dbpedia.components.{ ImporterComponent, GraphComponent } 4 | import de.knutwalker.dbpedia.impl._ 5 | 6 | trait BaseImporter extends ConfigSettingsComponent 7 | with DefaultParserComponent 8 | with DefaultMetricsComponent 9 | with DefaultHandlerComponent { 10 | this: GraphComponent with ImporterComponent ⇒ 11 | 12 | def main(args: Array[String]) { 13 | importer(args) 14 | } 15 | } 16 | 17 | trait ParallelBatchImportComponent extends BaseImporter with FastBatchGraphComponent with DisruptorImporterComponent 18 | trait SerialBatchImportComponent extends BaseImporter with FastBatchGraphComponent with DefaultImporterComponent 19 | --------------------------------------------------------------------------------
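To close with an illustration of `itertools.groupIter`, since the correctness of the whole import hinges on its semantics: it groups only consecutive elements that share a key and never buffers more than one group, which is exactly why the README asks for dumps sorted by subject. The sample values below are made up; the printed result follows from the implementation above:

```scala
import de.knutwalker.dbpedia.util.itertools

object GroupIterDemo extends App {
  val statements = Iterator("s1" -> 1, "s1" -> 2, "s2" -> 3, "s1" -> 4)

  // grouping is streaming and purely consecutive: the stray ("s1", 4) at the
  // end starts a new group instead of being merged into the first one
  val groups = itertools.groupIter(statements)(_._1).toList
  println(groups)
  // List(List((s1,1), (s1,2)), List((s2,3)), List((s1,4)))
}
```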