├── .gitignore ├── .travis.yml ├── README.md ├── clean-install-run ├── common ├── pom.xml └── src │ └── main │ └── scala │ └── org │ └── dbpedia │ └── extraction │ └── util │ ├── HadoopConfigurable.scala │ └── RichHadoopPath.scala ├── download ├── pom.xml └── src │ ├── main │ ├── resources │ │ ├── application.conf │ │ └── reference.conf │ └── scala │ │ └── org │ │ └── dbpedia │ │ └── extraction │ │ ├── dump │ │ └── download │ │ │ ├── ActoredCounter.scala │ │ │ ├── DistDownload.scala │ │ │ ├── DistDownloadConfig.scala │ │ │ ├── DumpFileSource.scala │ │ │ └── actors │ │ │ ├── DownloadClient.scala │ │ │ ├── DownloadJobRunner.scala │ │ │ ├── DownloadProgressTracker.scala │ │ │ ├── DownloadResultConsumer.scala │ │ │ ├── Master.scala │ │ │ ├── Worker.scala │ │ │ └── message │ │ │ ├── DownloadJob.scala │ │ │ ├── DownloaderProgressMessage.scala │ │ │ ├── GeneralMessage.scala │ │ │ ├── MasterWorkerMessage.scala │ │ │ └── WorkerProgressMessage.scala │ │ └── util │ │ └── RemoteExecute.scala │ └── test │ └── resources │ ├── dist-download.properties │ └── download.properties ├── extraction ├── pom.xml └── src │ ├── main │ └── scala │ │ └── org │ │ ├── apache │ │ └── spark │ │ │ └── ui │ │ │ └── jobs │ │ │ └── DBpediaJobProgressListener.scala │ │ └── dbpedia │ │ └── extraction │ │ ├── destinations │ │ ├── DistDeduplicatingWriterDestination.scala │ │ ├── DistDestination.scala │ │ ├── DistMarkerDestination.scala │ │ └── DistWrapperDestination.scala │ │ ├── dump │ │ └── extract │ │ │ ├── DistConfig.scala │ │ │ ├── DistConfigLoader.scala │ │ │ ├── DistExtraction.scala │ │ │ ├── DistExtractionJob.scala │ │ │ └── DumpExtractionContextWrapper.scala │ │ ├── mappings │ │ ├── DistDisambiguations.scala │ │ └── DistRedirects.scala │ │ ├── spark │ │ ├── io │ │ │ ├── QuadSeqWritable.scala │ │ │ ├── WikiPageWritable.scala │ │ │ ├── input │ │ │ │ ├── ByteMatcher.scala │ │ │ │ ├── DBpediaWikiPageInputFormat.scala │ │ │ │ └── SeekableInputStream.scala │ │ │ └── output │ │ │ │ ├── DBpediaCompositeOutputFormat.scala │ │ │ │ ├── DBpediaDatasetOutputFormat.scala │ │ │ │ └── MultipleTextOutputFormat.scala │ │ └── serialize │ │ │ ├── KryoExtractionRegistrator.scala │ │ │ ├── KryoSerializationWrapper.scala │ │ │ ├── KryoSerializer.scala │ │ │ ├── LanguageSerializer.scala │ │ │ ├── LocaleSerializer.scala │ │ │ ├── LoggerSerializer.scala │ │ │ ├── ParserUtilsSerializer.scala │ │ │ ├── WikiPageSerializer.scala │ │ │ └── WikiTitleSerializer.scala │ │ └── util │ │ ├── DistIOUtils.scala │ │ └── SparkUtils.scala │ └── test │ ├── resources │ ├── config.properties │ ├── data │ │ └── enwiki │ │ │ └── 20160407 │ │ │ └── enwiki-20160407-pages-articles-multistream.xml.bz2 │ └── dist-config.properties │ └── scala │ └── org │ └── dbpedia │ └── extraction │ ├── mappings │ └── DistRedirectsTest.scala │ └── spark │ └── io │ ├── QuadSeqWritableTest.scala │ ├── WikiPageWritableTest.scala │ └── WritableTest.scala ├── gce ├── README.md └── spark_gce.py ├── install-run ├── ontology.owl ├── ontology.xml ├── pom.xml ├── run └── run-extraction-test /.gitignore: -------------------------------------------------------------------------------- 1 | target/ 2 | .idea/ 3 | *~ 4 | *.iml 5 | .cache 6 | *.log 7 | *.lck 8 | *.tmp 9 | java_pid* 10 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | sudo: false 2 | language: scala 3 | jdk: 4 | - oraclejdk8 5 | - oraclejdk7 6 | - openjdk7 7 | # branches: 8 | # only: 9 | # - master 10 | 
before_install:
11 | - sed -i.bak -e 's|https://nexus.codehaus.org/snapshots/|https://oss.sonatype.org/content/repositories/codehaus-snapshots/|g' ~/.m2/settings.xml
12 | script: "mvn test"
13 | notifications:
14 | email:
15 | recipients:
16 | - riteshoneinamillion@gmail.com
17 | on_success: change
18 | on_failure: change
19 | cache:
20 | directories:
21 | - $HOME/.m2
22 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | DBpedia Distributed Extraction Framework
2 | ==================================
3 |
4 | ## We are looking for maintainer(s) for this project
5 |
6 | If you want to join, express your interest!
7 |
8 | ## Description
9 |
10 | This is the distributed version of the [DBpedia Information Extraction Framework](https://github.com/dbpedia/extraction-framework/). It uses [Apache Spark](http://spark.apache.org) to extract structured data from Wikipedia in a parallel, distributed manner.
11 |
12 | This is currently a work in progress, and the instructions are mostly intended for developers.
13 |
14 | ## Requirements
15 | * Java 7
16 | * Maven 3
17 | * Apache Spark 0.9.1 built with Apache Hadoop 2.2.0
18 |
19 | ## Setup Apache Spark
20 |
21 | ```bash
22 | $ wget http://d3kbcqa49mib13.cloudfront.net/spark-0.9.1-bin-hadoop2.tgz
23 | $ tar xzf spark-0.9.1-bin-hadoop2.tgz
24 | $ cd spark-0.9.1-bin-hadoop2
25 | $ SCALA_HOME=/usr/share/java MAVEN_OPTS="-Xmx2g -XX:MaxPermSize=512M -XX:ReservedCodeCacheSize=512m" mvn -Dhadoop.version=2.2.0 -Dprotobuf.version=2.5.0 -DskipTests clean package
26 | ```
27 |
28 | Replace SCALA_HOME according to your machine settings. Maven needs enough memory to compile Spark successfully.
29 |
30 | Add the hostnames of your slave nodes (after having downloaded Spark to all nodes) to conf/slaves. Running on a cluster requires a number of additional configuration steps, such as ensuring that the firewall allows traffic on the required ports, setting up passwordless SSH access between the master and slave nodes, setting up HDFS and formatting the NameNode, etc. Usually, in a cluster of N nodes you would run the Spark Master and Hadoop's NameNode on one node and the Spark Workers and Hadoop DataNodes on the remaining N-1 nodes.
31 |
32 | Here's a sample `spark-env.sh` for a cluster where the slaves have 4 cores and 15G RAM each:
33 | ```bash
34 | export SCALA_HOME=/usr/share/java
35 | export SPARK_MEM=2500m
36 | export SPARK_WORKER_CORES=1
37 | export SPARK_WORKER_INSTANCES=4
38 | SPARK_JAVA_OPTS+=" -Dspark.local.dir=/mnt/spark"
39 | export SPARK_JAVA_OPTS
40 | export SPARK_MASTER_IP=192.168.0.100
41 | export JAVA_HOME=/usr/lib/jvm/java-1.7.0-openjdk-1.7.0.65.x86_64
42 | ```
43 |
44 | **Important**: Note that we have set the number of cores (threads) per worker to 1 and the number of workers equal to the number of cores on the machine. This is because:
45 | * The implementation that Hadoop uses to decode bzip2 files - `CBZip2InputStream` - is not thread-safe (there's a JIRA for that: https://issues.apache.org/jira/browse/HADOOP-10614). This means that allotting multiple threads to a single worker while using .bz2 input files will cause the jobs to fail.
46 | * Running multiple JVMs rather than a single huge JVM often improves performance.
47 |
48 | While running tests we have found that setting `spark.executor.memory` to 2500m-3000m works well with the above sample configuration.
This property is set in the sample dist-config.properties file discussed in the next section.
49 |
50 | Finally, start the cluster:
51 |
52 | ```
53 | sbin/start-all.sh
54 | ```
55 |
56 | We have added a script for setting up Spark and Hadoop on Google Compute Engine with the optimal settings for this framework. You can find it in the **gce** directory.
57 |
58 | Please refer to the [Spark official docs](http://spark.apache.org/docs/0.9.1/spark-standalone.html) for details on how to deploy Spark in standalone mode.
59 |
60 | ## How to Build
61 |
62 | Clone the latest version of the repo and switch to the stage branch:
63 |
64 | $ git clone https://github.com/dbpedia/distributed-extraction-framework.git
65 | $ cd distributed-extraction-framework
66 | $ mvn clean install -Dmaven.test.skip=true # Compiles the code without running tests
67 |
68 | ## Dump-based Distributed Extraction
69 |
70 | Follow the instructions given below to download data for the extractions you need to perform. An example download.properties file is given at `download/src/test/resources/download.properties`.
71 |
72 | In the root directory, run the following commands:
73 |
74 | $ mvn clean install -Dmaven.test.skip=true # Compiles the code without running tests
75 | $ ./run download config=download.properties # Downloads the Wikipedia dumps
76 |
77 | **Points to keep in mind:**
78 |
79 | 1. Before performing extractions, you will need a config.properties file for the general extraction configuration and a dist-config.properties file for the configuration specific to the distributed framework (Spark, Hadoop, logging etc.). Examples are given at `extraction/src/test/resources/`.
80 |
81 | 2. The example `extraction/src/test/resources/dist-config.properties` file needs to be modified with a proper spark-home and spark-master (`local[N]` means N cores on the local node - you can change it to something like `spark://hostname:7077` to run in distributed mode).
82 |
83 | 3. Prefer pages-articles-multistream.bz2 files to pages-articles.bz2 files because they are more efficient for parallel extraction: the former can be decompressed in parallel using Hadoop's splittable Bzip2Codec. Of course, this does not matter when using the pages-articlesX.xml-pXXXXXXXXXXpXXXXXXXXXX.bz2 files (which will be the files of choice for distributed downloads).
84 |
85 | 4. **Important:** Finally, when running on a distributed cluster, it is essential that you set `spark.cores.max` (in dist-config.properties) to **N** \* **M**, where N = total no. of slaves and M = `SPARK_WORKER_INSTANCES`. This ensures that Spark uses as many cores (across the entire cluster) as there are workers. For example, with 4 slaves and `SPARK_WORKER_INSTANCES=4`, set `spark.cores.max` to 16.
86 |
87 | Now perform parallel extractions on your Spark cluster:
88 |
89 | $ ./run extraction extraction/src/test/resources/config.properties extraction/src/test/resources/dist-config.properties
90 |
91 |
92 | ### Testing
93 | Please see the [wiki page for Testing](https://github.com/dbpedia/distributed-extraction-framework/wiki/Testing) for detailed instructions on how to verify the outputs of the distributed extraction framework by comparing them with those of the original framework.
94 |
95 | ## Distributed Downloads
96 |
97 | This is still a work in progress and there are some issues that need to be solved.
98 |
99 | Have a look at `download/src/test/resources/dist-download.properties` and `download/src/test/resources/download.properties`. You can create your own config files using them. Just make sure that they are present at the same path on all nodes of the cluster.
100 | 101 | After cloning and building the framework on the master node, for each slave node, do this: 102 | ``` 103 | rsync -avhz --progress ~/.m2 $SLAVE:~/ 104 | rsync -avhz --progress /path/to/distributed-extraction-framework $SLAVE:/path/to/ 105 | ../run download distconfig=/path/to/distributed-extraction-framework/download/src/test/resources/dist-download.properties config=/path/to/distributed-extraction-framework/download/src/test/resources/download.properties 106 | ``` 107 | 108 | You can find the worker logs at `/path/to/distributed-extraction-framework/logs` of each node. 109 | 110 | -------------------------------------------------------------------------------- /clean-install-run: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Shortcut for 4 | # mvn -f ../pom.xml clean install && mvn scala:run -Dlauncher=... -DaddArgs=... 5 | # Must be called with one of the modules (core/, dump/, ...) as current directory. 6 | # Example: 7 | # extraction_framework/core> ../clean-install-run LAUNCHER ARG1 ARG2 ARG3 8 | # is equivalent to 9 | # extraction_framework/core> mvn -f ../pom.xml clean install && mvn scala:run "-Dlauncher=LAUNCHER" "-DaddArgs=ARG1|ARG2|ARG3" 10 | 11 | # if we're not on a terminal, use batch mode to avoid ugly log files 12 | [ ! -t 1 ] && BATCH="-B" 13 | mvn $BATCH -f ../pom.xml clean && . ../install-run "$@" 14 | -------------------------------------------------------------------------------- /common/pom.xml: -------------------------------------------------------------------------------- 1 | 2 | 5 | 4.0.0 6 | 7 | 8 | org.dbpedia 9 | distributed-extraction 10 | 4.1-SNAPSHOT 11 | 12 | 13 | org.dbpedia.distributed-extraction 14 | common 15 | 4.1-SNAPSHOT 16 | DBpedia Distributed Extraction Framework Commons 17 | 18 | 19 | 20 | 21 | net.alchim31.maven 22 | scala-maven-plugin 23 | 24 | 25 | 26 | 27 | 28 | 29 | org.dbpedia.extraction 30 | core 31 | 4.1 32 | 33 | 34 | 35 | org.apache.hadoop 36 | hadoop-client 37 | ${hadoop.version} 38 | 39 | 40 | 41 | org.apache.hadoop 42 | hadoop-common 43 | ${hadoop.version} 44 | 45 | 46 | 47 | 48 | -------------------------------------------------------------------------------- /common/src/main/scala/org/dbpedia/extraction/util/HadoopConfigurable.scala: -------------------------------------------------------------------------------- 1 | package org.dbpedia.extraction.util 2 | 3 | import org.apache.hadoop.fs.Path 4 | import org.dbpedia.extraction.util.RichHadoopPath.wrapPath 5 | import org.apache.hadoop.conf.Configuration 6 | 7 | /** 8 | * Trait for classes that need to create a Hadoop Configuration. 9 | */ 10 | trait HadoopConfigurable 11 | { 12 | /** Path to hadoop core-site.xml */ 13 | protected val hadoopCoreConf: String 14 | 15 | /** Path to hadoop hdfs-site.xml */ 16 | protected val hadoopHdfsConf: String 17 | 18 | /** Path to hadoop mapred-site.xml */ 19 | protected val hadoopMapredConf: String 20 | 21 | /** Hadoop Configuration. This is implicit because RichHadoopPath operations need it. */ 22 | implicit lazy val hadoopConf = 23 | { 24 | val hadoopConf = new Configuration() 25 | 26 | if (hadoopCoreConf != null) 27 | hadoopConf.addResource(new Path(hadoopCoreConf)) 28 | if (hadoopHdfsConf != null) 29 | hadoopConf.addResource(new Path(hadoopHdfsConf)) 30 | if (hadoopMapredConf != null) 31 | hadoopConf.addResource(new Path(hadoopMapredConf)) 32 | 33 | hadoopConf 34 | } 35 | 36 | /** 37 | * Checks if a Path exists. 
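*
* Usage sketch (the path below is illustrative only, not taken from this repo):
* {{{
*   // fail fast if the configured base directory is missing on the target file system
*   checkPathExists(Some(new Path("/data/wikidumps")), pathMustExist = true)
* }}}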
38 | * 39 | * @param path Option[Path] if this is None, pathMustExist has no effect. 40 | * @param pathMustExist Boolean to ensure that the Path, if obtained, actually exists. 41 | * @throws RuntimeException if Option[Path] is defined but the path does not exist 42 | * @return the Option[Path] given as input 43 | */ 44 | def checkPathExists(path: Option[Path], pathMustExist: Boolean): Option[Path] = 45 | { 46 | // If pathMustExist is set to true, and path is defined but it does not exist, throw an error. 47 | if (pathMustExist && path.isDefined && !path.get.exists) 48 | { 49 | val hadoopHint = if (hadoopCoreConf == null || hadoopHdfsConf == null || hadoopMapredConf == null) " Make sure you configured Hadoop correctly and the directory exists on the configured file system." else "" 50 | throw sys.error("Dir " + path.get.getSchemeWithFileName + " does not exist." + hadoopHint) 51 | } 52 | path 53 | } 54 | } 55 | -------------------------------------------------------------------------------- /common/src/main/scala/org/dbpedia/extraction/util/RichHadoopPath.scala: -------------------------------------------------------------------------------- 1 | package org.dbpedia.extraction.util 2 | 3 | import java.io.{IOException, FileNotFoundException,OutputStream, InputStream} 4 | import org.apache.hadoop.fs.{FileSystem, Path} 5 | import org.apache.hadoop.conf.Configuration 6 | import java.nio.file.NotDirectoryException 7 | import scala.language.implicitConversions 8 | 9 | object RichHadoopPath { 10 | 11 | implicit def wrapPath(path: Path)(implicit hadoopConf: Configuration) = new RichHadoopPath(path, hadoopConf) 12 | 13 | implicit def toPath(path: String) = new Path(path) 14 | 15 | } 16 | 17 | /** 18 | * This class lets us use org.apache.hadoop.fs.Path seamlessly wherever a FileLike is used. 19 | * Defines additional methods on Path by using an implicit Configuration. 20 | */ 21 | class RichHadoopPath(path: Path, conf: Configuration) extends FileLike[Path] { 22 | 23 | private val fs: FileSystem = path.getFileSystem(conf) 24 | 25 | override def toString: String = path.toString 26 | 27 | override def name: String = path.getName 28 | 29 | /** 30 | * @throws NotDirectoryException if the path is not a directory 31 | * @throws FileNotFoundException if the path does not exist 32 | */ 33 | override def hasFiles: Boolean = { 34 | isDirectory match { 35 | // Not a directory? 36 | case false => throw new NotDirectoryException(path.toString) 37 | // Contains files? 38 | case true => if(fs.listStatus(path).size > 0) true else false 39 | } 40 | } 41 | 42 | override def delete(recursive: Boolean = false): Unit = { 43 | if(!fs.delete(path, recursive)) 44 | throw new IOException("failed to delete path ["+path+"]") 45 | } 46 | 47 | override def resolve(name: String): Path = new Path(path, name) 48 | 49 | override def exists: Boolean = fs.exists(path) 50 | 51 | // TODO: more efficient type than List? 52 | override def names: List[String] = names("*") 53 | 54 | // TODO: more efficient type than List? 55 | def names(glob: String): List[String] = list(glob).map(_.getName) 56 | 57 | // TODO: more efficient type than List? 58 | override def list: List[Path] = list("*") 59 | 60 | // TODO: more efficient type than List? 
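// Lists the paths under this directory that match the given glob pattern (e.g. "*.xml.bz2");
// throws an IOException if no matching files are found.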
61 | def list(glob: String): List[Path] = { 62 | val list = fs.globStatus(new Path(path, glob)).map(_.getPath).toList 63 | if(list.isEmpty) throw new IOException("failed to list files in ["+path+"]") 64 | list 65 | } 66 | 67 | override def size: Long = fs.getContentSummary(path).getLength 68 | 69 | override def isFile: Boolean = fs.isFile(path) 70 | 71 | override def isDirectory: Boolean = fs.getFileStatus(path).isDirectory 72 | 73 | override def inputStream(): InputStream = fs.open(path) 74 | 75 | override def outputStream(append: Boolean = false): OutputStream = if(append) fs.append(path) else fs.create(path) 76 | 77 | def mkdirs(): Boolean = fs.mkdirs(path) 78 | 79 | def getSchemeWithFileName: String = fs.getScheme + "://" + path.toUri.getPath 80 | } 81 | -------------------------------------------------------------------------------- /download/pom.xml: -------------------------------------------------------------------------------- 1 | 2 | 5 | 4.0.0 6 | 7 | 8 | org.dbpedia 9 | distributed-extraction 10 | 4.1-SNAPSHOT 11 | 12 | 13 | org.dbpedia.distributed-extraction 14 | download 15 | 4.1-SNAPSHOT 16 | DBpedia Distributed Dump Downloader 17 | 18 | 19 | 20 | 21 | 22 | org.apache.maven.plugins 23 | maven-shade-plugin 24 | 2.2 25 | 26 | 27 | 28 | *:* 29 | 30 | META-INF/*.SF 31 | META-INF/*.DSA 32 | META-INF/*.RSA 33 | 34 | 35 | 36 | 37 | 38 | 39 | downloads-jar 40 | package 41 | 42 | shade 43 | 44 | 45 | 46 | 47 | 51 | 52 | reference.conf 53 | 54 | 56 | 57 | worker.Main 58 | 59 | 60 | 61 | 62 | 63 | 64 | 65 | 66 | 67 | net.alchim31.maven 68 | scala-maven-plugin 69 | 70 | 71 | 72 | 73 | seq-download 74 | org.dbpedia.extraction.dump.download.Download 75 | 84 | 85 | 86 | 87 | 88 | download 89 | org.dbpedia.extraction.dump.download.DistDownload 90 | 99 | 100 | 101 | 102 | 103 | 104 | 105 | 106 | 107 | 108 | 109 | org.dbpedia.extraction 110 | dump 111 | 4.1 112 | 113 | 114 | 115 | org.dbpedia.distributed-extraction 116 | common 117 | 4.1-SNAPSHOT 118 | 119 | 120 | 121 | org.apache.hadoop 122 | hadoop-client 123 | ${hadoop.version} 124 | 125 | 126 | 127 | com.typesafe.akka 128 | akka-contrib_2.10 129 | 2.3.0 130 | 131 | 132 | 133 | com.typesafe.akka 134 | akka-testkit_2.10 135 | 2.3.0 136 | 137 | 138 | 139 | com.jcraft 140 | jsch 141 | 0.1.51 142 | 143 | 144 | 145 | -------------------------------------------------------------------------------- /download/src/main/resources/application.conf: -------------------------------------------------------------------------------- 1 | akka { 2 | actor.provider = "akka.cluster.ClusterActorRefProvider" 3 | 4 | remote.netty.tcp { 5 | hostname="127.0.0.1" 6 | port=0 7 | } 8 | 9 | extensions = ["akka.contrib.pattern.ClusterReceptionistExtension"] 10 | 11 | akka.cluster.auto-down = on 12 | 13 | auto-down-unreachable-after = 10s 14 | 15 | log-dead-letters = 0 16 | 17 | log-dead-letters-during-shutdown = off 18 | } 19 | -------------------------------------------------------------------------------- /download/src/main/resources/reference.conf: -------------------------------------------------------------------------------- 1 | akka { 2 | actor.provider = "akka.cluster.ClusterActorRefProvider" 3 | 4 | remote.netty.tcp { 5 | hostname="127.0.0.1" 6 | port=0 7 | } 8 | 9 | extensions = ["akka.contrib.pattern.ClusterReceptionistExtension"] 10 | 11 | akka.cluster.auto-down = on 12 | 13 | auto-down-unreachable-after = 10s 14 | 15 | log-dead-letters = 0 16 | 17 | log-dead-letters-during-shutdown = 0 18 | } 19 | 
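# Note: the master and frontend override akka.remote.netty.tcp.hostname (and the master also sets
# akka.cluster.roles) at runtime via ConfigFactory.parseString(...).withFallback(ConfigFactory.load());
# see ClusterStartup in DistDownload.scala. The loopback hostname above is only a fallback.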
-------------------------------------------------------------------------------- /download/src/main/scala/org/dbpedia/extraction/dump/download/ActoredCounter.scala: -------------------------------------------------------------------------------- 1 | package org.dbpedia.extraction.dump.download 2 | 3 | import akka.actor.ActorRef 4 | import java.io.InputStream 5 | import java.net.URLConnection 6 | import org.dbpedia.extraction.util.CountingInputStream 7 | import org.dbpedia.extraction.dump.download.actors.message.DownloaderProgressMessage 8 | import DownloaderProgressMessage.{Start, Read} 9 | import Counter.getContentLength 10 | 11 | /** 12 | * A Downloader mixin to be used with DownloadProgressTracker. Sends Start/Read messages to 13 | * the DownloadProgressTracker actor reference. 14 | * 15 | * @see org.dbpedia.extraction.dump.download.actors.DownloadProgressTracker 16 | */ 17 | trait ActoredCounter extends Downloader 18 | { 19 | /** 20 | * Reference to a DownloadProgressTracker actor. 21 | */ 22 | val progressActor: ActorRef 23 | 24 | protected abstract override def inputStream(conn: URLConnection): InputStream = { 25 | def logger(bytesRead: Long, close: Boolean): Unit = progressActor ! Read(bytesRead) 26 | progressActor ! Start(getContentLength(conn)) // Signal start of download with the total file size in bytes 27 | new CountingInputStream(super.inputStream(conn), logger) 28 | } 29 | 30 | } 31 | -------------------------------------------------------------------------------- /download/src/main/scala/org/dbpedia/extraction/dump/download/DistDownload.scala: -------------------------------------------------------------------------------- 1 | package org.dbpedia.extraction.dump.download 2 | 3 | import akka.actor._ 4 | import akka.cluster.Cluster 5 | import akka.contrib.pattern.{ClusterSingletonManager, ClusterClient} 6 | import com.typesafe.config.ConfigFactory 7 | import scala.concurrent.duration._ 8 | import scala.language.postfixOps 9 | import org.dbpedia.extraction.dump.download.actors._ 10 | import akka.actor.RootActorPath 11 | import scala.Some 12 | import java.util.logging.Logger 13 | import org.dbpedia.extraction.util.RemoteExecute 14 | import org.dbpedia.extraction.dump.download.actors.DownloadClient.Finished 15 | 16 | /** 17 | * Distributed Wikipedia dump downloader. 18 | * 19 | * While running this on a cluster, make sure that all configuration variables (including the paths to configuration files) 20 | * are valid in all nodes of the cluster, ie. the configuration files need to be present on the worker nodes too. 
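*
* Typical invocation from the repository root (per the README; the paths point to the sample
* configs shipped with this module):
* {{{
*   ./run download distconfig=download/src/test/resources/dist-download.properties \
*       config=download/src/test/resources/download.properties
* }}}
* The driver starts the Master singleton and a DownloadClient frontend, then launches worker
* JVMs on each slave over SSH (see ClusterStartup below); the workers join the Akka cluster
* and pull download jobs from the master.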
21 | */ 22 | object DistDownload extends RemoteExecute 23 | { 24 | val logger = Logger.getLogger(classOf[DistDownload].getName) 25 | 26 | def main(args: Array[String]): Unit = 27 | { 28 | val config = new DistDownloadConfig(args) 29 | if (config.isMaster) 30 | { 31 | val cluster = new ClusterStartup(config) 32 | 33 | // Start master on the driver node 34 | val joinAddress = cluster.startMaster(None, "driver") 35 | Thread.sleep(5000) // wait a few sec for master to start up 36 | 37 | (config.privateKey, config.sshPassphrase) match 38 | { 39 | case (Some(identity), Some(passphrase)) => // both private key and passphrase are provided 40 | addIdentity(identity, passphrase) 41 | case (Some(identity), None) => // passphrase is empty 42 | addIdentity(identity) 43 | case _ => // no private key provided 44 | } 45 | 46 | for (host <- config.slaves) 47 | { 48 | val session = createSession(config.userName, host) 49 | for (worker <- 1 to config.workersPerSlave) 50 | { 51 | val command = """cd %s/download;mkdir -p ../logs;nohup ../run download join=%s %s > ../logs/%s-%d.out &""". 52 | format(config.homeDir, joinAddress, args.mkString(" "), host, worker) 53 | println(command) 54 | println(execute(session, command)) 55 | } 56 | session.disconnect() 57 | } 58 | 59 | // Start download client and result/progress consumer 60 | val client = cluster.startFrontend(joinAddress) 61 | val dumpFiles = new DumpFileSource(config.languages, 62 | config.baseUrl, 63 | config.baseDir, 64 | config.wikiName, 65 | config.ranges, 66 | config.dateRange, 67 | config.dumpCount) 68 | for(dumpFile <- dumpFiles) 69 | client ! dumpFile 70 | 71 | client ! Finished 72 | } 73 | else 74 | { 75 | val cluster = new ClusterStartup(config) 76 | cluster.startWorker(config.joinAddress.get) 77 | } 78 | } 79 | } 80 | 81 | class DistDownload 82 | 83 | class ClusterStartup(config: DistDownloadConfig) 84 | { 85 | def systemName = "Workers" 86 | 87 | private def progressReportTimeout = config.progressReportInterval + 2.seconds 88 | 89 | def startMaster(joinAddressOption: Option[Address], role: String): Address = 90 | { 91 | val conf = ConfigFactory.parseString( s"""akka.cluster.roles=[$role]\nakka.remote.netty.tcp.hostname="${config.master}""""). 92 | withFallback(ConfigFactory.load()) 93 | val system = ActorSystem(systemName, conf) 94 | val joinAddress = joinAddressOption.getOrElse(Cluster(system).selfAddress) 95 | Cluster(system).join(joinAddress) 96 | system.actorOf( 97 | ClusterSingletonManager.props(Master.props( 98 | progressReportTimeout, 99 | config.mirrors, 100 | config.threadsPerMirror 101 | ), 102 | "active", PoisonPill, Some(role) 103 | ), 104 | "master") 105 | joinAddress 106 | } 107 | 108 | def startFrontend(joinAddress: akka.actor.Address): ActorRef = 109 | { 110 | val conf = ConfigFactory.parseString( s"""akka.remote.netty.tcp.hostname="${config.master}""""). 
111 | withFallback(ConfigFactory.load()) 112 | val system = ActorSystem(systemName, conf) 113 | Cluster(system).join(joinAddress) 114 | 115 | val client = system.actorOf(Props[DownloadClient], "client") 116 | system.actorOf(Props[DownloadResultConsumer], "consumer") 117 | client 118 | } 119 | 120 | def startWorker(contactAddress: akka.actor.Address) = 121 | { 122 | val conf = ConfigFactory.load() 123 | val system = ActorSystem(systemName, conf) 124 | val initialContacts = Set(system.actorSelection(RootActorPath(contactAddress) / "user" / "receptionist")) 125 | val clusterClient = system.actorOf(ClusterClient.props(initialContacts), "clusterClient") 126 | system.actorOf( 127 | Worker.props(clusterClient, 128 | DownloadJobRunner.props(config.progressReportInterval, 129 | config.hadoopConf, 130 | config.localTempDir, 131 | config.unzip 132 | ), 133 | config.maxDuplicateProgress 134 | ), 135 | "worker" 136 | ) 137 | } 138 | } -------------------------------------------------------------------------------- /download/src/main/scala/org/dbpedia/extraction/dump/download/DumpFileSource.scala: -------------------------------------------------------------------------------- 1 | package org.dbpedia.extraction.dump.download 2 | 3 | import org.dbpedia.extraction.util.{Language, WikiInfo} 4 | import scala.io.{Source, Codec} 5 | import java.net.URL 6 | import scala.collection.mutable 7 | import scala.collection.immutable.SortedSet 8 | import scala.collection.mutable.{ListBuffer, Set} 9 | import org.apache.hadoop.fs.Path 10 | import org.dbpedia.extraction.dump.download.actors.message.DumpFile 11 | 12 | /** 13 | * Generate DumpFile objects each representing a specific wiki file to download. 14 | * Most of the code was taken from LanguageDownloader (extraction-framework). 15 | * 16 | * TODO: Integrate this to LanguageDownloader and reuse it here (reduce code duplication)? 17 | */ 18 | class DumpFileSource(languages: mutable.HashMap[Language, mutable.Set[(String, Boolean)]], 19 | baseUrl: URL, 20 | baseDir: Path, 21 | wikiSuffix: String, 22 | ranges: mutable.HashMap[(Int, Int), mutable.Set[(String, Boolean)]], 23 | dateRange: (String, String), 24 | dumpCount: Int) 25 | extends Traversable[DumpFile] with Iterable[DumpFile] 26 | { 27 | private val DateLink = """""".r 28 | private val list = new ListBuffer[DumpFile]() 29 | 30 | override def iterator: Iterator[DumpFile] = list.iterator 31 | 32 | override def foreach[U](func: DumpFile => U) 33 | { 34 | if(list.isEmpty) 35 | { 36 | // resolve page count ranges to languages 37 | if (ranges.nonEmpty) 38 | { 39 | val wikis = WikiInfo.fromURL(WikiInfo.URL, Codec.UTF8) 40 | 41 | // for all wikis in one of the desired ranges... 
42 | for (((from, to), files) <- ranges; wiki <- wikis; if from <= wiki.pages && wiki.pages <= to) 43 | { 44 | // ...add files for this range to files for this language 45 | languages.getOrElseUpdate(wiki.language, new mutable.HashSet[(String, Boolean)]) ++= files 46 | } 47 | } 48 | 49 | // sort them to have reproducible behavior 50 | val languageKeys = SortedSet.empty[Language] ++ languages.keys 51 | languageKeys.foreach 52 | { 53 | lang => 54 | val done = languageKeys.until(lang) 55 | val todo = languageKeys.from(lang) 56 | println("done: " + done.size + " - " + done.map(_.wikiCode).mkString(",")) 57 | println("todo: " + todo.size + " - " + languageKeys.from(lang).map(_.wikiCode).mkString(",")) 58 | for(dumpFile <- LanguageDumpFileSource(lang)) 59 | list += dumpFile 60 | } 61 | } 62 | list foreach func 63 | } 64 | 65 | private class LanguageDumpFileSource(language: Language) extends Traversable[DumpFile] 66 | { 67 | val wiki = language.filePrefix + wikiSuffix 68 | val mainPage = new URL(baseUrl, wiki + "/") 69 | val fileNames = languages(language) 70 | 71 | override def foreach[U](func: DumpFile => U) 72 | { 73 | forDates(dateRange, dumpCount, func) 74 | } 75 | 76 | def forDates[U](dateRange: (String, String), dumpCount: Int, func: DumpFile => U) 77 | { 78 | val (firstDate, lastDate) = dateRange 79 | 80 | var dates = SortedSet.empty(Ordering[String].reverse) 81 | for (line <- Source.fromURL(mainPage).getLines()) 82 | DateLink.findAllIn(line).matchData.foreach(dates += _.group(1)) 83 | 84 | if (dates.size == 0) throw new Exception("found no date - " + mainPage + " is probably broken or unreachable. check your network / proxy settings.") 85 | 86 | var count = 0 87 | 88 | // find date pages that have all files we want 89 | for (date <- dates) 90 | { 91 | if (count < dumpCount && date >= firstDate && date <= lastDate && forDate(date, func)) count += 1 92 | } 93 | 94 | if (count == 0) throw new Exception("found no date on " + mainPage + " in range " + firstDate + "-" + lastDate + " with files " + fileNames.mkString(",")) 95 | } 96 | 97 | def forDate[U](date: String, func: DumpFile => U): Boolean = 98 | { 99 | val datePage = new URL(mainPage, date + "/") // here we could use index.html 100 | val datePageLines = Source.fromURL(datePage).getLines().toTraversable 101 | 102 | // Collect regexes 103 | val regexes = fileNames.filter(_._2).map(_._1) 104 | val fileNamesFromRegexes = expandFilenameRegex(date, datePageLines, regexes) 105 | val staticFileNames = fileNames.filter(!_._2).map(_._1) 106 | 107 | val allFileNames = fileNamesFromRegexes ++ staticFileNames 108 | // val urls = allFileNames.map(fileName => new URL(baseURL, wiki + "/" + date + "/" + wiki + "-" + date + "-" + fileName)) 109 | val dumpFiles = allFileNames.map(fileName => DumpFile(baseDir.toUri.getPath, wikiSuffix, language.wikiCode, date, fileName)) 110 | 111 | 112 | // all the links we need - only for non regexes (we have already checked regex ones) 113 | val links = new mutable.HashMap[String, String]() 114 | for (fileName <- staticFileNames) links(fileName) = "" 115 | // Here we should set "" 116 | // but "\"/"+wiki+"/"+date+"/" does not exists in incremental updates, keeping the trailing "\">" should do the trick 117 | // for (fileName <- fileNames) links(fileName) = wiki+"-"+date+"-"+fileName+"\">" 118 | 119 | for (line <- datePageLines) 120 | links.foreach 121 | { 122 | case (fileName, link) => if (line contains link) links -= fileName 123 | } 124 | 125 | // did we find them all? 
126 | // Fail if: 127 | // - the user specified static file names and not all of them have been found 128 | // OR 129 | // - the user specified regular expressions and no file has been found that satisfied them 130 | if ((staticFileNames.nonEmpty && links.nonEmpty) || (regexes.nonEmpty && fileNamesFromRegexes.isEmpty)) 131 | { 132 | // TODO: Fix message 133 | val staticFilesMessage = if (links.nonEmpty) " has no links to [" + links.keys.mkString(",") + "]" else "" 134 | val dynamicFilesMessage = if (fileNamesFromRegexes.isEmpty && regexes.nonEmpty) " has no links that satisfies [" + regexes.mkString(",") + "]" else "" 135 | println("date page '" + datePage + staticFilesMessage + dynamicFilesMessage) 136 | false 137 | } 138 | else 139 | { 140 | println("date page '" + datePage + "' has all files [" + allFileNames.mkString(",") + "]") 141 | // run closure over all DumpFiles 142 | for (dumpFile <- dumpFiles) func(dumpFile) 143 | true 144 | } 145 | } 146 | 147 | private def expandFilenameRegex(date: String, index: Traversable[String], filenameRegexes: mutable.Set[String]): mutable.Set[String] = 148 | { 149 | // Prepare regexes 150 | val regexes = filenameRegexes.map(regex => ("").r) 151 | 152 | // Result 153 | val filenames = Set[String]() 154 | 155 | for (line <- index) 156 | regexes.foreach(regex => regex.findAllIn(line).matchData.foreach(filenames += _.group(1))) 157 | 158 | filenames 159 | } 160 | } 161 | 162 | private object LanguageDumpFileSource 163 | { 164 | def apply(language: Language) = new LanguageDumpFileSource(language) 165 | } 166 | 167 | } 168 | -------------------------------------------------------------------------------- /download/src/main/scala/org/dbpedia/extraction/dump/download/actors/DownloadClient.scala: -------------------------------------------------------------------------------- 1 | package org.dbpedia.extraction.dump.download.actors 2 | 3 | import akka.actor.{ActorLogging, Actor} 4 | import java.util.UUID 5 | import scala.concurrent.duration._ 6 | import akka.pattern._ 7 | import akka.contrib.pattern.{DistributedPubSubMediator, DistributedPubSubExtension} 8 | import akka.contrib.pattern.DistributedPubSubMediator.Send 9 | import org.dbpedia.extraction.dump.download.actors.message.GeneralMessage.{MasterQueueEmpty, ShutdownCluster} 10 | import akka.util.Timeout 11 | import org.dbpedia.extraction.dump.download.actors.message.{DumpFile, DownloadJob} 12 | 13 | /** 14 | * A client actor used to submit download jobs to the master. To submit a job, a DumpFile object is sent as message. 15 | */ 16 | class DownloadClient extends Actor with ActorLogging 17 | { 18 | 19 | import DownloadClient._ 20 | import context.dispatcher 21 | 22 | def scheduler = context.system.scheduler 23 | 24 | def nextDownloadId(): String = UUID.randomUUID().toString 25 | 26 | val mediator = DistributedPubSubExtension(context.system).mediator 27 | mediator ! DistributedPubSubMediator.Subscribe(Master.General, self) 28 | 29 | implicit val timeout = Timeout(10.seconds) 30 | 31 | var canShutDownCluster = false 32 | 33 | def receive = 34 | { 35 | case Finished => 36 | // send this when no more DumpFiles are to be added - ready for shutdown 37 | canShutDownCluster = true 38 | 39 | case MasterQueueEmpty => 40 | if (canShutDownCluster) self ! ShutdownCluster 41 | 42 | case ShutdownCluster => 43 | mediator ! Send("/user/master/active", ShutdownCluster, localAffinity = false) 44 | context.stop(self) 45 | context.system.shutdown() 46 | context.become(shuttingDown) 47 | 48 | case file: DumpFile => 49 | self ! 
DownloadJob(nextDownloadId(), file) 50 | 51 | case job: DownloadJob => 52 | (mediator ? Send("/user/master/active", job, localAffinity = false)) map 53 | { 54 | case Master.Ack(_) => 55 | log.info("Job accepted by master: {}", job) 56 | } recover 57 | { 58 | case _ => 59 | log.info("Job not accepted, retry after a while") 60 | scheduler.scheduleOnce(3.seconds, self, job) 61 | } 62 | } 63 | 64 | def shuttingDown: Receive = 65 | { 66 | case _ => // ignore all messages, shutting down cluster. 67 | } 68 | } 69 | 70 | object DownloadClient 71 | { 72 | case object Finished 73 | } 74 | -------------------------------------------------------------------------------- /download/src/main/scala/org/dbpedia/extraction/dump/download/actors/DownloadJobRunner.scala: -------------------------------------------------------------------------------- 1 | package org.dbpedia.extraction.dump.download.actors 2 | 3 | import akka.actor.{ActorLogging, Props, Actor} 4 | import akka.pattern.ask 5 | import akka.util.Timeout 6 | import org.dbpedia.extraction.dump.download.{Unzip, ActoredCounter, FileDownloader} 7 | import org.dbpedia.extraction.util.{Language, Finder} 8 | import java.net.URL 9 | import org.dbpedia.extraction.util.RichHadoopPath.wrapPath 10 | import org.apache.hadoop.fs.Path 11 | import org.apache.hadoop.conf.Configuration 12 | import java.io.File 13 | import scala.concurrent.Future 14 | import scala.concurrent.duration._ 15 | import scala.language.postfixOps 16 | import org.dbpedia.extraction.dump.download.actors.Worker.DownloadComplete 17 | import org.dbpedia.extraction.dump.download.actors.message.{DumpFile, DownloadJob, MirroredDownloadJob, DownloaderProgressMessage} 18 | import DownloaderProgressMessage.{ProgressEnd, Stop} 19 | import scala.util.{Failure, Success} 20 | 21 | /** 22 | * This actor is used by Worker to run a download job. 
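* The file is first downloaded into a local temporary directory and then moved onto the
* configured Hadoop file system; progress is relayed to the parent Worker through a child
* DownloadProgressTracker.
*
* Instantiated by the worker roughly as follows (cf. ClusterStartup.startWorker):
* {{{
*   DownloadJobRunner.props(progressInterval, hadoopConfiguration, tempDir, unzip)
* }}}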
23 | * 24 | * @param progressInterval Download progress report interval 25 | * @param hadoopConfiguration Hadoop Configuration 26 | * @param tempDir temporary directory on local file system to download to (before being moved to HDFS) 27 | * @param unzip true if file should be unzipped while downloading, false otherwise 28 | */ 29 | class DownloadJobRunner(progressInterval: FiniteDuration, hadoopConfiguration: Configuration, tempDir: File, unzip: Boolean) extends Actor with ActorLogging 30 | { 31 | implicit private val _hadoopConfiguration = hadoopConfiguration 32 | implicit private val progressStopTimeout = Timeout(5 seconds) 33 | 34 | val progress = context.watch(context.actorOf(Props(classOf[DownloadProgressTracker], context.parent, progressInterval), "progress")) 35 | 36 | class Downloader extends FileDownloader with ActoredCounter 37 | { 38 | override val progressActor = progress 39 | } 40 | 41 | val downloader = 42 | if (unzip) new Downloader with Unzip 43 | else new Downloader 44 | 45 | def receive = 46 | { 47 | case job@MirroredDownloadJob(mirror, DownloadJob(_, DumpFile(base, wikiName, lang, date, fileName))) => 48 | log.debug("Received download job from Worker: {}", job) 49 | val s = sender() 50 | import context.dispatcher 51 | 52 | val baseDir = new Path(base) 53 | val finder = new Finder[Path](baseDir, Language(lang), wikiName) 54 | val wiki = finder.wikiName 55 | val dateDir = baseDir.resolve(wiki).resolve(date) 56 | if (!dateDir.exists && !dateDir.mkdirs) throw new Exception("Target directory [" + dateDir.getSchemeWithFileName + "] does not exist and cannot be created") 57 | if (!tempDir.exists && !tempDir.mkdirs) throw new Exception("Local temporary directory [" + tempDir + "] does not exist and cannot be created") 58 | 59 | val url = new URL(mirror, s"$wiki/$date/$wiki-$date-$fileName") 60 | val targetFile = new File(tempDir, downloader.targetName(url)) 61 | if(targetFile.exists) targetFile.delete() // delete file in temp dir if it already exists 62 | 63 | Future(downloader.downloadTo(url, tempDir)). 64 | onComplete 65 | { 66 | case Success(file) => 67 | // file was downloaded to tempDir; copy it to Hadoop FS. 68 | val fs = dateDir.getFileSystem(hadoopConfiguration) 69 | val outputPath = dateDir.resolve(file.getName) 70 | fs.moveFromLocalFile(new Path(file.toURI), outputPath) 71 | progress ? Stop onSuccess 72 | { 73 | case ProgressEnd(totalBytes) => 74 | s ! DownloadComplete(outputPath.getSchemeWithFileName, totalBytes) // Tell worker that download is finished 75 | } 76 | case Failure(t) => 77 | log.info(t.getMessage) 78 | progress ! 
Stop 79 | } 80 | } 81 | } 82 | 83 | object DownloadJobRunner 84 | { 85 | def props(progressInterval: FiniteDuration, hadoopConfiguration: Configuration, tempDir: File, unzip: Boolean = false): Props = 86 | Props(classOf[DownloadJobRunner], progressInterval, hadoopConfiguration, tempDir, unzip) 87 | } 88 | -------------------------------------------------------------------------------- /download/src/main/scala/org/dbpedia/extraction/dump/download/actors/DownloadProgressTracker.scala: -------------------------------------------------------------------------------- 1 | package org.dbpedia.extraction.dump.download.actors 2 | 3 | import akka.actor._ 4 | import java.util.concurrent.atomic.AtomicLong 5 | import scala.concurrent.duration.FiniteDuration 6 | import org.dbpedia.extraction.dump.download.actors.message.{DownloaderProgressMessage, WorkerProgressMessage} 7 | import scala.Some 8 | 9 | /** 10 | * An actor that receives Start and Read messages, and relays ProgressStart and Progress messages to the client. 11 | * This is used to keep track of download progress - the number of bytes being read in real time. 12 | * 13 | * @param client The actor to send progress messages to 14 | * @param notifyInterval The time interval at which progress reports will be sent to client 15 | */ 16 | class DownloadProgressTracker(client: ActorRef, notifyInterval: FiniteDuration) extends Actor with ActorLogging 17 | { 18 | import WorkerProgressMessage._ 19 | import DownloaderProgressMessage._ 20 | import DownloadProgressTracker._ 21 | import context.dispatcher 22 | 23 | def scheduler = context.system.scheduler 24 | 25 | private val bytesRead = new AtomicLong() 26 | 27 | /** This task is used to send Progress messages to client at each interval */ 28 | private var progressTaskOption: Option[Cancellable] = None 29 | 30 | override def postStop() = progressTaskOption.foreach(_.cancel()) 31 | 32 | def receive = 33 | { 34 | case Start(total) => // Sent by ActoredCounter to signal start of download 35 | if (0 != bytesRead.get() || progressTaskOption.isDefined) 36 | { 37 | log.info("ProgressTracker is already started!") 38 | } 39 | else 40 | { 41 | progressTaskOption = Some(scheduler.schedule(notifyInterval, notifyInterval, self, Tick)) 42 | client ! ProgressStart(total) 43 | } 44 | 45 | case Read(bytes) => // Sent by ActoredCounter to signal bytes read 46 | bytesRead.set(bytes) 47 | 48 | case Stop => 49 | (progressTaskOption, bytesRead.get) match 50 | { 51 | case (Some(progressTask), b) if b != 0 => 52 | sender ! ProgressEnd(bytesRead.get()) 53 | bytesRead.set(0) 54 | 55 | progressTask.cancel() 56 | progressTaskOption = None 57 | 58 | case _ => 59 | log.info("ProgressTracker is already stopped!") 60 | } 61 | 62 | case Tick => 63 | client ! 
Progress(bytesRead.get()) 64 | } 65 | } 66 | 67 | object DownloadProgressTracker 68 | { 69 | case object Tick 70 | } -------------------------------------------------------------------------------- /download/src/main/scala/org/dbpedia/extraction/dump/download/actors/DownloadResultConsumer.scala: -------------------------------------------------------------------------------- 1 | package org.dbpedia.extraction.dump.download.actors 2 | 3 | import akka.actor.{ActorLogging, Actor} 4 | import org.dbpedia.extraction.dump.download.actors.message.GeneralMessage.ShutdownCluster 5 | import akka.contrib.pattern.{DistributedPubSubExtension, DistributedPubSubMediator} 6 | import org.dbpedia.extraction.dump.download.actors.message.WorkerProgressMessage.{Progress, ProgressStart} 7 | import org.dbpedia.extraction.dump.download.actors.message.{DownloadProgress, DownloadResult, DownloadJob, MirroredDownloadJob} 8 | 9 | /** 10 | * This actor is used to print download progress logging messages on the driver/master node. 11 | * Hooks into Master.ResultsTopic and consumes DownloadResult messages. 12 | * 13 | * TODO: Refactor the code to pretty-print better progress results like ByteLogger. Maintain list of jobs 14 | * and log percentage of work done etc. 15 | */ 16 | class DownloadResultConsumer extends Actor with ActorLogging 17 | { 18 | var jobs = Map[String, MirroredDownloadJob]() 19 | val mediator = DistributedPubSubExtension(context.system).mediator 20 | mediator ! DistributedPubSubMediator.Subscribe(Master.General, self) 21 | mediator ! DistributedPubSubMediator.Subscribe(Master.ProgressTopic, self) 22 | mediator ! DistributedPubSubMediator.Subscribe(Master.ResultsTopic, self) 23 | 24 | def receive = 25 | { 26 | case _: DistributedPubSubMediator.SubscribeAck => 27 | 28 | case job @ MirroredDownloadJob(_, DownloadJob(downloadId, _)) => 29 | log.info("Starting job: {}", job) 30 | jobs += (downloadId -> job) 31 | 32 | case DownloadResult(downloadId, outputPath, bytes) => 33 | log.info("{}: {} bytes downloaded to {}", downloadId, bytes, outputPath) 34 | 35 | case DownloadProgress(downloadId, p @ ProgressStart(bytes)) => 36 | log.info("{}: {}", jobs(downloadId), p) 37 | 38 | case DownloadProgress(downloadId, p @ Progress(bytes)) => 39 | log.info("{}: {}", jobs(downloadId), p) 40 | 41 | case ShutdownCluster => 42 | context.stop(self) 43 | context.system.shutdown() 44 | } 45 | } 46 | -------------------------------------------------------------------------------- /download/src/main/scala/org/dbpedia/extraction/dump/download/actors/Master.scala: -------------------------------------------------------------------------------- 1 | package org.dbpedia.extraction.dump.download.actors 2 | 3 | import scala.concurrent.duration.{Deadline, FiniteDuration} 4 | import akka.actor._ 5 | import akka.contrib.pattern.{DistributedPubSubMediator, DistributedPubSubExtension} 6 | import scala.collection.immutable.Queue 7 | import org.dbpedia.extraction.dump.download.actors.message.GeneralMessage.{MasterQueueEmpty, ShutdownCluster} 8 | import org.dbpedia.extraction.dump.download.actors.message._ 9 | import java.net.URL 10 | import scala.Some 11 | import akka.contrib.pattern.DistributedPubSubMediator.Put 12 | 13 | /** 14 | * Master/driver node actor. This is responsible for accepting download jobs from a client and dividing jobs 15 | * among the different Workers, keeping track of download jobs, handling failed jobs, shutting down the cluster etc. 
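*
* Protocol sketch (see MasterWorkerMessage): workers register with RegisterWorker, request work
* with WorkerRequestsDownload, send ProgressReport messages while downloading, and finish with
* DownloadIsDone (acknowledged by Ack) or DownloadFailed. Progress and results are republished
* on the "progress" and "results" topics, which DownloadResultConsumer subscribes to.
* {{{
*   // created on the driver node wrapped in a ClusterSingletonManager, cf. ClusterStartup.startMaster
*   ClusterSingletonManager.props(Master.props(workTimeout, mirrors, threadsPerMirror),
*     "active", PoisonPill, Some("driver"))
* }}}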
16 | * 17 | * @param workTimeout Workers need to send download progress reports within this timeout 18 | * @param mirrors List of wikipedia mirror URLs 19 | * @param threadsPerMirror Number of simultaneous downloads per mirror 20 | */ 21 | class Master(workTimeout: FiniteDuration, mirrors: Seq[URL], threadsPerMirror: Int) extends Actor with ActorLogging 22 | { 23 | 24 | import Master._ 25 | import MasterWorkerMessage._ 26 | import context.dispatcher 27 | 28 | def scheduler = context.system.scheduler 29 | 30 | // The DownloadClient and DownloadResultConsumer communicate with the Master through the DistributedPubSubMediator 31 | val mediator = DistributedPubSubExtension(context.system).mediator 32 | 33 | mediator ! Put(self) 34 | 35 | private var workers = Map[String, WorkerState]() 36 | private var pendingDownloads = Queue[DownloadJob]() 37 | private var downloadIds = Set[String]() 38 | 39 | // Keep track of the number of simultaneous downloads per mirror. 40 | private var mirrorsInUse = (mirrors zip Seq.fill(mirrors.size)(0)).toMap // Mapping mirror URL to number of simultaneous downloads 41 | 42 | val cleanupTask = scheduler.schedule(workTimeout / 2, workTimeout / 2, 43 | self, CleanupTick) 44 | 45 | override def postStop(): Unit = cleanupTask.cancel() 46 | 47 | def receive = 48 | { 49 | case ShutdownCluster => 50 | if (pendingDownloads.isEmpty) // all downloads have finished? 51 | { 52 | if (workers.isEmpty) // all workers have been unregistered? 53 | { 54 | log.info("Stopping master!") 55 | mediator ! DistributedPubSubMediator.Publish(General, ShutdownCluster) 56 | self ! PoisonPill 57 | context.stop(self) 58 | context.system.shutdown() 59 | } 60 | else 61 | { 62 | workers.foreach // still have registered workers? 63 | { 64 | case (workerId, WorkerState(ref, Idle)) => // send shutdown signal to idle workers and remove them. 65 | ref ! ShutdownCluster 66 | workers -= workerId 67 | case _ => // come back to the busy worker after a period of workTimeout 68 | } 69 | log.debug("Some workers still busy! Cannot stop master yet!") 70 | context.system.scheduler.scheduleOnce(workTimeout, self, ShutdownCluster) 71 | } 72 | } 73 | else 74 | { 75 | log.debug("Some work pending! Cannot stop master yet!") 76 | context.system.scheduler.scheduleOnce(workTimeout, self, ShutdownCluster) 77 | } 78 | 79 | case RemoveWorker(workerId) => 80 | workers -= workerId 81 | 82 | case p @ ProgressReport(workerId, downloadId, progress) => // Workers send download progress reports at specific intervals 83 | log.debug("Heard from worker {}: {} ", workerId, progress) 84 | mediator ! DistributedPubSubMediator.Publish(ProgressTopic, DownloadProgress(downloadId, progress)) 85 | workers.get(workerId) match 86 | { 87 | case Some(s@WorkerState(_, Busy(downloadJob, deadline))) => 88 | workers += (workerId -> WorkerState(sender, status = Busy(downloadJob, Deadline.now + workTimeout))) // Renew current job deadline 89 | case _ => 90 | } 91 | 92 | case RegisterWorker(workerId) => // Workers register themselves to the master at specific intervals 93 | if (workers.contains(workerId)) 94 | { 95 | workers += (workerId -> workers(workerId).copy(ref = sender)) 96 | } 97 | else 98 | { 99 | log.info("Worker registered: {}", workerId) 100 | workers += (workerId -> WorkerState(sender, status = Idle)) 101 | if (pendingDownloads.nonEmpty) 102 | sender ! 
DownloadIsReady 103 | } 104 | 105 | case WorkerRequestsDownload(workerId) => 106 | if (pendingDownloads.nonEmpty) 107 | { 108 | workers.get(workerId) match 109 | { 110 | case Some(s@WorkerState(_, Idle)) => // is the requesting Worker Idle? 111 | getFreeMirror foreach 112 | { 113 | case url => // We have a free mirror! 114 | val (downloadJob, rest) = pendingDownloads.dequeue 115 | pendingDownloads = rest 116 | val downloadWithMirror = MirroredDownloadJob(url, downloadJob) 117 | 118 | // Publish new download job so that DownloadResultConsumer can keep track of it 119 | mediator ! DistributedPubSubMediator.Publish(ProgressTopic, downloadWithMirror) 120 | 121 | sender ! downloadWithMirror // send new download job back to the Worker that sent the job request 122 | log.info("Giving worker {} a download job {}", workerId, downloadWithMirror) 123 | 124 | mirrorsInUse += (url -> (mirrorsInUse(url) + 1)) // decrement no. of threads to mirror 125 | workers += (workerId -> s.copy(status = Busy(downloadWithMirror, Deadline.now + workTimeout))) // set worker status to Busy 126 | } 127 | case _ => 128 | } 129 | } 130 | 131 | case DownloadIsDone(workerId, downloadId, outputPath, totalBytes) => 132 | workers.get(workerId) match 133 | { 134 | case Some(s@WorkerState(_, Busy(downloadJob, _))) if downloadJob.job.downloadId == downloadId => 135 | log.debug("Download is done: {} => {} bytes written to {} by worker {}", downloadJob, totalBytes, outputPath, workerId) 136 | 137 | val mirror = downloadJob.baseUrl 138 | mirrorsInUse += (mirror -> (mirrorsInUse(mirror) - 1)) // decrement no. of threads to mirror 139 | workers += (workerId -> s.copy(status = Idle)) // set worker status to Idle 140 | 141 | // publish download result for DownloadResultConsumer to read 142 | mediator ! DistributedPubSubMediator.Publish(ResultsTopic, DownloadResult(downloadJob, outputPath, totalBytes)) 143 | 144 | sender ! MasterWorkerMessage.Ack(downloadId) // Ack to worker 145 | case _ => 146 | if (downloadIds.contains(downloadId)) 147 | { 148 | // previous Ack was lost, confirm again that this is done 149 | sender ! MasterWorkerMessage.Ack(downloadId) 150 | } 151 | } 152 | 153 | case DownloadFailed(workerId, downloadId) => 154 | workers.get(workerId) match 155 | { 156 | case Some(s@WorkerState(_, Busy(downloadJob, _))) if downloadJob.job.downloadId == downloadId => 157 | log.info("Download failed: {}", downloadJob) 158 | 159 | val mirror = downloadJob.baseUrl 160 | mirrorsInUse += (mirror -> (mirrorsInUse(mirror) - 1)) 161 | workers += (workerId -> s.copy(status = Idle)) 162 | 163 | pendingDownloads = pendingDownloads enqueue downloadJob.job // put the download back into queue 164 | notifyWorkers() 165 | case _ => 166 | } 167 | 168 | case job: DownloadJob => // client sent a new DownloadJob 169 | // idempotent 170 | if (downloadIds.contains(job.downloadId)) 171 | { 172 | sender ! Master.Ack(job.downloadId) 173 | } 174 | else 175 | { 176 | log.info("Accepted download: {}", job) 177 | pendingDownloads = pendingDownloads enqueue job 178 | downloadIds += job.downloadId 179 | sender ! 
Master.Ack(job.downloadId) 180 | notifyWorkers() 181 | } 182 | 183 | case CleanupTick => // runs at fixed intervals, removes timed out jobs 184 | var hasBusy = false 185 | for ((workerId, s@WorkerState(_, Busy(downloadJob, timeout))) <- workers) 186 | { 187 | hasBusy = true 188 | if (timeout.isOverdue) 189 | { 190 | log.info("Download timed out: {}", downloadJob) 191 | workers -= workerId 192 | pendingDownloads = pendingDownloads enqueue downloadJob.job 193 | notifyWorkers() 194 | } 195 | } 196 | // publish MasterQueueEmpty if there are no pending downloads AND no workers are busy 197 | if(!hasBusy && pendingDownloads.isEmpty) mediator ! DistributedPubSubMediator.Publish(General, MasterQueueEmpty) 198 | } 199 | 200 | def getFreeMirror: Option[URL] = 201 | mirrorsInUse.find(_._2 < threadsPerMirror) match 202 | { 203 | case Some((url, _)) => Some(url) 204 | case _ => None 205 | } 206 | 207 | /** Tell idle workers that download is ready */ 208 | def notifyWorkers(): Unit = 209 | if (pendingDownloads.nonEmpty) 210 | { 211 | // TODO: Pick workers more intelligently, according to number of bytes downloaded by each worker 212 | // to encourage better spreading out of downloads over the cluster - better for distributed processing too. 213 | workers.foreach 214 | { 215 | case (_, WorkerState(ref, Idle)) => ref ! DownloadIsReady 216 | case _ => // busy 217 | } 218 | } 219 | 220 | // TODO cleanup old workers 221 | // TODO cleanup old downloadIds 222 | } 223 | 224 | object Master 225 | { 226 | val ResultsTopic = "results" 227 | val ProgressTopic = "progress" 228 | val General = "general" 229 | 230 | def props(workTimeout: FiniteDuration, mirrors: Seq[URL], threadsPerMirror: Int): Props = 231 | Props(classOf[Master], workTimeout, mirrors, threadsPerMirror) 232 | 233 | case class Ack(downloadId: String) 234 | 235 | private sealed trait WorkerStatus 236 | private case object Idle extends WorkerStatus 237 | private case class Busy(job: MirroredDownloadJob, deadline: Deadline) extends WorkerStatus 238 | private case class WorkerState(ref: ActorRef, status: WorkerStatus) 239 | 240 | private case object CleanupTick 241 | 242 | } -------------------------------------------------------------------------------- /download/src/main/scala/org/dbpedia/extraction/dump/download/actors/Worker.scala: -------------------------------------------------------------------------------- 1 | package org.dbpedia.extraction.dump.download.actors 2 | 3 | import akka.actor._ 4 | import scala.concurrent.duration._ 5 | import java.util.UUID 6 | import akka.actor.SupervisorStrategy.{Stop, Restart} 7 | import org.dbpedia.extraction.dump.download.actors.message._ 8 | import GeneralMessage.ShutdownCluster 9 | import scala.language.postfixOps 10 | import org.dbpedia.extraction.dump.download.actors.Worker.DownloadComplete 11 | import scala.Some 12 | import akka.actor.OneForOneStrategy 13 | import akka.contrib.pattern.ClusterClient.SendToAll 14 | import org.dbpedia.extraction.dump.download.actors.message.DownloadJob 15 | import akka.actor.Terminated 16 | import akka.actor.DeathPactException 17 | 18 | /** 19 | * Worker actor that runs on each worker node. This dispatches a download job to a child DownloadJobRunner actor 20 | * which manages download and a DownloadProgressTracker to send progress reports back to the Worker. 21 | * 22 | * @param clusterClient Akka ClusterClient that acts as a proxy to the master 23 | * @param downloadRunnerProps Props for the downloadRunner actor. 
See Worker.props() 24 | * @param maxDuplicateProgress Maximum number of consecutive duplicate progress read bytes to tolerate 25 | * @param registerInterval The worker registers itself with the master every registerInterval 26 | */ 27 | class Worker(clusterClient: ActorRef, downloadRunnerProps: Props, maxDuplicateProgress: Int, registerInterval: FiniteDuration) 28 | extends Actor with ActorLogging 29 | { 30 | 31 | import MasterWorkerMessage._ 32 | import WorkerProgressMessage._ 33 | import context.dispatcher 34 | 35 | def scheduler = context.system.scheduler 36 | 37 | val workerId = UUID.randomUUID().toString 38 | 39 | // Register to the master at specific intervals. 40 | val registerTask = context.system.scheduler.schedule(0.seconds, registerInterval, clusterClient, 41 | SendToAll("/user/master/active", RegisterWorker(workerId))) 42 | 43 | val downloadRunner = context.watch(context.actorOf(downloadRunnerProps, "runner")) 44 | 45 | var currentDownloadId: Option[String] = None 46 | 47 | private var totalBytes = 0l 48 | private var currentBytes = 0l 49 | private var progressDelays = 0 50 | 51 | def downloadId: String = currentDownloadId match 52 | { 53 | case Some(workId) => workId 54 | case None => throw new IllegalStateException("Not working") 55 | } 56 | 57 | override def supervisorStrategy = 58 | OneForOneStrategy() 59 | { 60 | case _: ActorInitializationException => Stop 61 | case _: DeathPactException => Stop 62 | case _: Exception => 63 | currentDownloadId foreach (workId => sendToMaster(DownloadFailed(workerId, workId))) 64 | context.become(idle) 65 | Restart 66 | } 67 | 68 | override def postStop(): Unit = registerTask.cancel() 69 | 70 | def receive = idle 71 | 72 | def idle: Receive = 73 | { 74 | case ShutdownCluster => // Master sends ShutdownCluster 75 | sendToMaster(RemoveWorker(workerId)) 76 | scheduler.scheduleOnce(5 seconds) 77 | { 78 | registerTask.cancel() 79 | context.stop(downloadRunner) 80 | context.stop(self) 81 | context.system.shutdown() 82 | } 83 | 84 | case DownloadIsReady => // begin 3-way handshake to get download job from master 85 | sendToMaster(WorkerRequestsDownload(workerId)) 86 | 87 | case job @ MirroredDownloadJob(_, DownloadJob(downloadId, _)) => // receive new download job 88 | log.info("Got download job: {}", job) 89 | currentDownloadId = Some(downloadId) 90 | 91 | // reset state variables for new download job 92 | currentBytes = 0 93 | totalBytes = 0 94 | progressDelays = 0 95 | 96 | downloadRunner ! job 97 | context.become(working) 98 | } 99 | 100 | def working: Receive = 101 | { 102 | case p @ ProgressStart(total) => 103 | sendToMaster(ProgressReport(workerId, downloadId, p)) 104 | if(totalBytes == 0) totalBytes = total 105 | 106 | case p @ Progress(bytes) => 107 | sendToMaster(ProgressReport(workerId, downloadId, p)) 108 | 109 | // check if number of bytes downloaded has increased. 110 | if(bytes > currentBytes) 111 | { 112 | currentBytes = bytes 113 | progressDelays = 0 114 | } 115 | else 116 | { 117 | progressDelays += 1 118 | } 119 | 120 | if(progressDelays > maxDuplicateProgress && totalBytes != bytes) // too many progress delays? 121 | { 122 | val delay = progressDelays * downloadRunnerProps.args(0).asInstanceOf[FiniteDuration].toSeconds 123 | log.info(s"Download progress of $currentDownloadId has stagnated. 
No update occurred in $delay seconds!") 124 | sendToMaster(DownloadFailed(workerId, currentDownloadId.get)) 125 | } 126 | 127 | case DownloadComplete(output, bytes) => // DownloadJobRunner sends this upon completion 128 | log.info("Download is complete. Output file: {}. Total bytes: {}", output, bytes) 129 | sendToMaster(DownloadIsDone(workerId, downloadId, output, bytes)) 130 | context.setReceiveTimeout(10.seconds) 131 | context.become(waitForDownloadIsDoneAck(output, bytes)) // Send news of finished download to Master and wait for ACK. 132 | 133 | case ShutdownCluster => 134 | log.info("Yikes. Master told me to shutdown, while I'm downloading.") 135 | 136 | case _: MirroredDownloadJob => 137 | log.info("Yikes. Master gave me a download job, while I'm downloading.") 138 | } 139 | 140 | def waitForDownloadIsDoneAck(outputFilePath: String, bytes: Long): Receive = 141 | { 142 | case Ack(id) if id == downloadId => 143 | sendToMaster(WorkerRequestsDownload(workerId)) 144 | context.setReceiveTimeout(Duration.Undefined) 145 | context.become(idle) 146 | case ReceiveTimeout => 147 | log.info("No ACK from master, retrying") 148 | sendToMaster(DownloadIsDone(workerId, downloadId, outputFilePath, bytes)) 149 | } 150 | 151 | override def unhandled(message: Any): Unit = message match 152 | { 153 | case Terminated(`downloadRunner`) => context.stop(self) 154 | case DownloadIsReady => 155 | case _ => super.unhandled(message) 156 | } 157 | 158 | def sendToMaster(msg: Any): Unit = 159 | { 160 | clusterClient ! SendToAll("/user/master/active", msg) 161 | } 162 | } 163 | 164 | object Worker 165 | { 166 | def props(clusterClient: ActorRef, downloadRunnerProps: Props, maxDuplicateProgress: Int, registerInterval: FiniteDuration = 10.seconds): Props = 167 | Props(classOf[Worker], clusterClient, downloadRunnerProps, maxDuplicateProgress, registerInterval) 168 | 169 | case class DownloadComplete(outputFilePath: String, bytes: Long) 170 | 171 | } -------------------------------------------------------------------------------- /download/src/main/scala/org/dbpedia/extraction/dump/download/actors/message/DownloadJob.scala: -------------------------------------------------------------------------------- 1 | package org.dbpedia.extraction.dump.download.actors.message 2 | 3 | import java.net.URL 4 | import org.dbpedia.extraction.dump.download.actors.message.WorkerProgressMessage.ProgressMessage 5 | 6 | /** 7 | * Download job used by the actor framework. 8 | * 9 | * @param downloadId Unique job ID 10 | * @param file URL information 11 | */ 12 | case class DownloadJob(downloadId: String, file: DumpFile) 13 | 14 | /** 15 | * Download job wrapped along with the mirror to use for downloading. 16 | * This contains all the information needed by DownloadJobRunner to perform the job. 17 | * 18 | * @param baseUrl URL of the mirror to download from 19 | * @param job download job 20 | */ 21 | case class MirroredDownloadJob(baseUrl: URL, job: DownloadJob) 22 | 23 | /** 24 | * Download information for single wiki dump file. 25 | * 26 | * @param baseDir Base directory on Hadoop file system (HDFS for distributed downloads) 27 | * @param wikiSuffix Wiki name suffix (eg. wiki) 28 | * @param language Language wikiCode 29 | * @param date YYYYMMDD date string 30 | * @param fileName URL file name 31 | */ 32 | case class DumpFile(baseDir: String, wikiSuffix: String, language: String, date: String, fileName: String) 33 | 34 | /** 35 | * Download job used by the actor framework. 
36 | * 37 | * @param job MirroredDownloadJob 38 | * @param outputPath Output path name in scheme://path/fileName format 39 | * @param bytes Total bytes downloaded 40 | */ 41 | case class DownloadResult(job: MirroredDownloadJob, outputPath: String, bytes: Long) 42 | 43 | /** 44 | * Progress reports published by Master. 45 | * 46 | * @param downloadId Unique job ID 47 | * @param progress Progress message 48 | */ 49 | case class DownloadProgress(downloadId: String, progress: ProgressMessage) -------------------------------------------------------------------------------- /download/src/main/scala/org/dbpedia/extraction/dump/download/actors/message/DownloaderProgressMessage.scala: -------------------------------------------------------------------------------- 1 | package org.dbpedia.extraction.dump.download.actors.message 2 | 3 | object DownloaderProgressMessage 4 | { 5 | // From Downloader or DownloadJobRunner to DownloadProgressTracker 6 | case class Read(bytesRead: Long) 7 | case class Start(totalBytes: Long) // totalBytes = total content length 8 | case object Stop 9 | 10 | // From DownloadProgressTracker to DownloadJobRunner 11 | case class ProgressEnd(bytes: Long) 12 | } 13 | -------------------------------------------------------------------------------- /download/src/main/scala/org/dbpedia/extraction/dump/download/actors/message/GeneralMessage.scala: -------------------------------------------------------------------------------- 1 | package org.dbpedia.extraction.dump.download.actors.message 2 | 3 | object GeneralMessage 4 | { 5 | // This message is used by different actors to propagate a cluster shutdown. 6 | case object ShutdownCluster 7 | 8 | // This message is published by the master when the pending download queue is empty. 9 | case object MasterQueueEmpty 10 | } 11 | -------------------------------------------------------------------------------- /download/src/main/scala/org/dbpedia/extraction/dump/download/actors/message/MasterWorkerMessage.scala: -------------------------------------------------------------------------------- 1 | package org.dbpedia.extraction.dump.download.actors.message 2 | 3 | import org.dbpedia.extraction.dump.download.actors.message.WorkerProgressMessage.ProgressMessage 4 | 5 | object MasterWorkerMessage 6 | { 7 | // Messages from Workers 8 | case class RegisterWorker(workerId: String) 9 | case class WorkerRequestsDownload(workerId: String) 10 | case class DownloadIsDone(workerId: String, downloadId: String, outputPath: String, bytes: Long) 11 | case class DownloadFailed(workerId: String, downloadId: String) 12 | case class ProgressReport(workerId: String, downloadId: String, progress: ProgressMessage) // progress = number of bytes read till now 13 | case class RemoveWorker(workerId: String) 14 | 15 | // Messages to Workers 16 | case object DownloadIsReady 17 | case class Ack(id: String) 18 | } 19 | -------------------------------------------------------------------------------- /download/src/main/scala/org/dbpedia/extraction/dump/download/actors/message/WorkerProgressMessage.scala: -------------------------------------------------------------------------------- 1 | package org.dbpedia.extraction.dump.download.actors.message 2 | 3 | object WorkerProgressMessage 4 | { 5 | // DownloadProgressTracker to Worker 6 | trait ProgressMessage 7 | case class Progress(bytes: Long) extends ProgressMessage 8 | case class ProgressStart(bytes: Long) extends ProgressMessage 9 | } 10 | -------------------------------------------------------------------------------- 
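
The message classes above define the full master–worker protocol. As a rough, hypothetical illustration (not part of the framework), the sketch below constructs one message of each kind in the order they normally flow between an idle worker and the master; the worker ID, download ID, mirror URL, dump file and byte count are made-up example values, and the message classes are assumed to be on the project classpath.

```scala
// Hypothetical walk-through of the master/worker handshake using the message
// types defined above. All concrete values are invented for illustration.
import java.net.URL
import java.util.UUID

import org.dbpedia.extraction.dump.download.actors.message.{DownloadJob, DumpFile, MirroredDownloadJob}
import org.dbpedia.extraction.dump.download.actors.message.MasterWorkerMessage._

object ProtocolSketch extends App
{
  val workerId = UUID.randomUUID().toString

  // 1. The worker registers itself with the master at regular intervals.
  val register = RegisterWorker(workerId)

  // 2. The master tells idle workers DownloadIsReady; an idle worker replies
  //    by requesting a concrete job (the 3-way handshake in Worker.idle).
  val request = WorkerRequestsDownload(workerId)

  // 3. The master dequeues a pending DownloadJob, pairs it with a free mirror
  //    and sends the wrapped job to the worker.
  val job = DownloadJob(UUID.randomUUID().toString,
    DumpFile("/tmp/basedir", "wiki", "en", "20160407", "pages-articles.xml.bz2"))
  val mirrored = MirroredDownloadJob(new URL("http://dumps.wikimedia.org/"), job)

  // 4. While downloading, the worker keeps sending ProgressReport messages;
  //    on completion it sends DownloadIsDone and waits for the master's Ack.
  val done = DownloadIsDone(workerId, job.downloadId,
    "hdfs://namenode/tmp/basedir/enwiki/20160407/pages-articles.xml.bz2", 1234567L)
  val ack = Ack(job.downloadId)

  Seq(register, request, mirrored, done, ack).foreach(println)
}
```

In the real system these messages are not built in one place: they are exchanged over Akka's ClusterClient proxy (see `sendToMaster` in Worker above), and the master publishes results and progress on the pub-sub topics defined in the Master companion object.
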
/download/src/main/scala/org/dbpedia/extraction/util/RemoteExecute.scala: -------------------------------------------------------------------------------- 1 | package org.dbpedia.extraction.util 2 | 3 | import com.jcraft.jsch.{JSch, JSchException, ChannelExec, Session} 4 | import java.io.IOException 5 | 6 | /** 7 | * Utility trait for creating an SSH session and executing remote commands. 8 | */ 9 | trait RemoteExecute 10 | { 11 | val jsch = new JSch() 12 | 13 | def addIdentity(privateKeyPath: String, passphrase: String) = jsch.addIdentity(privateKeyPath, passphrase) 14 | 15 | def addIdentity(privateKeyPath: String) = jsch.addIdentity(privateKeyPath) 16 | 17 | def createSession(userName: String, host: String): Session = 18 | { 19 | val session = jsch.getSession(userName, host) 20 | session.setConfig("UserKnownHostsFile", "/dev/null") 21 | session.setConfig("CheckHostIP", "no") 22 | session.setConfig("StrictHostKeyChecking", "no") 23 | session.connect() 24 | session 25 | } 26 | 27 | def execute(session: Session, command: String): String = 28 | { 29 | val outputBuffer = new StringBuilder() 30 | 31 | val channel = session.openChannel("exec").asInstanceOf[ChannelExec] 32 | channel.setCommand(command) 33 | channel.connect() 34 | channel.setErrStream(System.err) 35 | 36 | val commandOutput = channel.getInputStream 37 | var readByte = commandOutput.read() 38 | 39 | while (readByte != 0xffffffff) 40 | { 41 | outputBuffer.append(readByte) 42 | readByte = commandOutput.read() 43 | } 44 | 45 | channel.disconnect() 46 | outputBuffer.toString() 47 | } 48 | } 49 | -------------------------------------------------------------------------------- /download/src/test/resources/dist-download.properties: -------------------------------------------------------------------------------- 1 | # NOTE: format is not java.util.Properties, but org.dbpedia.extraction.dump.download.DownloadConfig 2 | 3 | #distconfig=/example/path/file.cfg 4 | # Path to existing distributed download configuration text file (UTF-8) whose lines contain arguments 5 | # in the format given here. Absolute or relative path. File paths in that config file will be interpreted 6 | # relative to the config file. 7 | 8 | #extraction-framework-home=/path/to/distributed-extraction-framework 9 | # This must be set to the absolute path to the distributed extraction framework (containing this module) 10 | # in all nodes. No default value is set. 11 | 12 | mirrors=http://dumps.wikimedia.org/,http://wikipedia.c3sl.ufpr.br/,http://ftp.fi.muni.cz/pub/wikimedia/,http://dumps.wikimedia.your.org/ 13 | # List of mirrors to download from in the form of comma-separated URLs. Choose from the list of mirrors at: 14 | # http://meta.wikimedia.org/wiki/Mirroring_Wikimedia_project_XML_dumps#Current_Mirrors 15 | # Example: mirrors=http://dumps.wikimedia.org/,http://wikipedia.c3sl.ufpr.br,http://ftp.fi.muni.cz/pub/wikimedia/,http://dumps.wikimedia.your.org/ 16 | 17 | threads-per-mirror=2 18 | # Number of simultaneous downloads from each mirror per slave node. Set to 2 by default. 19 | 20 | workers-per-slave=2 21 | # Number of workers to run per slave. This is set to 2 by default. 22 | # Setting it to (no. of mirrors) * threads-per-mirror is recommended for exploiting maximum parallelism. On the other hand, 23 | # if your whole cluster has only one public facing IP it is better to set this to a low number like 1. 
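#
# For illustration only (hypothetical numbers): with the four mirrors listed
# above and threads-per-mirror=2, setting workers-per-slave to
# 4 mirrors * 2 threads-per-mirror = 8 would exploit every download slot
# available to a single slave node.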
24 | 25 | progress-interval=2 26 | # Progress report time interval in secs - the driver node receives real-time progress reports for running downloads from the workers. 27 | # If a worker fails to send a progress report of the current download under the given timeout (the timeout is set to something 28 | # like progressReportInterval + 2 to be safe) the download job will be marked as failed and inserted back into the pending 29 | # download queue. This is 2 seconds by default. 30 | 31 | max-duplicate-progress-reports=30 32 | # Maximum number of consecutive duplicate progress read bytes to tolerate. The workers keep track of download progress; 33 | # if a download gets stuck consecutive progress reports will contain the same number of bytes downloaded. If this is set 34 | # to 30 (not recommended to go below that), the worker will declare a job as failed only after getting the same progress 35 | # report for 30 times. By default set to 30. 36 | 37 | local-temp-dir=/tmp 38 | # Local temporary directory on worker nodes. Each dump file/chunk is downloaded to this directory before being moved to 39 | # the configured Hadoop file system. This is /tmp by default. 40 | 41 | #private-key=/path/to/id_rsa 42 | # Optional identity file to connect to cluster nodes via SSH. 43 | 44 | #ssh-passphrase=passphrase 45 | # Optional passphrase for SSH private key. 46 | 47 | sequential-languages=false 48 | # If each language consists of multiple dump files (eg. enwiki-latest-pages-articles1.xml-p000000010p000010000.bz2) 49 | # they are downloaded in parallel. Multiple languages are downloaded in parallel too, giving us 2 levels of 50 | # parallelism. If sequentialLanguages is set to true, one language is downloaded at a time, otherwise, 51 | # all languages are downloaded in parallel. 52 | 53 | #hadoop-coresite-xml-path=/path/to/core-site.xml 54 | # Path to hadoop core-site.xml configuration file. 55 | 56 | #hadoop-hdfssite-xml-path=/path/to/hdfs-site.xml 57 | # Path to hadoop hdfs-site.xml configuration file. 58 | 59 | #hadoop-mapredsite-xml-path=/path/to/mapred-site.xml 60 | # Path to hadoop mapred-site.xml configuration file. 61 | 62 | master=127.0.0.1 63 | # Master node host. 64 | 65 | slaves=127.0.0.1 66 | # List of comma-separated slave hosts. Example: slaves=node1,node2,node3 67 | 68 | base-dir=/tmp/basedir 69 | # Replace by your target folder. If this is omitted here, it is read from the general configuration file if there is any. 70 | 71 | #join=akka.tcp://Workers@hostname:port 72 | # This variable needs to be specified when starting up a worker manually. Do not use this variable unless you know what you're 73 | # doing. The driver node automatically starts up workers on the slaves and takes care of this variable. Never set this variable 74 | # when starting up the master/driver. -------------------------------------------------------------------------------- /download/src/test/resources/download.properties: -------------------------------------------------------------------------------- 1 | # NOTE: format is not java.util.Properties, but org.dbpedia.extraction.dump.download.DownloadConfig 2 | 3 | # Default download server. It lists mirrors which may be faster. 4 | base-url=http://dumps.wikimedia.org/ 5 | 6 | # Replace by your target folder. 7 | base-dir=/home/gonephishing/dbpedia-extraction/distributed-extraction-framework/dumps/files 8 | 9 | # This setting is recommended for large languages that have part files (eg. en, fr). See below. Replace xx/yy by your language. 
10 | #download=xx,yy:@pages-articles\d+\.xml.*\.bz2 11 | download=en:pages-articles1.xml-p000000010p000010000.bz2 12 | 13 | # This setting should be provided for small languages that have no part files (eg. li) 14 | #download=xx,yy:pages-articles.xml.bz2 15 | 16 | # You may provide multiple "download=" lines for different types of languages, just like above. 17 | 18 | ###### Download part files ###### 19 | # 20 | # Please make sure that the regex actually matches the format used for xx dumps 21 | # by checking http://dumps.wikimedia.org/xxwiki/yyyymmdd 22 | # 23 | # Example: 24 | # enwiki => enwiki-20131120-pages-articles1.xml-p000000010p000010000.bz2 hence @pages-articles\d+\.xml-p\d+p\d+\.bz2 matches 25 | # frwiki => frwiki-20131120-pages-articles1.xml.bz2 hence @pages-articles\d+\.xml\.bz2 matches (the previous regex does not!) 26 | # 27 | # NOTE: @pages-articles\d+\.xml.*\.bz2 is especially recommended when using the distributed downloader because it captures both 28 | # the above types and exploits maximum parallelism by allowing multiple part files to be downloaded and processed simultaneously. 29 | # 30 | # Remember that certain languages have small dumps and therefore no part files at all. They need to be handled with only 31 | # pages-articles.xml.bz2. Example with both small and large languages (setting download multiple times works like appending; so 32 | # adding both download's below is perfectly valid): 33 | # 34 | # download=en,fr:@pages-articles\d+\.xml.*\.bz2 35 | # download=li,bn,ilo:pages-articles.xml.bz2 36 | # 37 | # commonswiki => it does not have part files! This is true for other wikis as well. In this case xx:pages-articles.xml.bz2 38 | # shoud be used (e.g. commons:pages-articles.xml.bz2 or cowiki:pages-articles.xml.bz2) 39 | # 40 | # download=xx:@pages-articles\d+\.xml-p\d+p\d+\.bz2 41 | # download=xx:@pages-articles\d+\.xml.*\.bz2 42 | 43 | # Only needed for the ImageExtractor 44 | # download=commons:pages-articles.xml.bz2 45 | 46 | # Unzip files while downloading? Not necessary, extraction will unzip on the fly. Let's save space. 47 | unzip=false 48 | 49 | # Sometimes connecting to the server fails, so we try five times with pauses of 10 seconds. 
50 | retry-max=5 51 | retry-millis=1000 52 | -------------------------------------------------------------------------------- /extraction/pom.xml: -------------------------------------------------------------------------------- 1 | 2 | 5 | 6 | distributed-extraction 7 | org.dbpedia 8 | 4.1-SNAPSHOT 9 | 10 | 4.0.0 11 | 12 | org.dbpedia.distributed-extraction 13 | extraction 14 | 4.1-SNAPSHOT 15 | DBpedia Distributed Dump Extractor 16 | 17 | 18 | 19 | 20 | 21 | org.apache.maven.plugins 22 | maven-shade-plugin 23 | 1.7 24 | 25 | 26 | package 27 | 28 | shade 29 | 30 | 31 | 32 | 33 | 34 | 35 | net.alchim31.maven 36 | scala-maven-plugin 37 | 38 | 39 | 40 | 41 | seq-extraction 42 | org.dbpedia.extraction.dump.extract.Extraction 43 | 44 | 45 | -server 46 | 58 | 59 | 60 | 61 | 62 | extraction 63 | org.dbpedia.extraction.dump.extract.DistExtraction 64 | 65 | 66 | -server 67 | 79 | 80 | 81 | 82 | 83 | 84 | 85 | 86 | 87 | 88 | 89 | org.dbpedia.distributed-extraction 90 | common 91 | 4.1-SNAPSHOT 92 | 93 | 94 | 95 | org.dbpedia.extraction 96 | core 97 | 4.1 98 | 99 | 100 | 101 | org.dbpedia.extraction 102 | dump 103 | 4.1 104 | 105 | 106 | 107 | org.dbpedia.extraction 108 | scripts 109 | 4.1 110 | 111 | 112 | 113 | org.apache.spark 114 | spark-core_2.11 115 | ${spark.version} 116 | provided 117 | 118 | 119 | 120 | org.apache.hadoop 121 | hadoop-client 122 | ${hadoop.version} 123 | 124 | 125 | 126 | org.apache.hadoop 127 | hadoop-common 128 | ${hadoop.version} 129 | 130 | 131 | 132 | org.scalatest 133 | scalatest_2.11 134 | test 135 | 136 | 137 | 138 | junit 139 | junit 140 | 4.8.2 141 | test 142 | 143 | 144 | 145 | 146 | -------------------------------------------------------------------------------- /extraction/src/main/scala/org/apache/spark/ui/jobs/DBpediaJobProgressListener.scala: -------------------------------------------------------------------------------- 1 | package org.apache.spark.ui.jobs 2 | 3 | import org.apache.spark.{Logging, SparkConf} 4 | import org.apache.spark.scheduler._ 5 | import org.apache.spark.scheduler.SparkListenerTaskEnd 6 | import org.apache.spark.scheduler.SparkListenerJobEnd 7 | import org.apache.spark.scheduler.SparkListenerStageSubmitted 8 | import org.apache.spark.scheduler.SparkListenerStageCompleted 9 | import org.apache.spark.scheduler.SparkListenerTaskStart 10 | import org.apache.spark.scheduler.SparkListenerJobStart 11 | import org.dbpedia.extraction.util.StringUtils 12 | import scala.collection.mutable 13 | 14 | /** 15 | * SparkListener implementation that provides real-time logging for jobs, tasks and stages in a 16 | * friendly way omitting most of the details that can be had using Spark's default logging 17 | * system. 18 | * 19 | * This is in the org.apache.spark.ui.jobs package because it needs to extend 20 | * org.apache.spark.ui.jobs.JobProgressListener which is private[spark]. 21 | */ 22 | class DBpediaJobProgressListener(sc: SparkConf) extends JobProgressListener(sc) with Logging 23 | { 24 | /** 25 | * The time when this class was created (usually along with the SparkContext). 26 | * Milliseconds since midnight, January 1, 1970 UTC. 
27 | */ 28 | val startTime = System.currentTimeMillis() 29 | 30 | val stageNumTasks = mutable.Map[Int, Int]() // Maps stageId to number of tasks 31 | 32 | override def onStageSubmitted(stageSubmitted: SparkListenerStageSubmitted): Unit = 33 | { 34 | super.onStageSubmitted(stageSubmitted) 35 | val stage = stageSubmitted.stageInfo 36 | val numTasks = stage.numTasks 37 | stageNumTasks.synchronized(stageNumTasks(stage.stageId) = numTasks) 38 | val time = prettyTime(stage.submissionTime.getOrElse(startTime)) 39 | logInfo("Stage #%d: Starting stage %s with %d tasks at %s".format(stage.stageId, stage.name, numTasks, time)) 40 | } 41 | 42 | override def onStageCompleted(stageCompleted: SparkListenerStageCompleted): Unit = 43 | { 44 | super.onStageCompleted(stageCompleted) 45 | val stage = stageCompleted.stageInfo 46 | val time = prettyTime(stage.completionTime.getOrElse(startTime)) 47 | logInfo("Stage #%d: Finished stage %s at %s".format(stage.stageId, stage.name, time)) 48 | } 49 | 50 | override def onTaskStart(taskStart: SparkListenerTaskStart): Unit = 51 | { 52 | super.onTaskStart(taskStart) 53 | val executor = taskStart.taskInfo.executorId 54 | val host = taskStart.taskInfo.host 55 | val time = prettyTime(taskStart.taskInfo.launchTime) 56 | val taskId = taskStart.taskInfo.taskId 57 | val stageId = taskStart.taskInfo.taskId 58 | // Get TaskInfos for this stage to compute number of tasks 59 | val numTasks = this.stageIdToInfo.size 60 | //val numTasks = this.stageIdToTaskInfos(stageId).size 61 | logInfo("Stage #%d: Started task #%d on host %s, executor %s at %s. Total tasks submitted: %d".format(stageId, taskId, host, executor, time, numTasks)) 62 | } 63 | 64 | override def onTaskEnd(taskEnd: SparkListenerTaskEnd): Unit = 65 | { 66 | super.onTaskEnd(taskEnd) 67 | val time = prettyTime(taskEnd.taskInfo.finishTime) 68 | val taskId = taskEnd.taskInfo.taskId 69 | val stageId = taskEnd.stageId 70 | val totalNumTasks = stageNumTasks(taskEnd.stageId) 71 | // Get TaskInfos for this stage to compute number of tasks 72 | val numTasks = this.stageIdToInfo.size 73 | //val numTasks = this.stageIdToTaskInfos(stageId).size 74 | // Wrap in try/catch to return 0 if no completed/failed tasks for stageId are found in the maps. 75 | val finished = try { this.numCompletedStages } catch { case ex: NoSuchElementException =>0 } 76 | val failed = try { this.numFailedStages } catch { case ex: NoSuchElementException =>0 } 77 | //val finished = try { this.stageIdToTasksComplete(stageId) } catch { case ex: NoSuchElementException => 0 } 78 | //val failed = try { this.stageIdToTasksFailed(stageId) } catch { case ex: NoSuchElementException => 0 } 79 | logInfo("Stage #%d: Finished task #%d at %s. 
Completed: %d/%d Failed: %d/%d Total Progress: %d/%d".format(stageId, taskId, time, finished, numTasks, failed, numTasks, finished, totalNumTasks)) 80 | } 81 | 82 | override def onJobStart(jobStart: SparkListenerJobStart): Unit = 83 | { 84 | super.onJobStart(jobStart) 85 | logInfo("Started job #" + jobStart.jobId) 86 | } 87 | 88 | override def onJobEnd(jobEnd: SparkListenerJobEnd): Unit = 89 | { 90 | super.onJobEnd(jobEnd) 91 | logInfo("Finished job #" + jobEnd.jobId) 92 | } 93 | 94 | override def onTaskGettingResult(taskGettingResult: SparkListenerTaskGettingResult): Unit = 95 | { 96 | super.onTaskGettingResult(taskGettingResult) 97 | } 98 | 99 | private def prettyTime(time: Long) = StringUtils.prettyMillis(time - startTime) 100 | } -------------------------------------------------------------------------------- /extraction/src/main/scala/org/dbpedia/extraction/destinations/DistDeduplicatingWriterDestination.scala: -------------------------------------------------------------------------------- 1 | package org.dbpedia.extraction.destinations 2 | 3 | import org.apache.hadoop.fs.Path 4 | import org.apache.hadoop.conf.Configuration 5 | import org.apache.spark.rdd.RDD 6 | import org.apache.hadoop.io.Text 7 | import org.dbpedia.extraction.spark.io.QuadSeqWritable 8 | import org.dbpedia.extraction.spark.io.output.DBpediaCompositeOutputFormat 9 | import org.apache.spark.SparkContext._ 10 | 11 | /** 12 | * Destination where RDF graphs are deduplicated and written to a Hadoop Path. 13 | * 14 | * @param path Path used by DBpediaCompositeOutputFormat to write outputs 15 | * @param hadoopConfiguration Hadoop Configuration object 16 | */ 17 | class DistDeduplicatingWriterDestination(path: Path, hadoopConfiguration: Configuration) extends DistDestination 18 | { 19 | override def open() = () 20 | 21 | /** 22 | * Writes RDD of quads (after extracting unique quads) to path using DBpediaCompositeOutputFormat. 23 | * 24 | * @param rdd RDD[ Seq[Quad] ] 25 | */ 26 | override def write(rdd: RDD[Seq[Quad]]) 27 | { 28 | rdd.flatMap 29 | { 30 | quads => 31 | quads.distinct.groupBy(quad => new Text(quad.dataset)).toSeq.map 32 | { 33 | case (key: Text, quads: Seq[Quad]) => (key, new QuadSeqWritable(quads)) 34 | } 35 | }.saveAsNewAPIHadoopFile(path.toString, 36 | classOf[Text], 37 | classOf[QuadSeqWritable], 38 | classOf[DBpediaCompositeOutputFormat], 39 | hadoopConfiguration) 40 | } 41 | 42 | override def close() = () 43 | } 44 | -------------------------------------------------------------------------------- /extraction/src/main/scala/org/dbpedia/extraction/destinations/DistDestination.scala: -------------------------------------------------------------------------------- 1 | package org.dbpedia.extraction.destinations 2 | 3 | import org.apache.spark.rdd.RDD 4 | 5 | /** 6 | * A distributed destination for RDF quads. 7 | */ 8 | trait DistDestination 9 | { 10 | /** 11 | * Opens this destination. This method should only be called once during the lifetime 12 | * of a destination, and it should not be called concurrently with other methods of this class. 13 | */ 14 | def open(): Unit 15 | 16 | /** 17 | * Writes RDD of quads to this destination. 18 | * 19 | * @param rdd RDD[ Seq[Quad] ] 20 | */ 21 | def write(rdd: RDD[Seq[Quad]]): Unit 22 | 23 | /** 24 | * Closes this destination. This method should only be called once during the lifetime 25 | * of a destination, and it should not be called concurrently with other methods of this class. 
26 | */ 27 | def close(): Unit 28 | } 29 | -------------------------------------------------------------------------------- /extraction/src/main/scala/org/dbpedia/extraction/destinations/DistMarkerDestination.scala: -------------------------------------------------------------------------------- 1 | package org.dbpedia.extraction.destinations 2 | 3 | import org.dbpedia.extraction.util.FileLike 4 | import java.io.IOException 5 | 6 | /** 7 | * MakerDestination that wraps a DistDestination. The code has been taken from MakerDestination. 8 | * 9 | * Handles a marker file that signals that the extraction is either running ('start mode') 10 | * or finished ('end mode'). 11 | * 12 | * In 'start mode', the file is created before the extraction starts (it must not already exist) 13 | * and deleted after the extraction ends. 14 | * 15 | * In 'end mode', the file is deleted before the extraction starts (if it already exists) 16 | * and re-created after the extraction ends. 17 | * 18 | * @param file marker file 19 | * @param start 'start mode' if true, 'end mode' if false. 20 | */ 21 | class DistMarkerDestination(destination: DistDestination, file: FileLike[_], start: Boolean) 22 | extends DistWrapperDestination(destination) 23 | { 24 | override def open(): Unit = 25 | { 26 | if (start) create() else delete() 27 | super.open() 28 | } 29 | 30 | override def close(): Unit = 31 | { 32 | super.close() 33 | if (!start) create() else delete() 34 | } 35 | 36 | private def create(): Unit = 37 | { 38 | if (file.exists) throw new IOException("file '" + file + "' already exists") 39 | file.outputStream().close() 40 | } 41 | 42 | private def delete(): Unit = 43 | { 44 | if (file.exists) file.delete() 45 | } 46 | } -------------------------------------------------------------------------------- /extraction/src/main/scala/org/dbpedia/extraction/destinations/DistWrapperDestination.scala: -------------------------------------------------------------------------------- 1 | package org.dbpedia.extraction.destinations 2 | 3 | import org.apache.spark.rdd.RDD 4 | 5 | /** 6 | * Base class for DistDestination objects that forward most calls to another destination. 7 | */ 8 | abstract class DistWrapperDestination(destination: DistDestination) extends DistDestination 9 | { 10 | override def open() = destination.open() 11 | 12 | def write(rdd: RDD[Seq[Quad]]) = destination.write(rdd) 13 | 14 | override def close() = destination.close() 15 | } -------------------------------------------------------------------------------- /extraction/src/main/scala/org/dbpedia/extraction/dump/extract/DistConfig.scala: -------------------------------------------------------------------------------- 1 | package org.dbpedia.extraction.dump.extract 2 | 3 | import java.util.Properties 4 | import scala.collection.JavaConversions.asScalaSet 5 | import org.dbpedia.extraction.util.ConfigUtils.getValue 6 | import java.io.File 7 | import org.apache.spark.storage.StorageLevel 8 | import java.net.URI 9 | import org.apache.log4j.Level 10 | import org.dbpedia.extraction.util.HadoopConfigurable 11 | import org.apache.hadoop.fs.Path 12 | 13 | /** 14 | * Class for distributed configuration. Delegates general stuff except directory/file properties to Config. 15 | * 16 | * Note that dumpDir, ontologyFile and mappingsDir are first checked in distConfigProps; 17 | * if not found they're checked in extractionConfigProps. 
18 | * 19 | * @param distConfigProps Distributed extraction configuration properties 20 | * @param extractionConfigProps General extraction framework configuration properties 21 | * @see Config 22 | */ 23 | class DistConfig(distConfigProps: Properties, extractionConfigProps: Properties, val extractionConfigFile: URI) extends HadoopConfigurable 24 | { 25 | private val extractionConfig = new ExtractionConfig() 26 | 27 | /** It is recommended that spark-home and spark-master are explicitly provided. */ 28 | val sparkHome = distConfigProps.getProperty("spark-home", sys.env.get("SPARK_HOME").getOrElse("")) 29 | 30 | /** By default assume master is runnning locally; use 4 cores */ 31 | val sparkMaster = distConfigProps.getProperty("spark-master", "local[4]") 32 | 33 | /** Shows up on Spark Web UI */ 34 | val sparkAppName = distConfigProps.getProperty("spark-appname", "dbpedia-distributed-extraction-framework") 35 | 36 | /** 37 | * The StorageLevel to be used when calling RDD.persist() unless otherwise specified. Choose any of these: 38 | * MEMORY_ONLY 39 | * MEMORY_AND_DISK 40 | * MEMORY_ONLY_SER 41 | * MEMORY_AND_DISK_SER 42 | * DISK_ONLY 43 | * MEMORY_ONLY_2, MEMORY_AND_DISK_2 etc. 44 | * 45 | * By default it is set to MEMORY_AND_DISK_SER 46 | * 47 | * @see org.apache.spark.storage.StorageLevel 48 | */ 49 | val sparkStorageLevel = Option( 50 | getValue(distConfigProps, "spark-storage-level", required = false) 51 | { 52 | level => StorageLevel.getClass.getDeclaredMethod(level).invoke(StorageLevel).asInstanceOf[StorageLevel] 53 | } 54 | ).getOrElse(StorageLevel.MEMORY_AND_DISK_SER) 55 | 56 | /** Map of optional spark configuration properties. See http://spark.apache.org/docs/latest/configuration.html */ 57 | val sparkProperties = distConfigProps.stringPropertyNames().filter(_.startsWith("spark.")).map(x => (x, distConfigProps.getProperty(x))).toMap 58 | 59 | /** Path to hadoop core-site.xml */ 60 | override protected val hadoopCoreConf = distConfigProps.getProperty("hadoop-coresite-xml-path") 61 | 62 | /** Path to hadoop hdfs-site.xml */ 63 | override protected val hadoopHdfsConf = distConfigProps.getProperty("hadoop-hdfssite-xml-path") 64 | 65 | /** Path to hadoop mapred-site.xml */ 66 | override protected val hadoopMapredConf = distConfigProps.getProperty("hadoop-mapredsite-xml-path") 67 | 68 | /** This is used for setting log levels for "org.apache", "spark", "org.eclipse.jetty" and "akka" using 69 | * SparkUtils.setLogLevels(). It is WARN by default. 70 | */ 71 | val sparkLogLevel = Level.toLevel(distConfigProps.getProperty("logging-level"), Level.WARN) 72 | 73 | /** 74 | * Number of threads to use in the ExecutionContext while calling DistExtractionJob.run() on multiple 75 | * extraction jobs in parallel. 76 | * 77 | * Note that these threads on the driver node do not perform any heavy work except for executing 78 | * DistExtractionJob.run() which submits the respective Spark job to the Spark master and waits 79 | * for the job to finish. 80 | * 81 | * By default it is set to Integer.MAX_VALUE so that all extraction jobs are submitted to Spark master 82 | * simultaneously, which uses the configured scheduling mechanism to execute the jobs on the cluster. 83 | */ 84 | val extractionJobThreads = distConfigProps.getProperty("extraction-job-threads", Integer.MAX_VALUE.toString).toInt 85 | 86 | /** Whether output files should be overwritten or not (true/false). This is true by default. 
*/ 87 | val overwriteOutput = distConfigProps.getProperty("overwrite-output", "true").toBoolean 88 | 89 | /** 90 | * Whether the intermediate RDD[WikiPage] should be cached to Hadoop's filesystem (true/false). 91 | * This is false by default. 92 | * 93 | * Performance implications: 94 | * 1. Caching will make further extractions over the same dump much faster. 95 | * 2. Caching will force early evaluation of the RDD and will cause some delay before extraction. 96 | * 97 | * If you are not planning on repeated extractions over the same dump it is best to leave this as it is. 98 | */ 99 | val cacheWikiPageRDD = distConfigProps.getProperty("cache-wikipages", "false").toBoolean 100 | 101 | /** Dump directory */ 102 | val dumpDir = getPath("base-dir", pathMustExist = true) 103 | 104 | /** Local ontology file, downloaded for speed and reproducibility */ 105 | val ontologyFile = getPath("ontology", pathMustExist = false) 106 | 107 | /** Local mappings files, downloaded for speed and reproducibility */ 108 | val mappingsDir = getPath("mappings", pathMustExist = false) 109 | 110 | val requireComplete = extractionConfig.requireComplete 111 | 112 | val source = extractionConfig.source 113 | 114 | val disambiguations = extractionConfig.disambiguations 115 | 116 | val wikiName = extractionConfig.wikiName 117 | 118 | val parser = extractionConfig.parser 119 | 120 | val formats = extractionConfig.formats 121 | 122 | val extractorClasses = extractionConfig.extractorClasses 123 | 124 | val namespaces = extractionConfig.namespaces 125 | 126 | /** 127 | * Creates a Path from the given property (null if the property is absent) and wraps it in an Option. 128 | * This method first checks the distributed config properties, then the general extraction config properties. 129 | * 130 | * @param property String property key 131 | * @param pathMustExist Boolean to ensure that the Path, if obtained, actually exists. 132 | * @throws RuntimeException if the property is defined but the path does not exist 133 | * @return Option wrapping the obtained Path 134 | */ 135 | def getPath(property: String, pathMustExist: Boolean): Option[Path] = 136 | { 137 | val somePath = Option({ 138 | val distProp = getValue(distConfigProps, property, required = false)(new Path(_)) 139 | if(distProp != null) 140 | { 141 | // If property exists in distributed config file return it. 142 | distProp 143 | } 144 | else 145 | { 146 | // Or else, try the extraction config file - returns either null or a Path. 147 | getValue(extractionConfigProps, property, required = false)(new Path(_)) 148 | } 149 | }) 150 | 151 | checkPathExists(somePath, pathMustExist) 152 | } 153 | 154 | /** 155 | * Custom Config subclass that makes the File-based variables null. 156 | * 157 | * The distributed extraction framework should only work with Paths. Initialization operations on non-existent 158 | * Files may cause errors, and are not required anyway. 
159 | */ 160 | private class ExtractionConfig extends Config(extractionConfigProps) 161 | { 162 | override lazy val dumpDir: File = null 163 | override lazy val ontologyFile: File = null 164 | override lazy val mappingsDir: File = null 165 | } 166 | 167 | } 168 | -------------------------------------------------------------------------------- /extraction/src/main/scala/org/dbpedia/extraction/dump/extract/DistConfigLoader.scala: -------------------------------------------------------------------------------- 1 | package org.dbpedia.extraction.dump.extract 2 | 3 | import org.dbpedia.extraction.destinations._ 4 | import org.dbpedia.extraction.mappings._ 5 | import org.dbpedia.extraction.ontology.io.OntologyReader 6 | import org.dbpedia.extraction.sources.{Source, WikiPage, XMLSource, WikiSource} 7 | import org.dbpedia.extraction.util._ 8 | import org.dbpedia.extraction.util.RichHadoopPath.wrapPath 9 | import org.dbpedia.extraction.wikiparser.Namespace 10 | import java.io._ 11 | import java.net.URL 12 | import java.util.logging.{Level, Logger} 13 | import org.apache.spark.rdd.RDD 14 | import org.dbpedia.extraction.dump.download.Download 15 | import org.apache.hadoop.conf.Configuration 16 | import org.apache.hadoop.io.LongWritable 17 | import org.dbpedia.extraction.spark.io.WikiPageWritable 18 | import org.apache.hadoop.mapreduce.Job 19 | import org.apache.hadoop.mapreduce.lib.input.FileInputFormat 20 | import org.apache.hadoop.fs.Path 21 | import org.apache.spark.SparkContext 22 | import org.dbpedia.extraction.spark.io.input.DBpediaWikiPageInputFormat 23 | 24 | /** 25 | * Loads the dump extraction configuration. 26 | * 27 | * This class configures Spark and sets up the extractors to run using Spark 28 | * 29 | * TODO: get rid of all config file parsers, use Spring 30 | * TODO: Inherit ConfigLoader methods and get rid of redundant code 31 | * 32 | * @param config DistConfig 33 | */ 34 | class DistConfigLoader(config: DistConfig, sparkContext: SparkContext) 35 | { 36 | private val logger = Logger.getLogger(classOf[DistConfigLoader].getName) 37 | private val CONFIG_PROPERTIES = "config.properties" 38 | 39 | /** 40 | * Loads the configuration and creates extraction jobs for all configured languages. 41 | * 42 | * @return Non-strict Traversable over all configured extraction jobs i.e. an extractions job will not be created until it is explicitly requested. 43 | */ 44 | def getExtractionJobs(): Traversable[DistExtractionJob] = 45 | { 46 | // Create a non-strict view of the extraction jobs 47 | // non-strict because we want to create the extraction job when it is needed, not earlier 48 | config.extractorClasses.view.map(e => createExtractionJob(e._1, e._2)) 49 | } 50 | 51 | /** 52 | * Creates an extraction job for a specific language. 53 | */ 54 | private def createExtractionJob(lang: Language, extractorClasses: Seq[Class[_ <: Extractor[_]]]): DistExtractionJob = 55 | { 56 | val dumpDir = config.dumpDir.get 57 | 58 | // Finder[Path] works with Hadoop's FileSystem class - operates on HDFS, or the local file system depending 59 | // upon whether we are running in local mode or distributed/cluster mode. 
60 | val finder = new Finder[Path](dumpDir, lang, config.wikiName) 61 | val date = latestDate(finder) 62 | 63 | // Add input sources 64 | val job = Job.getInstance(hadoopConfiguration) 65 | for (file <- files(config.source, finder, date)) 66 | FileInputFormat.addInputPath(job, file) 67 | hadoopConfiguration = job.getConfiguration // update Configuration 68 | 69 | // Add the extraction configuration file to distributed cache. 70 | // It will be needed in DBpediaCompositeOutputFormat for getting the Formatters. 71 | val configPropertiesDCPath = finder.wikiDir.resolve(CONFIG_PROPERTIES) // Path where to the copy config properties file 72 | val fs = configPropertiesDCPath.getFileSystem(hadoopConfiguration) 73 | fs.copyFromLocalFile(false, true, new Path(config.extractionConfigFile), configPropertiesDCPath) // Copy local file to Hadoop file system 74 | job.addCacheFile(configPropertiesDCPath.toUri) // Add to distributed cache 75 | 76 | // Setup config variables needed by DBpediaWikiPageInputFormat and DBpediaCompositeOutputFormat. 77 | hadoopConfiguration.set("dbpedia.config.properties", configPropertiesDCPath.toString) 78 | hadoopConfiguration.set("dbpedia.wiki.name", config.wikiName) 79 | hadoopConfiguration.set("dbpedia.wiki.language.wikicode", lang.wikiCode) 80 | hadoopConfiguration.set("dbpedia.wiki.date", date) 81 | hadoopConfiguration.setBoolean("dbpedia.output.overwrite", config.overwriteOutput) 82 | 83 | // Getting the WikiPages from local on-disk cache saves processing time. 84 | val cache = finder.file(date, "articles-rdd") 85 | lazy val articlesRDD: RDD[WikiPage] = try 86 | { 87 | if (!cache.exists) 88 | throw new IOException("Cache file " + cache.getSchemeWithFileName + " does not exist.") 89 | logger.info("Loading articles from cache file " + cache.getSchemeWithFileName) 90 | val loaded = DistIOUtils.loadRDD(sparkContext, classOf[WikiPage], cache) 91 | logger.info("WikiPages loaded from cache file " + cache.getSchemeWithFileName) 92 | loaded 93 | } 94 | catch 95 | { 96 | case ex: Exception => 97 | { 98 | logger.log(Level.INFO, "Will read from wiki dump file for " + lang.wikiCode + " wiki, could not load cache file '" + cache.getSchemeWithFileName + "': " + ex) 99 | 100 | // Create RDD with WikiPageWritable elements. 101 | val rawArticlesRDD: RDD[(LongWritable, WikiPageWritable)] = 102 | sparkContext.newAPIHadoopRDD(hadoopConfiguration, classOf[DBpediaWikiPageInputFormat], classOf[LongWritable], classOf[WikiPageWritable]) 103 | 104 | // Unwrap WikiPages and filter unnecessary pages 105 | val newRdd = rawArticlesRDD.map(_._2.get).filter 106 | { 107 | page => 108 | page.title.namespace == Namespace.Main || 109 | page.title.namespace == Namespace.File || 110 | page.title.namespace == Namespace.Category || 111 | page.title.namespace == Namespace.Template 112 | }.persist(config.sparkStorageLevel) 113 | 114 | if (config.cacheWikiPageRDD) 115 | { 116 | DistIOUtils.saveRDD(newRdd, cache) 117 | logger.info("Parsed WikiPages written to cache file " + cache.getSchemeWithFileName) 118 | } 119 | 120 | newRdd 121 | } 122 | } 123 | 124 | val _ontology = 125 | { 126 | val ontologySource = config.ontologyFile match 127 | { 128 | case Some(ontologyFile) if ontologyFile.isFile => 129 | // Is ontologyFile defined and it is indeed a file? 
130 | XMLSource.fromReader(reader(ontologyFile), Language.Mappings) 131 | case _ => 132 | val namespaces = Set(Namespace.OntologyClass, Namespace.OntologyProperty) 133 | val url = new URL(Language.Mappings.apiUri) 134 | val language = Language.Mappings 135 | WikiSource.fromNamespaces(namespaces, url, language) 136 | } 137 | 138 | new OntologyReader().read(ontologySource) 139 | } 140 | 141 | val _commonsSource = 142 | { 143 | try 144 | { 145 | val finder = new Finder[Path](config.dumpDir.get, Language("commons"), config.wikiName) 146 | val date = latestDate(finder) 147 | XMLSource.fromReaders(readers(config.source, finder, date), Language.Commons, _.namespace == Namespace.File) 148 | } 149 | catch 150 | { 151 | case ex: Exception => 152 | logger.info("Could not load commons source - error: " + ex.getMessage) 153 | null 154 | } 155 | } 156 | 157 | val _disambiguations = 158 | { 159 | val cache = finder.file(date, "disambiguations-ids.obj") 160 | try 161 | { 162 | DistDisambiguations.load(reader(finder.file(date, config.disambiguations)), cache, lang) 163 | } catch 164 | { 165 | case ex: Exception => 166 | logger.info("Could not load disambiguations - error: " + ex.getMessage) 167 | Disambiguations.empty() 168 | } 169 | } 170 | 171 | val redirectsCache = finder.file(date, "template-redirects.obj") 172 | lazy val _redirects = DistRedirects.load(articlesRDD, redirectsCache, lang) // lazy because it will be evaluated in DistExtractionJob.run() 173 | 174 | lazy val context = new DumpExtractionContext 175 | { 176 | def ontology = _ontology 177 | 178 | def commonsSource = _commonsSource 179 | 180 | def language = lang 181 | 182 | private lazy val _mappingPageSource = 183 | { 184 | val namespace = Namespace.mappings(language) 185 | 186 | config.mappingsDir match 187 | { 188 | case Some(mappingsDir) if mappingsDir.isDirectory => 189 | // Is mappingsDir defined and it is indeed a directory? 190 | val path = new Path(mappingsDir, namespace.name(Language.Mappings).replace(' ', '_') + ".xml") 191 | XMLSource.fromReader(reader(path), Language.Mappings) 192 | case _ => 193 | val namespaces = Set(namespace) 194 | val url = new URL(Language.Mappings.apiUri) 195 | WikiSource.fromNamespaces(namespaces, url, Language.Mappings) 196 | } 197 | } 198 | 199 | def mappingPageSource: Traversable[WikiPage] = _mappingPageSource 200 | 201 | private lazy val _mappings = 202 | { 203 | MappingsLoader.load(this) 204 | } 205 | 206 | def mappings: Mappings = _mappings 207 | 208 | def articlesSource: Source = null // Not needing raw article source 209 | 210 | def redirects: Redirects = _redirects 211 | 212 | def disambiguations: Disambiguations = if (_disambiguations != null) _disambiguations else new Disambiguations(Set[Long]()) 213 | } 214 | 215 | // Extractors - this is lazily evaluated in DistExtractionJob.run() so that the distributed redirect extraction happens inside run() 216 | // NOTE: All subsequent references to this val need to be lazy! 217 | lazy val extractor = 218 | { 219 | val _redirects = context.redirects // Trigger evaluation of lazy redirects and load the updated context into extractors. 220 | val updatedContext = new DumpExtractionContextWrapper(context) 221 | { 222 | override def redirects: Redirects = _redirects 223 | } 224 | CompositeParseExtractor.load(extractorClasses, updatedContext) 225 | } 226 | 227 | lazy val destination = 228 | { 229 | // Create empty directories for all datasets. 
This is not strictly necessary because Hadoop would create the directories 230 | // it needs to by itself, though in that case the directories for unused datasets will obviously be absent. 231 | val datasets = extractor.datasets 232 | val outputPath = finder.directory(date) 233 | 234 | for ((suffix, format) <- config.formats; dataset <- datasets) 235 | { 236 | new Path(outputPath, s"${finder.wikiName}-$date-${dataset.name.replace('_', '-')}.$suffix").mkdirs() 237 | } 238 | new DistMarkerDestination(new DistDeduplicatingWriterDestination(outputPath, hadoopConfiguration), finder.file(date, Extraction.Complete), false) 239 | } 240 | 241 | lazy val description = 242 | { 243 | val datasets = extractor.datasets 244 | lang.wikiCode + ": " + extractorClasses.size + " extractors (" + extractorClasses.map(_.getSimpleName).mkString(",") + "), " + datasets.size + " datasets (" + datasets.mkString(",") + ")" 245 | } 246 | 247 | new DistExtractionJob(new RootExtractor(extractor), articlesRDD, config.namespaces, destination, lang.wikiCode, description) 248 | } 249 | 250 | implicit var hadoopConfiguration: Configuration = config.hadoopConf 251 | 252 | private def writer[T <% FileLike[_]](file: T): () => Writer = 253 | { 254 | () => IOUtils.writer(file) 255 | } 256 | 257 | private def reader[T <% FileLike[_]](file: T): () => Reader = 258 | { 259 | () => IOUtils.reader(file) 260 | } 261 | 262 | private def readers[T <% FileLike[_]](source: String, finder: Finder[T], date: String): List[() => Reader] = 263 | { 264 | files(source, finder, date).map(reader(_)) 265 | } 266 | 267 | private def files[T <% FileLike[_]](source: String, finder: Finder[T], date: String): List[T] = 268 | { 269 | 270 | val files = if (source.startsWith("@")) 271 | { 272 | // the articles source is a regex - we want to match multiple files 273 | finder.matchFiles(date, source.substring(1)) 274 | } else List(finder.file(date, source)) 275 | 276 | logger.info(s"Source is ${source} - ${files.size} file(s) matched") 277 | 278 | files 279 | } 280 | 281 | private def latestDate(finder: Finder[_]): String = 282 | { 283 | val isSourceRegex = config.source.startsWith("@") 284 | val source = if (isSourceRegex) config.source.substring(1) else config.source 285 | val fileName = if (config.requireComplete) Download.Complete else source 286 | finder.dates(fileName, isSuffixRegex = isSourceRegex).last 287 | } 288 | } -------------------------------------------------------------------------------- /extraction/src/main/scala/org/dbpedia/extraction/dump/extract/DistExtraction.scala: -------------------------------------------------------------------------------- 1 | package org.dbpedia.extraction.dump.extract 2 | 3 | import org.dbpedia.extraction.util.{SparkUtils, ProxyAuthenticator, ConfigUtils} 4 | import java.net.Authenticator 5 | import scala.concurrent.{ExecutionContext, Await, Future, future} 6 | import scala.concurrent.duration.Duration 7 | import java.io.File 8 | import java.util.concurrent.Executors 9 | 10 | /** 11 | * Dump extraction script. 
12 | */ 13 | object DistExtraction 14 | { 15 | 16 | val Started = "extraction-started" 17 | 18 | val Complete = "extraction-complete" 19 | 20 | def main(args: Array[String]): Unit = 21 | { 22 | require(args != null && args.length >= 2 && args(0).nonEmpty && args(1).nonEmpty, "missing required arguments: ") 23 | Authenticator.setDefault(new ProxyAuthenticator()) 24 | 25 | // Load properties 26 | val extractionConfigProps = ConfigUtils.loadConfig(args(0), "UTF-8") 27 | val distConfigProps = ConfigUtils.loadConfig(args(1), "UTF-8") 28 | val distConfig = new DistConfig(distConfigProps, extractionConfigProps, new File(args(0)).toURI) 29 | 30 | // overwrite properties with CLI args 31 | // TODO arguments could be of the format a=b and then property a can be overwritten with "b" 32 | 33 | // Create SparkContext 34 | SparkUtils.setSparkLogLevels(distConfig) 35 | val sparkContext = SparkUtils.getSparkContext(distConfig) 36 | 37 | // Load extraction jobs from configuration 38 | val jobs = new DistConfigLoader(distConfig, sparkContext).getExtractionJobs() 39 | 40 | val executor = Executors.newFixedThreadPool(distConfig.extractionJobThreads) 41 | implicit val ec = ExecutionContext.fromExecutor(executor) 42 | val futures = for (job <- jobs) yield future 43 | { 44 | job.run() 45 | } 46 | Await.result(Future.sequence(futures), Duration.Inf) 47 | 48 | sparkContext.stop() 49 | executor.shutdown() 50 | } 51 | } 52 | -------------------------------------------------------------------------------- /extraction/src/main/scala/org/dbpedia/extraction/dump/extract/DistExtractionJob.scala: -------------------------------------------------------------------------------- 1 | package org.dbpedia.extraction.dump.extract 2 | 3 | import java.util.logging.{Level, Logger} 4 | import org.dbpedia.extraction.destinations.{Quad, DistDestination} 5 | import org.dbpedia.extraction.mappings.RootExtractor 6 | import org.dbpedia.extraction.sources.WikiPage 7 | import org.dbpedia.extraction.spark.serialize.KryoSerializationWrapper 8 | import org.dbpedia.extraction.wikiparser.Namespace 9 | import org.apache.spark.rdd.RDD 10 | import org.dbpedia.extraction.util.StringUtils 11 | import org.apache.spark.SparkContext._ 12 | import org.dbpedia.util.Exceptions 13 | 14 | /** 15 | * Executes an extraction using Spark. 16 | * 17 | * @param extractor The Extractor 18 | * @param rdd The RDD of WikiPages 19 | * @param namespaces Only extract pages in these namespaces 20 | * @param destination The extraction destination. Will be closed after the extraction has been finished. 21 | * @param label user readable label of this extraction job. 22 | */ 23 | class DistExtractionJob(extractor: => RootExtractor, rdd: => RDD[WikiPage], namespaces: Set[Namespace], destination: => DistDestination, label: String, description: => String) 24 | { 25 | private val logger = Logger.getLogger(getClass.getName) 26 | 27 | def run(): Unit = 28 | { 29 | val sc = rdd.sparkContext 30 | val allPages = sc.accumulator(0) 31 | val failedPages = sc.accumulator(0) 32 | 33 | val loggerBC = sc.broadcast(logger) 34 | val extractorBC = sc.broadcast(KryoSerializationWrapper(extractor)) 35 | val namespacesBC = sc.broadcast(namespaces) 36 | 37 | val startTime = System.currentTimeMillis 38 | 39 | val results: RDD[Seq[Quad]] = 40 | rdd.map 41 | { 42 | page => 43 | // Take a WikiPage, perform the extraction with a set of extractors and return the results as a Seq[Quad]. 
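// Extraction failures are caught per page below, so one bad page does not
// fail the whole Spark task; the broadcast extractor is applied only to pages
// in the configured namespaces, and the two accumulators count successful and
// failed pages across the cluster.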
44 | val (success, graph) = try 45 | { 46 | (true, if (namespacesBC.value.contains(page.title.namespace)) Some(extractorBC.value.value.apply(page)) else None) 47 | } 48 | catch 49 | { 50 | case ex: Exception => 51 | loggerBC.value.log(Level.WARNING, "error processing page '" + page.title + "': " + Exceptions.toString(ex, 200)) 52 | (false, None) 53 | } 54 | 55 | if (success) allPages += 1 else failedPages += 1 56 | 57 | graph.getOrElse(Nil) 58 | } 59 | 60 | logger.info(description+" started") 61 | 62 | destination.open() 63 | 64 | logger.info("Writing outputs to destination...") 65 | 66 | destination.write(results) 67 | 68 | destination.close() 69 | 70 | val time = System.currentTimeMillis - startTime 71 | println("%s: extracted %d pages in %s (per page: %f ms; failed pages: %d).".format(label, 72 | allPages.value, 73 | StringUtils.prettyMillis(time), 74 | time.toDouble / allPages.value, 75 | failedPages.value)) 76 | 77 | logger.info(description+" finished") 78 | } 79 | } 80 | -------------------------------------------------------------------------------- /extraction/src/main/scala/org/dbpedia/extraction/dump/extract/DumpExtractionContextWrapper.scala: -------------------------------------------------------------------------------- 1 | package org.dbpedia.extraction.dump.extract 2 | 3 | import org.dbpedia.extraction.ontology.Ontology 4 | import org.dbpedia.extraction.sources.{WikiPage, Source} 5 | import org.dbpedia.extraction.util.Language 6 | import org.dbpedia.extraction.mappings.{Disambiguations, Redirects, Mappings} 7 | 8 | /** 9 | * A simple wrapper for a DumpExtractionContext object 10 | * 11 | * @param context 12 | */ 13 | class DumpExtractionContextWrapper(context: DumpExtractionContext) extends DumpExtractionContext 14 | { 15 | override def ontology: Ontology = context.ontology 16 | 17 | override def commonsSource: Source = context.commonsSource 18 | 19 | override def language: Language = context.language 20 | 21 | override def mappingPageSource: Traversable[WikiPage] = context.mappingPageSource 22 | 23 | override def mappings: Mappings = context.mappings 24 | 25 | override def articlesSource: Source = context.articlesSource 26 | 27 | override def redirects: Redirects = context.redirects 28 | 29 | override def disambiguations: Disambiguations = context.disambiguations 30 | } 31 | -------------------------------------------------------------------------------- /extraction/src/main/scala/org/dbpedia/extraction/mappings/DistDisambiguations.scala: -------------------------------------------------------------------------------- 1 | package org.dbpedia.extraction.mappings 2 | 3 | import java.util.logging.{Level, Logger} 4 | import java.io._ 5 | import org.apache.hadoop.fs.Path 6 | import org.dbpedia.extraction.util.RichHadoopPath.wrapPath 7 | import org.dbpedia.extraction.util.{DistIOUtils, Language} 8 | import org.apache.hadoop.conf.Configuration 9 | import com.esotericsoftware.kryo.io.{Input, Output} 10 | 11 | /** 12 | * A version of Disambiguations that works with org.apache.hadoop.fs.Path. 13 | * 14 | * @see Disambiguations 15 | */ 16 | class DistDisambiguations(override val set : Set[Long]) extends Disambiguations(set) 17 | 18 | object DistDisambiguations 19 | { 20 | private val logger = Logger.getLogger(classOf[DistDisambiguations].getName) 21 | 22 | /** 23 | * Loads disambiguations from cache/source reader. 
24 | * 25 | * @param reader Reader to load disambiguations from 26 | * @param cache Path to cache file 27 | * @param lang Language 28 | * @param hadoopConf Configuration 29 | * @return Disambiguations object 30 | */ 31 | def load(reader : () => Reader, cache : Path, lang : Language)(implicit hadoopConf: Configuration) : Disambiguations = 32 | { 33 | try 34 | { 35 | return loadFromCache(cache) 36 | } 37 | catch 38 | { 39 | case ex : Exception => logger.log(Level.INFO, "Will extract disambiguations from source for "+lang.wikiCode+" wiki, could not load cache file '"+cache.getSchemeWithFileName+"': "+ex) 40 | } 41 | 42 | val disambiguations = Disambiguations.loadFromFile(reader, lang) 43 | 44 | val dir = cache.getParent 45 | if (!dir.exists && !dir.mkdirs()) throw new IOException("cache dir [" + dir.getSchemeWithFileName + "] does not exist and cannot be created") 46 | val output = new Output(new BufferedOutputStream(cache.outputStream())) 47 | 48 | try 49 | { 50 | DistIOUtils.getKryoInstance.writeClassAndObject(output, disambiguations.set) 51 | logger.info(disambiguations.set.size + " disambiguations written to cache file " + cache.getSchemeWithFileName) 52 | disambiguations 53 | } 54 | finally 55 | { 56 | output.close() 57 | } 58 | } 59 | 60 | /** 61 | * Loads the disambiguations from a cache file. 62 | */ 63 | private def loadFromCache(cache : Path)(implicit hadoopConf: Configuration) : Disambiguations = 64 | { 65 | logger.info("Loading disambiguations from cache file " + cache.getSchemeWithFileName) 66 | val input = new Input(new BufferedInputStream(cache.inputStream())) 67 | try 68 | { 69 | val disambiguations = new Disambiguations(DistIOUtils.getKryoInstance.readClassAndObject(input).asInstanceOf[Set[Long]]) 70 | logger.info(disambiguations.set.size + " disambiguations loaded from cache file " + cache.getSchemeWithFileName) 71 | disambiguations 72 | } 73 | finally 74 | { 75 | input.close() 76 | } 77 | } 78 | } 79 | -------------------------------------------------------------------------------- /extraction/src/main/scala/org/dbpedia/extraction/mappings/DistRedirects.scala: -------------------------------------------------------------------------------- 1 | package org.dbpedia.extraction.mappings 2 | 3 | import java.util.logging.{Level, Logger} 4 | import org.dbpedia.extraction.sources.WikiPage 5 | import java.io._ 6 | import org.dbpedia.extraction.wikiparser._ 7 | import org.dbpedia.extraction.util.{DistIOUtils, Language} 8 | import org.dbpedia.extraction.wikiparser.impl.wikipedia.Redirect 9 | import org.apache.spark.rdd.RDD 10 | import com.esotericsoftware.kryo.io.{Input, Output} 11 | import org.dbpedia.extraction.util.RichHadoopPath.wrapPath 12 | import org.apache.hadoop.fs.Path 13 | import org.apache.hadoop.conf.Configuration 14 | import org.apache.spark.SparkContext._ 15 | 16 | /** 17 | * Distributed version of Redirects; uses Spark to compute redirects. 18 | * 19 | * Holds the redirects between wiki pages 20 | * At the moment, only redirects between Templates are considered 21 | * 22 | * @param map Redirect map. Contains decoded template titles. 23 | * 24 | * @see Redirects 25 | */ 26 | class DistRedirects(override val map: Map[String, String]) extends Redirects(map) 27 | 28 | /** 29 | * Loads redirects from a cache file or source of Wiki pages. 
30 | * At the moment, only redirects between Templates are considered 31 | */ 32 | object DistRedirects 33 | { 34 | private val logger = Logger.getLogger(classOf[DistRedirects].getName) 35 | 36 | /** 37 | * Tries to load the redirects from a cache file. 38 | * If not successful, loads the redirects from an RDD. 39 | * Updates the cache after loading the redirects from the source. 40 | * 41 | * @param rdd RDD of WikiPages 42 | * @param cache Path to cache file 43 | * @param lang Language 44 | * @param hadoopConf Configuration 45 | * @return Redirects object 46 | */ 47 | def load(rdd: RDD[WikiPage], cache: Path, lang: Language)(implicit hadoopConf: Configuration): Redirects = 48 | { 49 | //Try to load redirects from the cache 50 | try 51 | { 52 | return loadFromCache(cache) 53 | } 54 | catch 55 | { 56 | case ex: Exception => logger.log(Level.INFO, "Will extract redirects from source for " + lang.wikiCode + " wiki, could not load cache file '" + cache.getSchemeWithFileName + "': " + ex) 57 | } 58 | 59 | //Load redirects from RDD 60 | val redirects = loadFromRDD(rdd, lang) 61 | 62 | val dir = cache.getParent 63 | if (!dir.exists && !dir.mkdirs()) throw new IOException("cache dir [" + dir.getSchemeWithFileName + "] does not exist and cannot be created") 64 | val output = new Output(new BufferedOutputStream(cache.outputStream())) 65 | try 66 | { 67 | DistIOUtils.getKryoInstance.writeClassAndObject(output, redirects.map) 68 | logger.info(redirects.map.size + " redirects written to cache file " + cache.getSchemeWithFileName) 69 | redirects 70 | } 71 | finally 72 | { 73 | output.close() 74 | } 75 | } 76 | 77 | /** 78 | * Loads the redirects from a cache file. 79 | */ 80 | private def loadFromCache(cache: Path)(implicit hadoopConf: Configuration): Redirects = 81 | { 82 | logger.info("Loading redirects from cache file " + cache.getSchemeWithFileName) 83 | val input = new Input(new BufferedInputStream(cache.inputStream())) 84 | try 85 | { 86 | val redirects = new Redirects(DistIOUtils.getKryoInstance.readClassAndObject(input).asInstanceOf[Map[String, String]]) 87 | logger.info(redirects.map.size + " redirects loaded from cache file " + cache.getSchemeWithFileName) 88 | redirects 89 | } 90 | finally 91 | { 92 | input.close() 93 | } 94 | } 95 | 96 | /** 97 | * Loads the redirects from a source. 98 | * 99 | * @param rdd RDD of WikiPages 100 | * @param lang Language 101 | * @return Redirects object 102 | */ 103 | def loadFromRDD(rdd: RDD[WikiPage], lang: Language): Redirects = 104 | { 105 | logger.info("Loading redirects from source (" + lang.wikiCode + ")") 106 | 107 | val regexBC = rdd.sparkContext.broadcast(buildRegex(lang)) 108 | 109 | // Wrap the map function inside a KryoSerializationWrapper 110 | // val mapper = SparkUtils.kryoWrapFunction(new RedirectFinder(langBC)) 111 | // val redirects = new Redirects(rdd.flatMap(mapper).collectAsMap().toMap) 112 | 113 | val redirectsRDD = rdd.flatMap 114 | { 115 | case page: WikiPage => 116 | val regex = regexBC.value 117 | 118 | val destinationTitle = page.source match 119 | { 120 | case regex(destination) => 121 | try 122 | { 123 | WikiTitle.parse(destination, page.title.language) 124 | } 125 | catch 126 | { 127 | case ex: WikiParserException => 128 | Logger.getLogger(Redirects.getClass.getName).log(Level.WARNING, "Couldn't parse redirect destination", ex) 129 | null 130 | } 131 | case _ => null 132 | } 133 | 134 | if (destinationTitle != page.redirect) 135 | { 136 | Logger.getLogger(Redirects.getClass.getName).log(Level.WARNING, "wrong redirect. 
page: [" + page.title + "].\nfound by dbpedia: [" + destinationTitle + "].\nfound by wikipedia: [" + page.redirect + "]") 137 | } 138 | 139 | if (destinationTitle != null && page.title.namespace == Namespace.Template && destinationTitle.namespace == Namespace.Template) 140 | { 141 | List((page.title.decoded, destinationTitle.decoded)) 142 | } 143 | else 144 | { 145 | Nil 146 | } 147 | } 148 | 149 | val redirects = new Redirects(redirectsRDD.collectAsMap().toMap) 150 | 151 | logger.info("Redirects loaded from source (" + lang.wikiCode + ")") 152 | redirects 153 | } 154 | 155 | private def buildRegex(lang: Language) = 156 | { 157 | val redirects = Redirect(lang).mkString("|") 158 | // (?ius) enables CASE_INSENSITIVE UNICODE_CASE DOTALL 159 | // case insensitive and unicode are important - that's what mediawiki does. 160 | // Note: Although we do not specify a Locale, UNICODE_CASE does mostly the right thing. 161 | // DOTALL means that '.' also matches line terminators. 162 | // Reminder: (?:...) are non-capturing groups, '*?' is a reluctant qualifier. 163 | // (?:#[^\n]*?)? is an optional (the last '?') non-capturing group meaning: there may 164 | // be a '#' after which everything but line breaks is allowed ('[]{}|<>' are not allowed 165 | // before the '#'). The match is reluctant ('*?'), which means that we recognize ']]' 166 | // as early as possible. 167 | // (?:\|[^\n]*?)? is another optional non-capturing group that reluctantly consumes 168 | // a '|' character and everything but line breaks after it. 169 | ("""(?ius)\s*(?:""" + redirects + """)\s*:?\s*\[\[([^\[\]{}|<>\n]+(?:#[^\n]*?)?)(?:\|[^\n]*?)?\]\].*""").r 170 | } 171 | } 172 | 173 | -------------------------------------------------------------------------------- /extraction/src/main/scala/org/dbpedia/extraction/spark/io/QuadSeqWritable.scala: -------------------------------------------------------------------------------- 1 | package org.dbpedia.extraction.spark.io 2 | 3 | import org.dbpedia.extraction.destinations.Quad 4 | import org.apache.hadoop.io.Writable 5 | import org.dbpedia.extraction.util.DistIOUtils 6 | import java.io.{DataOutput, ByteArrayOutputStream, DataInput} 7 | import com.esotericsoftware.kryo.io.{Input, Output} 8 | 9 | /** 10 | * Writable wrapping Seq[Quad] - used by custom OutputFormat 11 | */ 12 | class QuadSeqWritable(quads: Seq[Quad]) extends Writable 13 | { 14 | var _quads = quads 15 | 16 | def this() = this(null) 17 | 18 | def set(quads: Seq[Quad]) 19 | { 20 | _quads = quads 21 | } 22 | 23 | def get = _quads 24 | 25 | override def write(output: DataOutput) 26 | { 27 | val out = new ByteArrayOutputStream() 28 | val o = new Output(out) 29 | DistIOUtils.getKryoInstance.writeClassAndObject(o, get) 30 | o.close() 31 | val bytes = out.toByteArray 32 | output.writeInt(bytes.size) 33 | output.write(bytes) 34 | } 35 | 36 | override def readFields(input: DataInput) 37 | { 38 | val size = input.readInt() 39 | val bytes = new Array[Byte](size) 40 | input.readFully(bytes) 41 | val i = new Input() 42 | i.setBuffer(bytes) 43 | set(DistIOUtils.getKryoInstance.readClassAndObject(i).asInstanceOf[Seq[Quad]]) 44 | i.close() 45 | } 46 | } 47 | -------------------------------------------------------------------------------- /extraction/src/main/scala/org/dbpedia/extraction/spark/io/WikiPageWritable.scala: -------------------------------------------------------------------------------- 1 | package org.dbpedia.extraction.spark.io 2 | 3 | import org.apache.hadoop.io.Writable 4 | import java.io.{ByteArrayOutputStream, 
DataOutput, DataInput} 5 | import org.dbpedia.extraction.sources.WikiPage 6 | import com.esotericsoftware.kryo.io.{Input, Output} 7 | import org.dbpedia.extraction.spark.serialize.WikiPageSerializer 8 | import org.dbpedia.extraction.util.DistIOUtils 9 | 10 | /** 11 | * DBpediaWikiPageInputFormat emits values of type WikiPageWritable. This class holds a single WikiPage instance. 12 | * @see DBpediaWikiPageInputFormat 13 | */ 14 | class WikiPageWritable(wikiPage: WikiPage) extends Writable 15 | { 16 | var _wikiPage = wikiPage 17 | 18 | def this() = this(null) 19 | 20 | def set(wikiPage: WikiPage) 21 | { 22 | _wikiPage = wikiPage 23 | } 24 | 25 | def get = _wikiPage 26 | 27 | val wps = new WikiPageSerializer 28 | 29 | override def write(output: DataOutput) 30 | { 31 | val out = new ByteArrayOutputStream() 32 | val o = new Output(out) 33 | wps.write(DistIOUtils.getKryoInstance, o, get) 34 | o.close() 35 | val bytes = out.toByteArray 36 | output.writeInt(bytes.size) 37 | output.write(bytes) 38 | } 39 | 40 | override def readFields(input: DataInput) 41 | { 42 | val size = input.readInt() 43 | val bytes = new Array[Byte](size) 44 | input.readFully(bytes) 45 | val i = new Input() 46 | i.setBuffer(bytes) 47 | set(wps.read(DistIOUtils.getKryoInstance, i, classOf[WikiPage])) 48 | i.close() 49 | } 50 | } -------------------------------------------------------------------------------- /extraction/src/main/scala/org/dbpedia/extraction/spark/io/input/ByteMatcher.scala: -------------------------------------------------------------------------------- 1 | package org.dbpedia.extraction.spark.io.input 2 | 3 | import java.io.InputStream 4 | import org.apache.hadoop.fs.Seekable 5 | import org.apache.hadoop.io.DataOutputBuffer 6 | import scala.annotation.tailrec 7 | 8 | /** 9 | * A class that operates mainly on SeekableInputStreams, iteratively reading chunks of data from an InputStream 10 | * depending upon a match pattern, through the method readUntilMatch(). 11 | * 12 | * @param in InputStream to read binary data from 13 | * @param seeker Seekable for the InputStream "in" - used for keeping track of position in the InputStream 14 | */ 15 | class ByteMatcher(in: InputStream, seeker: Seekable) 16 | { 17 | private var bytesRead: Long = 0 18 | private var lastMatchedPos: Long = -1 19 | private var currentPos: Long = -1 20 | 21 | def this(is: SeekableInputStream) = this(is, is) 22 | 23 | /** 24 | * @return number of bytes read 25 | */ 26 | def getReadBytes: Long = bytesRead 27 | 28 | /** 29 | * @return current position in seeker 30 | */ 31 | def getPos: Long = seeker.getPos 32 | 33 | /** 34 | * @return last position when a match was found 35 | */ 36 | def getLastMatchedPos: Long = lastMatchedPos 37 | 38 | /** 39 | * @param len number of bytes to skip 40 | */ 41 | def skip(len: Long) 42 | { 43 | in.skip(len) 44 | bytesRead += len 45 | } 46 | 47 | /** 48 | * Reads the InputStream until a match is found or "end" number of bytes is reached. 49 | * 50 | * @param textPattern String to match against 51 | * @param end number of bytes to read till - checked against seeker 52 | * @return Boolean true if a match was found, false if EOF was found or stopping point "end" was crossed. 53 | */ 54 | def readUntilMatch(textPattern: String, end: Long): Boolean = 55 | { 56 | readUntilMatch(textPattern.getBytes("UTF-8"), 0, end) 57 | } 58 | 59 | /** 60 | * Reads the InputStream while writing to a buffer, until a match is found or "end" number of bytes is reached. 
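// ----------------------------------------------------------------------------
// [Editor's note: illustrative sketch, not part of the original sources.]
// WikiPageWritable and QuadSeqWritable above follow the standard Hadoop
// Writable contract: write(DataOutput) emits a length-prefixed Kryo payload and
// readFields(DataInput) restores it. A generic in-memory round trip (the helper
// name is hypothetical) looks like this:

import java.io.{ByteArrayInputStream, ByteArrayOutputStream, DataInputStream, DataOutputStream}
import org.apache.hadoop.io.Writable

def roundTrip[W <: Writable](original: W, empty: W): W =
{
  val buffer = new ByteArrayOutputStream()
  original.write(new DataOutputStream(buffer))
  empty.readFields(new DataInputStream(new ByteArrayInputStream(buffer.toByteArray)))
  empty // now carries the same payload as `original`
}
// ----------------------------------------------------------------------------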
61 | * 62 | * @param textPattern String to match against 63 | * @param end number of bytes to read till - checked against seeker 64 | * @param outputBuffer DataOutputBuffer where the data being read is written to 65 | * @return Boolean true if a match was found, false if EOF was found or stopping point "end" was crossed. 66 | */ 67 | def readUntilMatch(textPattern: String, end: Long, outputBuffer: Option[DataOutputBuffer]): Boolean = 68 | { 69 | readUntilMatch(textPattern.getBytes("UTF-8"), 0, end, outputBuffer) 70 | } 71 | 72 | /** 73 | * Reads the InputStream until a match is found or "end" number of bytes is reached. 74 | * 75 | * @param bytePattern Byte array to match against 76 | * @param end number of bytes to read till - checked against seeker 77 | * @return Boolean true if a match was found, false if EOF was found or stopping point "end" was crossed. 78 | */ 79 | def readUntilMatch(bytePattern: Array[Byte], end: Long): Boolean = 80 | { 81 | readUntilMatch(bytePattern, 0, end) 82 | } 83 | 84 | /** 85 | * Reads the InputStream while writing to a buffer, until a match is found or "end" number of bytes is reached. 86 | * 87 | * @param bytePattern Byte array to match against 88 | * @param end number of bytes to read till - checked against seeker 89 | * @param outputBuffer DataOutputBuffer where the data being read is written to 90 | * @return Boolean true if a match was found, false if EOF was found or stopping point "end" was crossed. 91 | */ 92 | def readUntilMatch(bytePattern: Array[Byte], end: Long, outputBuffer: Option[DataOutputBuffer]): Boolean = 93 | { 94 | readUntilMatch(bytePattern, 0, end, outputBuffer) 95 | } 96 | 97 | @tailrec private def readUntilMatch(matchBytes: Array[Byte], matchIter: Int, end: Long, outputBuffer: Option[DataOutputBuffer] = None): Boolean = 98 | { 99 | var i = matchIter 100 | val b: Int = this.in.read 101 | // EOF at the beginning 102 | if (b == -1) return false 103 | 104 | this.bytesRead += 1 105 | 106 | // Save to the buffer, if any provided 107 | outputBuffer.foreach(_.write(b)) 108 | 109 | // Check if we're matching 110 | if (b == matchBytes(i)) 111 | { 112 | i += 1 113 | // Whole of matchBytes matched successfully? 114 | if (i >= matchBytes.length) return true 115 | } 116 | else 117 | { 118 | // If not matched, start afresh and increment position. 
119 | i = 0 120 | if (this.currentPos != this.getPos) 121 | { 122 | this.lastMatchedPos = this.currentPos 123 | this.currentPos = this.getPos 124 | } 125 | } 126 | 127 | // See if we've passed the stop point 128 | if (i == 0 && this.seeker.getPos >= end) return false 129 | 130 | // Keep reading 131 | readUntilMatch(matchBytes, i, end, outputBuffer) 132 | } 133 | } -------------------------------------------------------------------------------- /extraction/src/main/scala/org/dbpedia/extraction/spark/io/input/DBpediaWikiPageInputFormat.scala: -------------------------------------------------------------------------------- 1 | package org.dbpedia.extraction.spark.io.input 2 | 3 | import org.apache.hadoop.io.{DataOutputBuffer, LongWritable} 4 | import org.apache.hadoop.io.compress.{CompressionCodecFactory, SplittableCompressionCodec} 5 | import org.apache.hadoop.fs.Path 6 | import scala.xml.XML 7 | import org.dbpedia.extraction.sources.XMLSource 8 | import org.apache.hadoop.mapreduce.lib.input.{FileSplit, FileInputFormat} 9 | import org.apache.hadoop.mapreduce.{JobContext, RecordReader, InputSplit, TaskAttemptContext} 10 | import org.apache.commons.logging.LogFactory 11 | import org.dbpedia.extraction.util.Language 12 | import org.dbpedia.extraction.spark.io.WikiPageWritable 13 | 14 | /** 15 | * Hadoop InputFormat that splits a Wikipedia dump file into WikiPageWritable (representing a single 16 | * org.dbpedia.extraction.sources.WikiPage) chunks. 17 | * 18 | * The WikiPageRecordReader class inside outputs a WikiPageWritable as value and the starting position (byte) as key. 19 | * 20 | * Note that dbpedia.wiki.language.wikicode needs to be set in Hadoop's Configuration. 21 | */ 22 | class DBpediaWikiPageInputFormat extends FileInputFormat[LongWritable, WikiPageWritable] 23 | { 24 | private val LOG = LogFactory.getLog(classOf[DBpediaWikiPageInputFormat]) 25 | private val LANGUAGE = "dbpedia.wiki.language.wikicode" 26 | 27 | protected override def isSplitable(context: JobContext, file: Path): Boolean = 28 | { 29 | val codec = new CompressionCodecFactory(context.getConfiguration).getCodec(file) 30 | if (null == codec) true else codec.isInstanceOf[SplittableCompressionCodec] 31 | } 32 | 33 | override def createRecordReader(genericSplit: InputSplit, context: TaskAttemptContext): RecordReader[LongWritable, WikiPageWritable] = 34 | { 35 | val split = genericSplit.asInstanceOf[FileSplit] 36 | LOG.info("getRecordReader start.....split=" + split) 37 | context.setStatus(split.toString) 38 | new WikiPageRecordReader(split, context) 39 | } 40 | 41 | private class WikiPageRecordReader(split: FileSplit, context: TaskAttemptContext) extends RecordReader[LongWritable, WikiPageWritable] 42 | { 43 | private var key: LongWritable = null 44 | private var value: WikiPageWritable = null 45 | 46 | private val conf = context.getConfiguration 47 | 48 | // Language code for this data dump 49 | private val language = Language(conf.get(LANGUAGE)) 50 | private val page = new DataOutputBuffer() 51 | private val inputStream = SeekableInputStream(split, 52 | split.getPath.getFileSystem(conf), 53 | new CompressionCodecFactory(conf)) 54 | private val matcher = new ByteMatcher(inputStream) 55 | 56 | private val (start, end) = 57 | { 58 | inputStream match 59 | { 60 | case SeekableSplitCompressedInputStream(sin) => 61 | (sin.getAdjustedStart, sin.getAdjustedEnd + 1) 62 | case _ => 63 | (split.getStart, split.getStart + split.getLength) 64 | } 65 | } 66 | 67 | private val pageBeginPattern = "<page>".getBytes("UTF-8") 68 | private val
pageEndPattern = "</page>".getBytes("UTF-8") 69 | 70 | override def close() = inputStream.close() 71 | 72 | override def getProgress: Float = 73 | { 74 | if (end == start) 1.0f else (getPos - start).asInstanceOf[Float] / (end - start).asInstanceOf[Float] 75 | } 76 | 77 | def getPos: Long = matcher.getPos 78 | 79 | override def initialize(genericInputSplit: InputSplit, context: TaskAttemptContext) = () 80 | 81 | override def nextKeyValue(): Boolean = 82 | { 83 | // Initialize key and value 84 | if (key == null) key = new LongWritable() 85 | if (value == null) value = new WikiPageWritable() 86 | 87 | if (matcher.getPos < end && matcher.readUntilMatch(pageBeginPattern, end)) 88 | { 89 | try 90 | { 91 | page.write(pageBeginPattern) 92 | if (matcher.readUntilMatch(pageEndPattern, end, Some(page))) 93 | { 94 | // Key is set to the position (bytes) where the page is found 95 | key.set(matcher.getPos) 96 | 97 | // Set value to the WikiPage created from the parsed <page>...</page> 98 | val elem = XML.loadString("<mediawiki>" + new String(page.getData.take(page.getLength), "UTF-8") + "</mediawiki>") 99 | value.set(XMLSource.fromXML(elem, language).head) 100 | 101 | return true 102 | } 103 | } 104 | finally 105 | { 106 | page.reset() 107 | } 108 | } 109 | false 110 | } 111 | 112 | override def getCurrentKey: LongWritable = key 113 | 114 | override def getCurrentValue: WikiPageWritable = value 115 | } 116 | 117 | } 118 | -------------------------------------------------------------------------------- /extraction/src/main/scala/org/dbpedia/extraction/spark/io/input/SeekableInputStream.scala: -------------------------------------------------------------------------------- 1 | package org.dbpedia.extraction.spark.io.input 2 | 3 | import org.apache.hadoop.io.compress._ 4 | import org.apache.hadoop.fs.{FileSystem, Seekable, FSDataInputStream} 5 | import java.io.{InputStream, FilterInputStream} 6 | import org.apache.hadoop.mapreduce.lib.input.FileSplit 7 | 8 | object SeekableInputStream 9 | { 10 | /** 11 | * Examines a FileSplit and returns the appropriate SeekableInputStream generated from it. 12 | * 13 | * @param split FileSplit to generate the SeekableInputStream from 14 | * @param fs FileSystem 15 | * @param compressionCodecs CompressionCodecFactory 16 | * @return SeekableInputStream to read from split 17 | */ 18 | def apply(split: FileSplit, fs: FileSystem, compressionCodecs: CompressionCodecFactory): SeekableInputStream = 19 | { 20 | val path = split.getPath 21 | val start = split.getStart 22 | val end = start + split.getLength 23 | 24 | val codec = compressionCodecs.getCodec(path) 25 | val dataInputStream = fs.open(path) 26 | 27 | codec match 28 | { 29 | case splitableCodec: SplittableCompressionCodec => 30 | // Is it a splittable compression input stream? 31 | val compressionInputStream = splitableCodec.createInputStream(dataInputStream, 32 | CodecPool.getDecompressor(codec), 33 | start, 34 | end, 35 | SplittableCompressionCodec.READ_MODE.BYBLOCK) 36 | SeekableSplitCompressedInputStream(compressionInputStream) 37 | case null => 38 | // Input stream not compressed? 39 | dataInputStream.seek(start) 40 | SeekableUncompressedInputStream(dataInputStream) 41 | case _ => 42 | // Non-splittable compression input stream?
No seeking or offsetting is needed 43 | assert(start == 0) 44 | val compressionInputStream = codec.createInputStream(dataInputStream, CodecPool.getDecompressor(codec)) 45 | SeekableCompressedInputStream(compressionInputStream, dataInputStream) 46 | } 47 | } 48 | } 49 | 50 | /** 51 | * A SeekableInputStream internally using a SplitCompressionInputStream, ie. compressed by a splittable compression method. 52 | */ 53 | case class SeekableSplitCompressedInputStream(sin: SplitCompressionInputStream) extends SeekableInputStream(sin, sin) 54 | 55 | /** 56 | * A compressed SeekableInputStream using a non-splittable compression input stream 57 | */ 58 | case class SeekableCompressedInputStream(cin: CompressionInputStream, fsin: FSDataInputStream) extends SeekableInputStream(cin, fsin) 59 | 60 | /** 61 | * SeekableInputStream without compression. 62 | */ 63 | case class SeekableUncompressedInputStream(fsin: FSDataInputStream) extends SeekableInputStream(fsin, fsin) 64 | 65 | /** 66 | * Wraps an InputStream and a corresponding Seekable to track its position. 67 | * 68 | * @param in InputStream to read binary data from 69 | * @param seeker Seekable for the InputStream "in" - used for keeping track of position in the InputStream 70 | */ 71 | sealed class SeekableInputStream(in: InputStream, seeker: Seekable) extends FilterInputStream(in) with Seekable 72 | { 73 | override def getPos: Long = seeker.getPos 74 | 75 | override def seek(pos: Long) = seeker.seek(pos) 76 | 77 | override def seekToNewSource(targetPos: Long): Boolean = seeker.seekToNewSource(targetPos) 78 | 79 | override def toString: String = in.toString 80 | } -------------------------------------------------------------------------------- /extraction/src/main/scala/org/dbpedia/extraction/spark/io/output/DBpediaCompositeOutputFormat.scala: -------------------------------------------------------------------------------- 1 | package org.dbpedia.extraction.spark.io.output 2 | 3 | import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat 4 | import org.dbpedia.extraction.spark.io.QuadSeqWritable 5 | import org.apache.hadoop.io.Text 6 | import org.apache.hadoop.mapreduce.{JobContext, RecordWriter, TaskAttemptContext} 7 | import scala.collection.mutable 8 | import org.dbpedia.extraction.destinations.formatters.UriPolicy 9 | import org.dbpedia.extraction.util.ConfigUtils 10 | import org.apache.commons.io.FilenameUtils 11 | import java.io.File 12 | import org.apache.hadoop.fs.{Path, FileSystem} 13 | 14 | /** 15 | * OutputFormat implementation that uses the configured Formatters to write Quads to respective datasets 16 | * through the DBpediaDatasetOutputFormat class. This class uses as many DBpediaDatasetOutputFormat objects 17 | * as there are configured formats. Formats are read in from the provided extraction config properties file. 18 | * This class handles configuration and Formatters, while DBpediaDatasetOutputFormat handles dividing the Quads 19 | * into datasets. 20 | * 21 | * 1. To use this OutputFormat three Strings need to be set in Hadoop's Configuration: 22 | * dbpedia.wiki.name - Config.wikiName, the wiki suffix (eg. 
wiki) 23 | * dbpedia.wiki.language.wikicode - Language wiki code of the input wiki dump 24 | * dbpedia.wiki.date - Wiki dump date in YYYYMMDD format 25 | * dbpedia.output.overwrite - Boolean, if set to true, output files will be overwritten if they already exist, 26 | * or else an IOException will be thrown (which is also the default behaviour) - this is actually for MultipleTextOutputFormat 27 | * dbpedia.config.properties - HDFS Path at which the extraction config properties file is stored 28 | * 29 | * 2. The extraction config properties file needs to be added to the distributed cache - the HDFS location should be 30 | * configured using dbpedia.config.properties. 31 | * 32 | * 3. Also, the output needs to be grouped by dataset such that each key is a Text representing the dataset 33 | * to which the Quads in the value belong to. Example key: article_categories 34 | * 35 | * NOTE: When using this with Spark set only one core per worker. 36 | * 37 | * Output will look like Hadoop leaf files (eg. part-r-00000) inside directories like enwiki-20140614-article-categories.tql. 38 | * The files will be compressed using the specified compression codec. 39 | * 40 | * @see DBpediaDatasetOutputFormat 41 | */ 42 | class DBpediaCompositeOutputFormat extends TextOutputFormat[Text, QuadSeqWritable] 43 | { 44 | private val CONFIG_PROPERTIES = "dbpedia.config.properties" 45 | private val WIKI = "dbpedia.wiki.name" 46 | private val LANGUAGE = "dbpedia.wiki.language.wikicode" 47 | private val DATE = "dbpedia.wiki.date" 48 | 49 | private class DBpediaCompositeRecordWriter(context: TaskAttemptContext) extends RecordWriter[Text, QuadSeqWritable] 50 | { 51 | private val recordWriters = mutable.Map[String, RecordWriter[Text, QuadSeqWritable]]() 52 | private val conf = context.getConfiguration 53 | private val configPropertiesDCPath = conf.get(CONFIG_PROPERTIES) 54 | private val wikiName = conf.get(WIKI) 55 | private val langCode = conf.get(LANGUAGE) 56 | private val date = conf.get(DATE) 57 | private val localConfigPropertiesFile = new Path("./config.properties") 58 | private val formatters = 59 | { 60 | // Deserialize the config Properties object to get the Formatters 61 | println(context.getCacheFiles.mkString("\n")) 62 | val configProperties = context.getCacheFiles.find(_.getPath == configPropertiesDCPath).get 63 | 64 | val fs = FileSystem.get(conf) 65 | // copy config file from distributed cache to raw local FS 66 | fs.copyToLocalFile(false, new Path(configProperties), localConfigPropertiesFile, true) 67 | 68 | val config = ConfigUtils.loadConfig(localConfigPropertiesFile.toString, "UTF-8") 69 | UriPolicy.parseFormats(config, "uri-policy", "format") 70 | } 71 | 72 | /** 73 | * Note: This method is not synchronized, keeping with the rest of the Hadoop code in this framework. 74 | * When using this with Spark set only one core per worker to ensure that only one thread accesses 75 | * this method per JVM. 76 | */ 77 | override def write(key: Text, value: QuadSeqWritable) 78 | { 79 | for ((suffix, format) <- formatters) 80 | { 81 | // Each RecordReader writes Quads to corresponding datasets depending upon the Text key. 82 | // See DBpediaDatasetOutputFormat and MultipleTextOutputFormat for details. 
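// ----------------------------------------------------------------------------
// [Editor's note: illustrative sketch, not part of the original sources; it
// only restates the configuration keys documented above. The output path, the
// grouped RDD and the way the config file reaches the distributed cache are
// assumptions, not code from this project (imports assumed).]

val conf = new org.apache.hadoop.conf.Configuration()
conf.set("dbpedia.wiki.name", "wiki")
conf.set("dbpedia.wiki.language.wikicode", "en")
conf.set("dbpedia.wiki.date", "20140614")
conf.setBoolean("dbpedia.output.overwrite", true)
conf.set("dbpedia.config.properties", "/user/dbpedia/config.properties") // must also be in the distributed cache, see point 2 above

// quadsByDataset: hypothetical RDD[(Text, QuadSeqWritable)], one entry per dataset name
quadsByDataset.saveAsNewAPIHadoopFile("hdfs:///dbpedia/output/enwiki-20140614",
  classOf[Text], classOf[QuadSeqWritable], classOf[DBpediaCompositeOutputFormat], conf)
// ----------------------------------------------------------------------------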
83 | val writer = recordWriters.getOrElseUpdate(suffix, new DBpediaDatasetOutputFormat( 84 | langCode, 85 | wikiName, 86 | date, 87 | suffix, 88 | format 89 | ).getRecordWriter(context)) 90 | writer.write(key, value) 91 | } 92 | } 93 | 94 | override def close(context: TaskAttemptContext) = recordWriters.foreach(_._2.close(context)) 95 | } 96 | 97 | override def getRecordWriter(context: TaskAttemptContext): RecordWriter[Text, QuadSeqWritable] = new DBpediaCompositeRecordWriter(context) 98 | 99 | override def checkOutputSpecs(job: JobContext) = () // allow overwriting output directory 100 | } 101 | -------------------------------------------------------------------------------- /extraction/src/main/scala/org/dbpedia/extraction/spark/io/output/DBpediaDatasetOutputFormat.scala: -------------------------------------------------------------------------------- 1 | package org.dbpedia.extraction.spark.io.output 2 | 3 | import org.apache.hadoop.io.{Text, NullWritable} 4 | import org.dbpedia.extraction.destinations.formatters.Formatter 5 | import org.apache.hadoop.mapreduce.{TaskAttemptContext, RecordWriter} 6 | import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat.LineRecordWriter 7 | import org.dbpedia.extraction.spark.io.QuadSeqWritable 8 | import java.io.DataOutputStream 9 | import org.apache.hadoop.io.compress.CompressionCodec 10 | 11 | /** 12 | * OutputFormat implementation that writes Quads to respective datasets depending upon the key, after applying 13 | * a given Formatter. This class extends MultipleTextOutputFormat which allows it to write to multiple locations 14 | * (for multiple datasets) depending upon custom criteria. 15 | * 16 | * The output needs to be grouped by dataset such that each key is a Text representing the dataset to which 17 | * the Quads in the value belong to. Example key: article_categories 18 | * 19 | * @param langWikiCode Language wiki code of the input wiki dump 20 | * @param wikiNameSuffix Config.wikiName (eg. wiki) 21 | * @param date Wiki dump date in YYYYMMDD format 22 | * @param outputSuffix Output suffix corresponding to formatter (eg. tql) 23 | * @param formatter Formatter object used to render the Quad objects according to a specific format 24 | */ 25 | class DBpediaDatasetOutputFormat(langWikiCode: String, 26 | wikiNameSuffix: String, 27 | date: String, 28 | outputSuffix: String, 29 | formatter: Formatter) extends MultipleTextOutputFormat[Text, QuadSeqWritable] 30 | { 31 | /** 32 | * Construct the underlying RecordWriter. By default creates a LineRecordWriter that is used by 33 | * TextOutputFormat by default. 34 | * 35 | * @param context TaskAttemptContext 36 | * @param out DataOutputStream where output data is written to 37 | * @param keyValueSeparator String separator between output key and value 38 | * @param codec Option[CompressionCodec] for handling compression 39 | * @return A RecordWriter object over the given DataOutputStream 40 | */ 41 | override protected def getBaseRecordWriter(context: TaskAttemptContext, 42 | out: DataOutputStream, 43 | keyValueSeparator: String, 44 | codec: Option[CompressionCodec] = None): RecordWriter[Text, QuadSeqWritable] = 45 | { 46 | // Get a LineRecordWriter (the usual RecordWriter used by TextOutputFormat) that ignores keys and writes Text outputs. 47 | val lineWriter = codec match 48 | { 49 | case Some(c) => 50 | // Have we an output compression codec? 
51 | new LineRecordWriter[NullWritable, Text]( 52 | new DataOutputStream(c.createOutputStream(out)), 53 | keyValueSeparator 54 | ) 55 | case _ => 56 | new LineRecordWriter[NullWritable, Text](out, keyValueSeparator) 57 | } 58 | 59 | new DBpediaDatasetRecordWriter(lineWriter) 60 | } 61 | 62 | /** 63 | * If inferCodecFromPathName is set to true, the output compression codec will be inferred from the suffix/extension 64 | * in pathName (eg. tql.gz implies GzipCodec is used), otherwise it uses Hadoop configuration settings. 65 | */ 66 | override protected val inferCodecFromPathName = true 67 | 68 | /** 69 | * Generate the output file name (the directory where the leaf part-* files will be written to) 70 | * based on the given key and value. The default behavior is that the file name does not depend on them. 71 | * That is, by default this method returns an empty String. 72 | * 73 | * @param key the key of the output data 74 | * @return generated file name 75 | */ 76 | override protected def generateFileNameForKeyValue(key: Text, value: QuadSeqWritable): String = 77 | { 78 | val datasetName = key.toString 79 | // eg. enwiki-20140614-article-categories.tql 80 | s"$langWikiCode$wikiNameSuffix-$date-${datasetName.replace('_', '-')}.$outputSuffix" 81 | } 82 | 83 | /** 84 | * RecordWriter that wraps a LineRecordWriter, applies the given Formatter on a Seq[Quad] and writes to 85 | * the LineRecordWriter. 86 | */ 87 | private class DBpediaDatasetRecordWriter(lineWriter: LineRecordWriter[NullWritable, Text]) extends RecordWriter[Text, QuadSeqWritable] 88 | { 89 | private val text = new Text("") 90 | private val nullKey = NullWritable.get() 91 | 92 | // Begin writing split with formatter header 93 | text.set(formatter.header.dropRight(1)) // remove newline from header 94 | lineWriter.write(nullKey, text) 95 | 96 | /** 97 | * Note: This method is not synchronized, keeping with the rest of the Hadoop code in this framework. 98 | * When using this with Spark, set only one core per worker to ensure that only one thread accesses 99 | * this method per JVM. 
100 | */ 101 | override def write(key: Text, value: QuadSeqWritable) = 102 | { 103 | for (quad <- value.get) 104 | { 105 | text.set(formatter.render(quad).dropRight(1)) // remove newline from rendered output 106 | lineWriter.write(nullKey, text) 107 | } 108 | } 109 | 110 | override def close(context: TaskAttemptContext) = 111 | { 112 | text.set(formatter.footer.dropRight(1)) // remove newline from footer 113 | lineWriter.write(nullKey, text) 114 | lineWriter.close(context) 115 | } 116 | } 117 | 118 | } 119 | -------------------------------------------------------------------------------- /extraction/src/main/scala/org/dbpedia/extraction/spark/io/output/MultipleTextOutputFormat.scala: -------------------------------------------------------------------------------- 1 | package org.dbpedia.extraction.spark.io.output 2 | 3 | import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat 4 | import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat._ 5 | import org.apache.hadoop.mapreduce.{TaskAttemptContext, RecordWriter} 6 | import scala.collection.mutable 7 | import org.apache.hadoop.fs.Path 8 | import org.apache.hadoop.util.ReflectionUtils 9 | import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat.LineRecordWriter 10 | import java.io.DataOutputStream 11 | import org.apache.hadoop.io.compress.{CompressionCodecFactory, CompressionCodec} 12 | 13 | /** 14 | * This class extends TextOutputFormat and allows writing output to multiple output files depending upon custom criteria. It filters 15 | * every key-value pair and routes them to the corresponding locations. 16 | * 17 | * Configuration variables: 18 | * dbpedia.output.overwrite - Boolean, if set to true, output files will be overwritten if they already exist, 19 | * or else an IOException will be thrown (which is also the default behaviour) 20 | */ 21 | class MultipleTextOutputFormat[K, V] extends TextOutputFormat[K, V] 22 | { 23 | private val OVERWRITE = "dbpedia.output.overwrite" 24 | 25 | private class MultipleTextRecordWriter(context: TaskAttemptContext) extends RecordWriter[K, V] 26 | { 27 | private val recordWriters = mutable.Map[String, RecordWriter[K, V]]() 28 | 29 | /** 30 | * Note: This method is not synchronized, keeping with the rest of the Hadoop code in this framework. 31 | * When using this with Spark, set only one core per worker to ensure that only one thread accesses 32 | * this method per JVM. 33 | */ 34 | override def write(key: K, value: V) 35 | { 36 | // Generate the path depending upon key-value pair 37 | val finalPath = generateFileNameForKeyValue(key, value) 38 | 39 | // Extract the actual key and value 40 | val actualKey = generateActualKey(key, value) 41 | val actualValue = generateActualValue(key, value) 42 | 43 | // Get the RecordWriter for finalPath or create one if needed 44 | val writer = recordWriters.getOrElseUpdate(finalPath, createRecordWriter(finalPath, context)) 45 | writer.write(actualKey, actualValue) 46 | } 47 | 48 | override def close(context: TaskAttemptContext) = recordWriters.foreach(_._2.close(context)) 49 | } 50 | 51 | override def getRecordWriter(context: TaskAttemptContext): RecordWriter[K, V] = new MultipleTextRecordWriter(context) 52 | 53 | /** 54 | * Create a new RecordWriter based on the modified output path and the RecordWriter implementation 55 | * returned by getBaseRecordWriter().
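// ----------------------------------------------------------------------------
// [Editor's note: illustrative sketch, not part of the original sources.]
// A minimal subclass that routes each record into a directory named after its
// key, using the generateFileNameForKeyValue hook defined further below in this
// class; the subclass name is hypothetical.

class KeyPartitionedTextOutputFormat extends MultipleTextOutputFormat[org.apache.hadoop.io.Text, org.apache.hadoop.io.Text]
{
  override protected def generateFileNameForKeyValue(key: org.apache.hadoop.io.Text, value: org.apache.hadoop.io.Text): String = key.toString
}
// ----------------------------------------------------------------------------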
56 | */ 57 | private def createRecordWriter(pathName: String, context: TaskAttemptContext): RecordWriter[K, V] = 58 | { 59 | val conf = context.getConfiguration 60 | val keyValueSeparator = conf.get(TextOutputFormat.SEPERATOR, "\t") 61 | // If overwriteOutput is set to true, output files will be overwritten if they already exist, 62 | // or else an IOException will be thrown (which is also the default behaviour) 63 | val overwriteOutput = conf.getBoolean(OVERWRITE, false) 64 | 65 | val (codec, file) = if (inferCodecFromPathName) 66 | { 67 | val extension = pathName.substring(pathName.lastIndexOf('.')) 68 | // Get modified suffixed path 69 | val file = getModifiedWorkFile(pathName, context, extension) 70 | // Returns Option[CompressionCodec] or None depending on file extension 71 | val codec = Option(new CompressionCodecFactory(conf).getCodec(file)) 72 | (codec, file) 73 | } 74 | else 75 | { 76 | val isCompressed = getCompressOutput(context) 77 | if (isCompressed) 78 | { 79 | // Get the CompressionCodec from job configuration 80 | val codecClass = getOutputCompressorClass(context, classOf[CompressionCodec]) 81 | val codec = ReflectionUtils.newInstance(codecClass, conf) 82 | val file = getModifiedWorkFile(pathName, context, codec.getDefaultExtension) 83 | (Some(codec), file) 84 | } 85 | else 86 | { 87 | val file = getModifiedWorkFile(pathName, context, "") 88 | (None, file) 89 | } 90 | } 91 | 92 | val fs = file.getFileSystem(conf) 93 | val fileOutputStream = fs.create(file, overwriteOutput) 94 | 95 | getBaseRecordWriter(context, fileOutputStream, keyValueSeparator, codec) 96 | } 97 | 98 | /** 99 | * Gets the default output path and inserts directoryName between the parent directory and leaf file (part-*). 100 | */ 101 | private def getModifiedWorkFile(directoryName: String, 102 | context: TaskAttemptContext, 103 | extension: String): Path = 104 | { 105 | val path = super.getDefaultWorkFile(context, extension) 106 | new Path(new Path(path.getParent, directoryName), path.getName) 107 | } 108 | 109 | /** 110 | * If inferCodecFromPathName is set to true, the output compression codec will be inferred from the suffix/extension 111 | * in pathName (eg. foobar.gz implies GzipCodec is used), otherwise it uses Hadoop configuration settings. 112 | * 113 | * The default behaviour is to use Hadoop configuration settings. 114 | */ 115 | protected val inferCodecFromPathName: Boolean = false 116 | 117 | /** 118 | * Construct the underlying RecordWriter. By default creates a LineRecordWriter that is used by 119 | * TextOutputFormat by default. 120 | * 121 | * @param context TaskAttemptContext 122 | * @param out DataOutputStream where output data is written to 123 | * @param keyValueSeparator String separator between output key and value 124 | * @param codec Option[CompressionCodec] for handling compression 125 | * @return A RecordWriter object over the given DataOutputStream 126 | */ 127 | protected def getBaseRecordWriter(context: TaskAttemptContext, 128 | out: DataOutputStream, 129 | keyValueSeparator: String, 130 | codec: Option[CompressionCodec] = None): RecordWriter[K, V] = 131 | { 132 | codec match 133 | { 134 | case Some(c) => 135 | // Have we an output compression codec? 
136 | new LineRecordWriter[K, V]( 137 | new DataOutputStream(c.createOutputStream(out)), 138 | keyValueSeparator 139 | ) 140 | case _ => 141 | new LineRecordWriter[K, V](out, keyValueSeparator) 142 | } 143 | } 144 | 145 | /** 146 | * Generate the output file name (the directory where the leaf part-* files will be written to) 147 | * based on the given key and value. The default behavior is that the file name does not depend on them. 148 | * That is, by default this method returns an empty String. 149 | * 150 | * @param key the key of the output data 151 | * @return generated file name 152 | */ 153 | protected def generateFileNameForKeyValue(key: K, value: V): String = "" 154 | 155 | /** 156 | * Generate the actual key from the given key/value. The default behavior is that 157 | * the actual key is equal to the given key. 158 | * 159 | * @param key the key of the output data 160 | * @param value the value of the output data 161 | * @return the actual key derived from the given key/value 162 | */ 163 | protected def generateActualKey(key: K, value: V): K = key 164 | 165 | /** 166 | * Generate the actual value from the given key and value. The default behavior is that 167 | * the actual value is equal to the given value. 168 | * 169 | * @param key the key of the output data 170 | * @param value the value of the output data 171 | * @return the actual value derived from the given key/value 172 | */ 173 | protected def generateActualValue(key: K, value: V): V = value 174 | } 175 | -------------------------------------------------------------------------------- /extraction/src/main/scala/org/dbpedia/extraction/spark/serialize/KryoExtractionRegistrator.scala: -------------------------------------------------------------------------------- 1 | package org.dbpedia.extraction.spark.serialize 2 | 3 | import org.apache.spark.serializer.KryoRegistrator 4 | import com.esotericsoftware.kryo.Kryo 5 | import scala.Console._ 6 | import org.dbpedia.extraction.sources.WikiPage 7 | import org.dbpedia.extraction.wikiparser.{Namespace, WikiTitle} 8 | import org.dbpedia.extraction.util.Language 9 | import java.util.logging.Logger 10 | import org.dbpedia.extraction.dataparser.ParserUtils 11 | 12 | /** 13 | * It's best to register the classes that will be serialized/deserialized with Kryo. 
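// ----------------------------------------------------------------------------
// [Editor's note: illustrative sketch, not part of the original sources.]
// A registrator such as the one below is normally wired into Spark through the
// standard configuration keys; how this project sets them is not shown here,
// so treat the snippet as an assumption about typical usage.

val sparkConf = new org.apache.spark.SparkConf()
  .set("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
  .set("spark.kryo.registrator", "org.dbpedia.extraction.spark.serialize.KryoExtractionRegistrator")
// ----------------------------------------------------------------------------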
14 | */ 15 | class KryoExtractionRegistrator extends KryoRegistrator 16 | { 17 | override def registerClasses(kryo: Kryo) 18 | { 19 | kryo.register(classOf[Array[Object]]) 20 | kryo.register(classOf[org.dbpedia.extraction.dataparser.GeoCoordinateParser]) 21 | kryo.register(classOf[org.dbpedia.extraction.dataparser.SingleGeoCoordinateParser]) 22 | kryo.register(classOf[org.dbpedia.extraction.destinations.Dataset]) 23 | kryo.register(classOf[org.dbpedia.extraction.destinations.Quad]) 24 | kryo.register(classOf[org.dbpedia.extraction.dump.extract.DistConfigLoader]) 25 | kryo.register(classOf[org.dbpedia.extraction.dump.extract.DumpExtractionContext]) 26 | kryo.register(classOf[org.dbpedia.extraction.dump.extract.DumpExtractionContextWrapper]) 27 | kryo.register(classOf[org.dbpedia.extraction.mappings.ArticleCategoriesExtractor]) 28 | kryo.register(classOf[org.dbpedia.extraction.mappings.ArticlePageExtractor]) 29 | kryo.register(classOf[org.dbpedia.extraction.mappings.ArticleTemplatesExtractor]) 30 | kryo.register(classOf[org.dbpedia.extraction.mappings.CategoryLabelExtractor]) 31 | kryo.register(classOf[org.dbpedia.extraction.mappings.CompositeParseExtractor]) 32 | kryo.register(classOf[org.dbpedia.extraction.mappings.DistRedirects]) 33 | kryo.register(classOf[org.dbpedia.extraction.mappings.ExternalLinksExtractor]) 34 | kryo.register(classOf[org.dbpedia.extraction.mappings.GeoExtractor]) 35 | kryo.register(classOf[org.dbpedia.extraction.mappings.InfoboxExtractor]) 36 | kryo.register(classOf[org.dbpedia.extraction.mappings.InterLanguageLinksExtractor]) 37 | kryo.register(classOf[org.dbpedia.extraction.mappings.LabelExtractor]) 38 | kryo.register(classOf[org.dbpedia.extraction.mappings.PageIdExtractor]) 39 | kryo.register(classOf[org.dbpedia.extraction.mappings.PageLinksExtractor]) 40 | kryo.register(classOf[org.dbpedia.extraction.mappings.ProvenanceExtractor]) 41 | kryo.register(classOf[org.dbpedia.extraction.mappings.RedirectExtractor]) 42 | kryo.register(classOf[org.dbpedia.extraction.mappings.Redirects]) 43 | kryo.register(classOf[org.dbpedia.extraction.mappings.RevisionIdExtractor]) 44 | kryo.register(classOf[org.dbpedia.extraction.mappings.RootExtractor]) 45 | kryo.register(classOf[org.dbpedia.extraction.mappings.SkosCategoriesExtractor]) 46 | kryo.register(classOf[org.dbpedia.extraction.dataparser.ParserUtils]) 47 | kryo.register(classOf[org.dbpedia.extraction.ontology.datatypes.Datatype]) 48 | kryo.register(classOf[org.dbpedia.extraction.ontology.OntologyClass]) 49 | kryo.register(classOf[org.dbpedia.extraction.ontology.OntologyDatatypeProperty]) 50 | kryo.register(classOf[org.dbpedia.extraction.ontology.OntologyObjectProperty]) 51 | kryo.register(classOf[org.dbpedia.extraction.ontology.OntologyProperty]) 52 | kryo.register(Class.forName("scala.collection.immutable.$colon$colon")) 53 | kryo.register(Class.forName("scala.collection.immutable.Map$EmptyMap$")) 54 | kryo.register(Class.forName("scala.collection.immutable.Nil$")) 55 | kryo.register(Class.forName("scala.collection.immutable.Set$EmptySet$")) 56 | kryo.register(classOf[scala.collection.mutable.ArrayBuffer[_]]) 57 | kryo.register(classOf[Array[scala.collection.Seq[_]]]) 58 | kryo.register(classOf[scala.runtime.BoxedUnit]) 59 | kryo.register(classOf[Array[scala.Tuple2[_,_]]]) 60 | kryo.register(classOf[scala.util.matching.Regex]) 61 | kryo.register(classOf[WikiPage], new WikiPageSerializer) 62 | kryo.register(classOf[WikiTitle], new WikiTitleSerializer) 63 | kryo.register(classOf[Namespace]) 64 | kryo.register(classOf[Language], 
new LanguageSerializer) 65 | kryo.register(classOf[Logger], new LoggerSerializer) 66 | kryo.register(classOf[ParserUtils], new ParserUtilsSerializer) 67 | } 68 | } -------------------------------------------------------------------------------- /extraction/src/main/scala/org/dbpedia/extraction/spark/serialize/KryoSerializationWrapper.scala: -------------------------------------------------------------------------------- 1 | package org.dbpedia.extraction.spark.serialize 2 | 3 | import scala.reflect.ClassTag 4 | 5 | /** 6 | * A wrapper around some unserializable objects that make them both Java 7 | * serializable. Internally, Kryo is used for serialization. 8 | * 9 | * Use KryoSerializationWrapper(value) to create a wrapper. 10 | */ 11 | class KryoSerializationWrapper[T: ClassTag] extends Serializable 12 | { 13 | 14 | @transient var value: T = _ 15 | 16 | private var valueSerialized: Array[Byte] = _ 17 | 18 | // The getter and setter for valueSerialized is used for XML serialization. 19 | def getValueSerialized(): Array[Byte] = 20 | { 21 | valueSerialized = KryoSerializer.serialize(value) 22 | valueSerialized 23 | } 24 | 25 | def setValueSerialized(bytes: Array[Byte]) = 26 | { 27 | valueSerialized = bytes 28 | value = KryoSerializer.deserialize[T](valueSerialized) 29 | } 30 | 31 | // Used for Java serialization. 32 | private def writeObject(out: java.io.ObjectOutputStream) 33 | { 34 | getValueSerialized() 35 | out.defaultWriteObject() 36 | } 37 | 38 | private def readObject(in: java.io.ObjectInputStream) 39 | { 40 | in.defaultReadObject() 41 | setValueSerialized(valueSerialized) 42 | } 43 | } 44 | 45 | 46 | object KryoSerializationWrapper 47 | { 48 | def apply[T: ClassTag](value: T): KryoSerializationWrapper[T] = 49 | { 50 | val wrapper = new KryoSerializationWrapper[T] 51 | wrapper.value = value 52 | wrapper 53 | } 54 | } -------------------------------------------------------------------------------- /extraction/src/main/scala/org/dbpedia/extraction/spark/serialize/KryoSerializer.scala: -------------------------------------------------------------------------------- 1 | package org.dbpedia.extraction.spark.serialize 2 | 3 | import java.nio.ByteBuffer 4 | 5 | import org.apache.spark.{SparkConf, SparkEnv} 6 | import org.apache.spark.serializer.{KryoSerializer => SparkKryoSerializer} 7 | import scala.reflect.ClassTag 8 | 9 | 10 | /** 11 | * Java object serialization using Kryo. This is much more efficient, but Kryo 12 | * sometimes is buggy to use. We use this mainly to serialize the object 13 | * inspectors. 
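// ----------------------------------------------------------------------------
// [Editor's note: illustrative sketch, not part of the original sources.]
// KryoSerializationWrapper above lets a Kryo-encoded payload travel through
// plain Java serialization (e.g. inside a Spark closure): writeObject captures
// the Kryo bytes, readObject restores `value`. A self-contained round trip,
// using an arbitrary array as the payload:

import java.io.{ByteArrayInputStream, ByteArrayOutputStream, ObjectInputStream, ObjectOutputStream}

val wrapped = KryoSerializationWrapper(Array("foo", "bar"))
val buffer = new ByteArrayOutputStream()
val oos = new ObjectOutputStream(buffer)
oos.writeObject(wrapped) // Kryo bytes are produced here via getValueSerialized()
oos.close()
val restored = new ObjectInputStream(new ByteArrayInputStream(buffer.toByteArray))
  .readObject().asInstanceOf[KryoSerializationWrapper[Array[String]]]
assert(restored.value.toSeq == Seq("foo", "bar"))
// ----------------------------------------------------------------------------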
14 | */ 15 | object KryoSerializer 16 | { 17 | 18 | @transient lazy val ser: SparkKryoSerializer = 19 | { 20 | val sparkConf = Option(SparkEnv.get).map(_.conf).getOrElse(new SparkConf()) 21 | new SparkKryoSerializer(sparkConf) 22 | } 23 | 24 | def serialize[T: ClassTag](o: T): Array[Byte] = 25 | { 26 | ser.newInstance().serialize(o).array() 27 | } 28 | 29 | def deserialize[T: ClassTag](bytes: Array[Byte]): T = 30 | { 31 | ser.newInstance().deserialize[T](ByteBuffer.wrap(bytes)) 32 | } 33 | } -------------------------------------------------------------------------------- /extraction/src/main/scala/org/dbpedia/extraction/spark/serialize/LanguageSerializer.scala: -------------------------------------------------------------------------------- 1 | package org.dbpedia.extraction.spark.serialize 2 | 3 | import com.esotericsoftware.kryo.{Kryo, Serializer} 4 | import com.esotericsoftware.kryo.io.{Input, Output} 5 | import org.dbpedia.extraction.util.Language 6 | 7 | /** 8 | * Kryo serializer for org.dbpedia.extraction.util.Language 9 | */ 10 | class LanguageSerializer extends Serializer[Language] 11 | { 12 | override def write(kryo: Kryo, output: Output, language: Language) 13 | { 14 | output.writeString(language.wikiCode) 15 | } 16 | 17 | override def read(kryo: Kryo, input: Input, languageClass: Class[Language]): Language = 18 | { 19 | val wikiCode = input.readString() 20 | Language(wikiCode) 21 | } 22 | } 23 | -------------------------------------------------------------------------------- /extraction/src/main/scala/org/dbpedia/extraction/spark/serialize/LocaleSerializer.scala: -------------------------------------------------------------------------------- 1 | package org.dbpedia.extraction.spark.serialize 2 | 3 | import com.esotericsoftware.kryo.{Kryo, Serializer} 4 | import com.esotericsoftware.kryo.io.{Output, Input} 5 | import java.util.Locale 6 | 7 | /** 8 | * Kryo serializer for java.util.Locale 9 | */ 10 | class LocaleSerializer extends Serializer[Locale] 11 | { 12 | override def write(kryo: Kryo, output: Output, locale: Locale) 13 | { 14 | output.writeAscii(locale.getLanguage) 15 | output.writeAscii(locale.getCountry) 16 | output.writeAscii(locale.getVariant) 17 | } 18 | 19 | override def read(kryo: Kryo, input: Input, localeClass: Class[Locale]): Locale = 20 | { 21 | new Locale(input.readString(), input.readString(), input.readString()) 22 | } 23 | } 24 | -------------------------------------------------------------------------------- /extraction/src/main/scala/org/dbpedia/extraction/spark/serialize/LoggerSerializer.scala: -------------------------------------------------------------------------------- 1 | package org.dbpedia.extraction.spark.serialize 2 | 3 | import com.esotericsoftware.kryo.{Kryo, Serializer} 4 | import com.esotericsoftware.kryo.io.{Input, Output} 5 | import org.dbpedia.extraction.util.Language 6 | import java.util.logging.Logger 7 | 8 | /** 9 | * Kryo serializer for org.dbpedia.extraction.util.Language 10 | */ 11 | class LoggerSerializer extends Serializer[Logger] 12 | { 13 | override def write(kryo: Kryo, output: Output, logger: Logger) 14 | { 15 | output.writeString(logger.getName) 16 | } 17 | 18 | override def read(kryo: Kryo, input: Input, loggerClass: Class[Logger]): Logger = 19 | { 20 | val className = input.readString() 21 | Logger.getLogger(className) 22 | } 23 | } 24 | -------------------------------------------------------------------------------- /extraction/src/main/scala/org/dbpedia/extraction/spark/serialize/ParserUtilsSerializer.scala: 
-------------------------------------------------------------------------------- 1 | package org.dbpedia.extraction.spark.serialize 2 | 3 | import com.esotericsoftware.kryo.{Kryo, Serializer} 4 | import com.esotericsoftware.kryo.io.{Input, Output} 5 | import org.dbpedia.extraction.dataparser.ParserUtils 6 | import org.dbpedia.extraction.util.Language 7 | 8 | /** 9 | * Kryo serializer for org.dbpedia.extraction.dataparser.ParserUtils 10 | */ 11 | class ParserUtilsSerializer extends Serializer[ParserUtils] 12 | { 13 | override def write(kryo: Kryo, output: Output, parserUtils: ParserUtils) { 14 | kryo.writeObjectOrNull(output, parserUtils.context.language, new LanguageSerializer) 15 | } 16 | 17 | override def read(kryo: Kryo, input: Input, parserUtilsClass: Class[ParserUtils]): ParserUtils = { 18 | val lang = kryo.readObjectOrNull(input, classOf[Language], new LanguageSerializer) 19 | new ParserUtils(new {def language: Language = lang}) 20 | } 21 | } -------------------------------------------------------------------------------- /extraction/src/main/scala/org/dbpedia/extraction/spark/serialize/WikiPageSerializer.scala: -------------------------------------------------------------------------------- 1 | package org.dbpedia.extraction.spark.serialize 2 | 3 | import com.esotericsoftware.kryo.{Kryo, Serializer} 4 | import org.dbpedia.extraction.sources.WikiPage 5 | import com.esotericsoftware.kryo.io.{Output, Input} 6 | import org.dbpedia.extraction.wikiparser.WikiTitle 7 | 8 | /** 9 | * Kryo serializer for org.dbpedia.extraction.sources.WikiPage 10 | */ 11 | class WikiPageSerializer extends Serializer[WikiPage] 12 | { 13 | override def write(kryo: Kryo, output: Output, wikiPage: WikiPage) 14 | { 15 | kryo.writeObjectOrNull(output, wikiPage.title, new WikiTitleSerializer) 16 | kryo.writeObjectOrNull(output, wikiPage.redirect, new WikiTitleSerializer) 17 | output.writeLong(wikiPage.id) 18 | output.writeLong(wikiPage.revision) 19 | output.writeLong(wikiPage.timestamp) 20 | output.writeLong(wikiPage.contributorID) 21 | output.writeString(wikiPage.contributorName) 22 | output.writeString(wikiPage.source) 23 | output.writeString(wikiPage.format) 24 | } 25 | 26 | override def read(kryo: Kryo, input: Input, wikiPageClass: Class[WikiPage]): WikiPage = 27 | { 28 | val title = kryo.readObjectOrNull(input, classOf[WikiTitle], new WikiTitleSerializer) 29 | val redirect = kryo.readObjectOrNull(input, classOf[WikiTitle], new WikiTitleSerializer) 30 | val id = input.readLong() 31 | val revision = input.readLong() 32 | val timestamp = input.readLong() 33 | val contributorID = input.readLong() 34 | val contributorName = input.readString() 35 | val source = input.readString() 36 | val format = input.readString() 37 | new WikiPage(title, redirect, id, revision, timestamp, contributorID, contributorName, source, format) 38 | } 39 | } 40 | -------------------------------------------------------------------------------- /extraction/src/main/scala/org/dbpedia/extraction/spark/serialize/WikiTitleSerializer.scala: -------------------------------------------------------------------------------- 1 | package org.dbpedia.extraction.spark.serialize 2 | 3 | import com.esotericsoftware.kryo.{Kryo, Serializer} 4 | import com.esotericsoftware.kryo.serializers.FieldSerializer 5 | import com.esotericsoftware.kryo.io.{Input, Output} 6 | import org.dbpedia.extraction.wikiparser.{Namespace, WikiTitle} 7 | import org.dbpedia.extraction.util.Language 8 | 9 | /** 10 | * Kryo serializer for org.dbpedia.extraction.wikiparser.WikiTitle 11 
| */ 12 | class WikiTitleSerializer extends Serializer[WikiTitle] 13 | { 14 | override def write(kryo: Kryo, output: Output, wikiTitle: WikiTitle) 15 | { 16 | output.writeString(wikiTitle.decoded) 17 | kryo.writeObjectOrNull(output, wikiTitle.language, new LanguageSerializer) 18 | kryo.writeObjectOrNull(output, wikiTitle.namespace, new FieldSerializer(kryo, classOf[Namespace])) 19 | output.writeBoolean(wikiTitle.isInterLanguageLink) 20 | output.writeString(wikiTitle.fragment) 21 | } 22 | 23 | override def read(kryo: Kryo, input: Input, wikiTitleClass: Class[WikiTitle]): WikiTitle = 24 | { 25 | val decoded = input.readString() 26 | val language = kryo.readObjectOrNull(input, classOf[Language], new LanguageSerializer) 27 | val namespace = kryo.readObjectOrNull(input, classOf[Namespace], new FieldSerializer(kryo, classOf[Namespace])) 28 | val isInterLanguageLink = input.readBoolean() 29 | val fragment = input.readString() 30 | new WikiTitle(decoded, namespace, language, isInterLanguageLink, fragment) 31 | } 32 | } 33 | -------------------------------------------------------------------------------- /extraction/src/main/scala/org/dbpedia/extraction/util/DistIOUtils.scala: -------------------------------------------------------------------------------- 1 | package org.dbpedia.extraction.util 2 | 3 | import com.esotericsoftware.kryo.Kryo 4 | import org.dbpedia.extraction.spark.serialize.KryoSerializer 5 | import org.apache.spark.SparkContext 6 | import org.apache.spark.rdd.RDD 7 | import org.apache.hadoop.io.{BytesWritable, NullWritable} 8 | import java.io.{ByteArrayInputStream, ByteArrayOutputStream} 9 | import com.esotericsoftware.kryo.io.{Input, Output} 10 | import org.apache.spark.SparkContext._ 11 | import scala.reflect.ClassTag 12 | import org.apache.hadoop.fs.Path 13 | import org.apache.hadoop.conf.Configuration 14 | import org.apache.hadoop.mapreduce.Job 15 | import org.apache.hadoop.mapreduce.lib.input.{SequenceFileInputFormat, FileInputFormat} 16 | 17 | /** 18 | * Kryo file operations helper methods 19 | */ 20 | object DistIOUtils 21 | { 22 | private val kryo: ThreadLocal[Kryo] = new ThreadLocal[Kryo] 23 | { 24 | override def initialValue = getNewKryo() 25 | } 26 | 27 | /** 28 | * @return returns a thread-local instance of Kryo 29 | */ 30 | def getKryoInstance: Kryo = kryo.get() 31 | 32 | /** 33 | * @return new Kryo instance. 34 | */ 35 | def getNewKryo(): Kryo = KryoSerializer.ser.newKryo() 36 | 37 | /** 38 | * Loads an RDD saved as a SequenceFile containing objects serialized by Kryo, 39 | * with NullWritable keys and BytesWritable values. 40 | * @param sc SparkContext 41 | * @param path String path to existing file. Can be on local file system or HDFS, S3 etc. See Spark docs. 42 | * @return deserialized RDD 43 | */ 44 | def loadRDD[T: ClassTag](sc: SparkContext, rddClass: Class[T], path: String): RDD[T] = 45 | { 46 | val arrayOfRddClass = Class.forName("[L" + rddClass.getName + ";") 47 | val serializedRDD = sc.sequenceFile(path, classOf[NullWritable], classOf[BytesWritable]) 48 | serializedRDD.values.flatMap(x => deserialize(x.getBytes, arrayOfRddClass).asInstanceOf[Array[T]]) 49 | } 50 | 51 | /** 52 | * Loads an RDD saved as a SequenceFile containing objects serialized by Kryo, 53 | * with NullWritable keys and BytesWritable values. 54 | * @param sc SparkContext 55 | * @param path String path to existing file. Can be on local file system or HDFS, S3 etc. See Spark docs. 
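// ----------------------------------------------------------------------------
// [Editor's note: illustrative sketch, not part of the original sources.]
// Typical round trip through the Kryo-encoded SequenceFile helpers; saveRDD is
// defined further below in this file. `sc`, `pages` (an RDD[WikiPage]) and the
// HDFS path are assumptions made for the example.

DistIOUtils.saveRDD(pages, "hdfs:///dbpedia/cache/enwiki-20140614-pages")
val restored = DistIOUtils.loadRDD(sc, classOf[WikiPage], "hdfs:///dbpedia/cache/enwiki-20140614-pages")
// ----------------------------------------------------------------------------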
56 | * @return deserialized RDD 57 | */ 58 | def loadRDD[T: ClassTag](sc: SparkContext, rddClass: Class[T], path: Path): RDD[T] = 59 | { 60 | val arrayOfRddClass = Class.forName("[L" + rddClass.getName + ";") 61 | val conf = new Configuration() 62 | val job = Job.getInstance(conf) 63 | FileInputFormat.addInputPath(job, path) 64 | val updatedConf = job.getConfiguration 65 | val serializedRDD = sc.newAPIHadoopRDD(updatedConf, classOf[SequenceFileInputFormat[NullWritable, BytesWritable]], classOf[NullWritable], classOf[BytesWritable]) 66 | serializedRDD.values.flatMap(x => deserialize(x.getBytes, arrayOfRddClass).asInstanceOf[Array[T]]) 67 | } 68 | 69 | /** 70 | * Saves an RDD as a SequenceFile containing objects serialized by Kryo, 71 | * with NullWritable keys and BytesWritable values. 72 | * @param rdd Spark RDD 73 | * @param path String path to existing file. Can be on local file system or HDFS, S3 etc. See Spark docs. 74 | */ 75 | def saveRDD(rdd: RDD[_ <: AnyRef], path: String) 76 | { 77 | rdd.mapPartitions(iter => iter.grouped(50).map(_.toArray)) 78 | .map(x => (NullWritable.get(), new BytesWritable(serialize(x)))).saveAsSequenceFile(path) 79 | } 80 | 81 | /** 82 | * Saves an RDD as a SequenceFile containing objects serialized by Kryo, 83 | * with NullWritable keys and BytesWritable values. 84 | * @param rdd Spark RDD 85 | * @param path String path to existing file. Can be on local file system or HDFS, S3 etc. See Spark docs. 86 | */ 87 | def saveRDD(rdd: RDD[_ <: AnyRef], path: Path) 88 | { 89 | rdd.mapPartitions(iter => iter.grouped(50).map(_.toArray)) 90 | .map(x => (NullWritable.get(), new BytesWritable(serialize(x)))).saveAsSequenceFile(path.toString) 91 | } 92 | 93 | // TODO: Add unit tests with code similar to: 94 | // /** 95 | // * Temporary method to test if serialization-deserialization works properly 96 | // */ 97 | // def testSerDe(rdd: RDD[_ <: AnyRef], path: String) { 98 | // val serialized = rdd.map(x => (NullWritable.get(), new BytesWritable(serialize(x)))) 99 | // serialized.saveAsSequenceFile(path) 100 | // 101 | // val deserialized : RDD[_ <: AnyRef] = serialized.values.map(x => { 102 | // deserialize(x.getBytes, classOf[WikiPage]).asInstanceOf[WikiPage] 103 | // }) 104 | // 105 | // //Assertions below to test if (de)serialization works properly. 106 | // assert(deserialized.first().toString == rdd.first().toString) 107 | // assert(deserialized.count() == rdd.count()) 108 | // } 109 | // 110 | // /** 111 | // * Temporary method to test if saveAsKryoFile() and openFromKryoFile() work consistently. 
112 | // */ 113 | // def testSaveOpen(sc: SparkContext, rdd: RDD[_ <: WikiPage], path: String) { 114 | // saveRDD(rdd, path) 115 | // val deserialized = loadRDD(sc, path) 116 | // 117 | // //Test to ensure we're saving as many WikiPages as we're retrieving after deserialization 118 | // assert(deserialized.count() == rdd.count()) 119 | // } 120 | 121 | /** 122 | * @param x Any object 123 | * @return serialized Array of Bytes 124 | */ 125 | def serialize(x: Any): Array[Byte] = 126 | { 127 | val stream = new ByteArrayOutputStream() 128 | val output = new Output(stream) 129 | getKryoInstance.writeObject(output, x) 130 | output.close() 131 | stream.toByteArray 132 | } 133 | 134 | /** 135 | * @param x Array of Bytes - serialized version of an object 136 | * @param c Class of the object 137 | * @return the object deserialized by Kryo 138 | */ 139 | def deserialize[T](x: Array[Byte], c: Class[T]) = 140 | { 141 | getKryoInstance.readObject(new Input(new ByteArrayInputStream(x)), c) 142 | } 143 | } -------------------------------------------------------------------------------- /extraction/src/main/scala/org/dbpedia/extraction/util/SparkUtils.scala: -------------------------------------------------------------------------------- 1 | package org.dbpedia.extraction.util 2 | 3 | import org.apache.spark.{Logging, SparkContext, SparkConf} 4 | import org.dbpedia.extraction.dump.extract.DistConfig 5 | import org.apache.log4j.{Logger, Level} 6 | import java.nio.file.{Paths, Files} 7 | import java.io.FileNotFoundException 8 | import scala.reflect.ClassTag 9 | import org.apache.spark.rdd.RDD 10 | import org.dbpedia.extraction.spark.serialize.KryoSerializationWrapper 11 | import org.apache.spark.ui.jobs.DBpediaJobProgressListener 12 | 13 | /** 14 | * Utility functions specific to Spark 15 | */ 16 | object SparkUtils 17 | { 18 | /** 19 | * Stores the SparkContext instance. 20 | */ 21 | private var sc: SparkContext = null 22 | 23 | /** 24 | * Set all loggers to the given log level. Returns a map of the value of every logger 25 | * @param level 26 | * @param loggers 27 | * @return 28 | */ 29 | def setLogLevels(level: org.apache.log4j.Level, loggers: TraversableOnce[String]) = 30 | { 31 | loggers.map 32 | { 33 | loggerName => 34 | val logger = Logger.getLogger(loggerName) 35 | val prevLevel = logger.getLevel() 36 | logger.setLevel(level) 37 | loggerName -> prevLevel 38 | }.toMap 39 | } 40 | 41 | /** 42 | * Sets log levels for Spark and its peripheral libraries to DistConfig.sparkLogLevel. 43 | */ 44 | def setSparkLogLevels(config: DistConfig) = 45 | { 46 | setLogLevels(config.sparkLogLevel, Seq("org.apache", "spark", "org.eclipse.jetty", "akka")) 47 | } 48 | 49 | /** 50 | * Creates and returns a new SparkContext taking configuration info from Config 51 | * @param config 52 | * @return 53 | */ 54 | def getSparkContext(config: DistConfig) = 55 | synchronized 56 | { 57 | if (sc == null) 58 | { 59 | val conf = new SparkConf().setMaster(config.sparkMaster).setAppName(config.sparkAppName) 60 | for ((property, value) <- config.sparkProperties) 61 | conf.set(property, value) 62 | conf.setSparkHome(config.sparkHome) 63 | val distJarName = if (Files.exists(Paths.get("target/extraction-4.1-SNAPSHOT.jar"))) 64 | { 65 | "target/extraction-4.1-SNAPSHOT.jar" 66 | } else if (Files.exists(Paths.get("extraction/target/extraction-4.1-SNAPSHOT.jar"))) 67 | { 68 | "extraction/target/extraction-4.1-SNAPSHOT.jar" 69 | } else 70 | { 71 | throw new FileNotFoundException("extraction-4.1-SNAPSHOT.jar cannot be found in extraction/target. 
Please run mvn install -Dmaven.test.skip=true to build JAR first.") 72 | } 73 | 74 | conf.setJars(List(distJarName)) 75 | conf.set("spark.serializer", "org.apache.spark.serializer.KryoSerializer") 76 | conf.set("spark.kryo.registrator", "org.dbpedia.extraction.spark.serialize.KryoExtractionRegistrator") 77 | conf.set("spark.kryoserializer.buffer.mb", "100") 78 | sc = new SparkContext(conf) 79 | // No logging is done upon omitting 'with Logging' - some package problem? 80 | setLogLevels(Level.INFO, Seq("org.apache.spark.ui.jobs.DBpediaJobProgressListener")) 81 | sc.addSparkListener(new DBpediaJobProgressListener(conf)) 82 | } 83 | sc 84 | } 85 | 86 | /** 87 | * Return an iterator that contains all of the elements in given RDD. 88 | * The iterator will consume as much memory as the largest partition in the RDD. 89 | * 90 | * @param rdd 91 | * @return iterator for rdd's elements 92 | */ 93 | def rddToLocalIterator[T: ClassTag](rdd: RDD[T]): Iterator[T] = 94 | { 95 | def collectPartition(p: Int): Array[T] = 96 | { 97 | sc.runJob(rdd, (iter: Iterator[T]) => iter.toArray, Seq(p), allowLocal = false).head 98 | } 99 | (0 until rdd.partitions.length).iterator.flatMap(i => collectPartition(i)) 100 | } 101 | 102 | /** 103 | * Returns the function object wrapped inside a KryoSerializationWrapper. 104 | * This is useful for having Kryo-serialization for Spark closures. 105 | * 106 | * @param function 107 | * @return 108 | */ 109 | def kryoWrapFunction[T, U](function: (T => U)): (T => U) = 110 | { 111 | def genMapper(kryoWrapper: KryoSerializationWrapper[(T => U)])(input: T): U = 112 | { 113 | kryoWrapper.value.apply(input) 114 | } 115 | 116 | genMapper(KryoSerializationWrapper(function)) _ 117 | } 118 | } 119 | -------------------------------------------------------------------------------- /extraction/src/test/resources/config.properties: -------------------------------------------------------------------------------- 1 | # download and extraction target dir 2 | # This can be a directory on HDFS or a local directory, depending on the Hadoop configuration files given in dist-config.properties 3 | base-dir=src/test/resources/data 4 | # Source file. If source file name ends with .gz or .bz2, it is unzipped on the fly. 5 | # Must exist in the directory xxwiki/yyyymmdd and have the prefix xxwiki-yyyymmdd- 6 | # where xx is the wiki code and yyyymmdd is the dump date. 7 | 8 | # default (prefer multistream bz2 files): 9 | source=pages-articles-multistream.xml.bz2 10 | 11 | # alternatives: 12 | # source=pages-articles.xml.gz 13 | # source=pages-articles.xml 14 | 15 | ###### Extract from part files ###### 16 | # 17 | # Please make sure that the regex actually matches the format used by ALL the wikis you are going to extract from!!!! 18 | # One that should work in all cases is 19 | # source=@pages-articles\\d*\\.xml(-p\\d+p\\d+)?\\.bz2 20 | # 21 | # NOTE: when using the above regex you should make sure you do not have part files AND regular dump files together 22 | # for the same wiki, e.g. frwiki-20131120-pages-articles1.xml.bz2 and frwiki-20131120-pages-articles.xml.bz2, as they 23 | # BOTH will match and that will result in duplicate output data 24 | # 25 | # Example: 26 | # enwiki => enwiki-latest-pages-articles1.xml-p000000010p000010000.bz2 hence @pages-articles\\d+\\.xml-p\\d+p\\d+\\.bz2 matches 27 | # frwiki => frwiki-latest-pages-articles1.xml.bz2 hence @pages-articles\\d+\\.xml\\.bz2 matches (the previous regex does not!) 28 | # commonswiki => it does not have part files! 
This is true for other wikis as well. 29 | # 30 | # source=@pages-articles\\d+\\.xml-p\\d+p\\d+\\.bz2 31 | 32 | # In case of multistream chunks 33 | # source=@pages-articles-multistream\\.xml\\.\\d+\\.bz2 34 | 35 | # use only directories that contain a 'download-complete' file? Default is false. 36 | require-download-complete=false 37 | 38 | # List of languages or article count ranges, e.g. 'en,de,fr' or '10000-20000' or '10000-', or '@mappings' 39 | languages=en 40 | # extractor class names starting with "." are prefixed by "org.dbpedia.extraction.mappings" 41 | 42 | extractors=.ArticleCategoriesExtractor,.ArticleTemplatesExtractor,.CategoryLabelExtractor,\ 43 | .ExternalLinksExtractor,.GeoExtractor,.InterLanguageLinksExtractor,.LabelExtractor,.PageIdExtractor,\ 44 | .PageLinksExtractor,.RedirectExtractor,.RevisionIdExtractor,.ProvenanceExtractor,.SkosCategoriesExtractor,\ 45 | .ArticlePageExtractor 46 | 47 | #extractors=.InfoboxExtractor 48 | 49 | # if ontology and mapping files are not given or do not exist, download info from mappings.dbpedia.org 50 | ontology=ontology.xml 51 | mappings=../mappings 52 | 53 | # Serialization URI policies and file formats. Quick guide: 54 | # uri-policy keys: uri, generic, xml-safe, reject-long 55 | # uri-policy position modifiers: -subjects, -predicates, -objects, -datatypes, -contexts 56 | # uri-policy values: comma-separated languages or '*' for all languages 57 | # format values: n-triples, n-quads, turtle-triples, turtle-quads, trix-triples, trix-quads 58 | # See http://git.io/DBpedia-serialization-format-properties for details. 59 | 60 | # For backwards compatibility, en uses generic URIs. All others use local IRIs. 61 | uri-policy.uri=uri:en; generic:en; xml-safe-predicates:*; reject-long:* 62 | uri-policy.iri=generic:en; xml-safe-predicates:*; reject-long:* 63 | 64 | # NT is unreadable anyway - might as well use URIs for en 65 | format.nt.gz=n-triples;uri-policy.uri 66 | format.nq.gz=n-quads;uri-policy.uri 67 | 68 | # Turtle is much more readable - use nice IRIs for all languages 69 | format.ttl.gz=turtle-triples;uri-policy.iri 70 | format.tql.gz=turtle-quads;uri-policy.iri 71 | -------------------------------------------------------------------------------- /extraction/src/test/resources/data/enwiki/20160407/enwiki-20160407-pages-articles-multistream.xml.bz2: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dbpedia/distributed-extraction-framework/ad039712889000a085dd3d0ab66a15ddde99573d/extraction/src/test/resources/data/enwiki/20160407/enwiki-20160407-pages-articles-multistream.xml.bz2 -------------------------------------------------------------------------------- /extraction/src/test/resources/dist-config.properties: -------------------------------------------------------------------------------- 1 | # The SPARK_HOME environment variable should be set to this, Spark's location 2 | spark-home=/home/user/engine/spark-0.9.1-bin-hadoop2/ 3 | 4 | # Paths to the Hadoop configuration files, if any. These are needed for HDFS. 5 | # hadoop-coresite-xml-path=/home/user/engine/hadoop-2.2.0/etc/hadoop/core-site.xml 6 | # hadoop-hdfssite-xml-path=/home/user/engine/hadoop-2.2.0/etc/hadoop/hdfs-site.xml 7 | # hadoop-mapredsite-xml-path=/home/user/engine/hadoop-2.2.0/etc/hadoop/mapred-site.xml 8 | 9 | # Refer to README.md for advice 10 | spark.executor.memory=2500m 11 | 12 | # Replace local[8] with something like spark://192.168.0.100 to go into distributed mode. 
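# For example, assuming the standalone Spark master runs on 192.168.0.100 and listens on the default port 7077:
# spark-master=spark://192.168.0.100:7077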
13 | spark-master=local[8] 14 | 15 | # When running on a distributed cluster, it is essential that you set spark.cores.max to N * M 16 | # where N = total no. of slave machines, M = SPARK_WORKER_INSTANCES (from spark-env.sh) 17 | # This is to ensure that Spark uses as many cores (over the entire cluster) as many workers there are. 18 | spark.cores.max=8 19 | 20 | # You can add more spark.* variables here. All variables starting with spark. will be provided to SparkConf 21 | 22 | # This is used for setting log levels for "org.apache", "spark", "org.eclipse.jetty" and "akka" using 23 | # SparkUtils.setLogLevels(). It is WARN by default to prevent excessive logging from Spark. 24 | # It is a good idea to set it to INFO while debugging/testing out the framework. 25 | # Refer to org.apache.log4j.Level for more details 26 | # logging-level=INFO 27 | 28 | # WARNING: If base-dir is set here, the base-dir in config.properties (the original DBpedia extraction configuration) is ignored. 29 | # base-dir=/data 30 | 31 | # Please refer to the source code for org.dbpedia.extraction.dump.extract.DistConfig for the complete set of configuration variables 32 | # TODO: Add info on all configuration variables here. -------------------------------------------------------------------------------- /extraction/src/test/scala/org/dbpedia/extraction/mappings/DistRedirectsTest.scala: -------------------------------------------------------------------------------- 1 | package org.dbpedia.extraction.mappings 2 | 3 | import org.junit.Assert._ 4 | import org.dbpedia.extraction.sources.{Source, XMLSource, WikiPage} 5 | import org.apache.spark.rdd.RDD 6 | import org.dbpedia.extraction.util._ 7 | import java.io.File 8 | import org.dbpedia.extraction.wikiparser.Namespace 9 | import org.dbpedia.extraction.dump.extract.{Config, DistConfig} 10 | import org.dbpedia.extraction.dump.download.Download 11 | import org.dbpedia.extraction.util.RichFile.wrapFile 12 | import org.scalatest.FunSuite 13 | import org.scalatest.junit.JUnitRunner 14 | import org.junit.runner.RunWith 15 | import org.apache.hadoop.conf.Configuration 16 | import org.apache.hadoop.fs.Path 17 | import org.dbpedia.extraction.util.RichHadoopPath.wrapPath 18 | 19 | /** 20 | * Unit Test for the DistRedirects class. 21 | * 22 | * This test expects a DBpedia extraction configuration properties file named "config.properties" and a distributed 23 | * framework configuration file named "dist-config.properties" to be present at the test/resources directory. 24 | * 25 | * It's better to use a small data dump like the liwiki dump to run the test. 26 | * 27 | * TODO: Add some wiki dump content to test upon rather than rely upon an external wiki dump file and config files. 
28 | */ 29 | @RunWith(classOf[JUnitRunner]) 30 | class DistRedirectsTest extends FunSuite 31 | { 32 | val CONFIG_FILE = "config.properties" 33 | val SPARK_CONFIG_FILE = "dist-config.properties" 34 | 35 | // Fixtures shared between all tests in this class 36 | val (distConfig: DistConfig, 37 | articleSource: Source, 38 | rdd: RDD[WikiPage], 39 | language: Language, 40 | date: String, 41 | distFinder: Finder[Path]) = try 42 | { 43 | val configFileResource = getClass.getClassLoader.getResource(CONFIG_FILE) 44 | val sparkConfigFileResource = getClass.getClassLoader.getResource(SPARK_CONFIG_FILE) 45 | 46 | //Check if the wiki-pages file and config.properties file are present 47 | assertNotNull("Test file %s missing from distributed/src/test/resources".format(CONFIG_FILE), configFileResource) 48 | assertNotNull("Test file %s missing from distributed/src/test/resources".format(SPARK_CONFIG_FILE), sparkConfigFileResource) 49 | 50 | val configProperties = ConfigUtils.loadConfig(configFileResource.toURI.getPath, "UTF-8") 51 | val distConfigProperties = ConfigUtils.loadConfig(sparkConfigFileResource.toURI.getPath, "UTF-8") 52 | val config = new Config(configProperties) 53 | val distConfig = new DistConfig(distConfigProperties, configProperties, configFileResource.toURI) 54 | implicit val hadoopConfiguration = distConfig.hadoopConf 55 | val lang = config.extractorClasses.iterator.next()._1 56 | 57 | val localFinder = new Finder[File](config.dumpDir, lang, config.wikiName) 58 | val distFinder = new Finder[Path](distConfig.dumpDir.get, lang, config.wikiName) 59 | val date = latestDate(config, localFinder) 60 | 61 | // Get the readers for the test dump files 62 | val articlesReaders = files(config.source, localFinder, date).map(x => () => IOUtils.reader(x)) 63 | 64 | // Get the article source for Redirects to load from 65 | val articleSource = XMLSource.fromReaders(articlesReaders, lang, 66 | title => title.namespace == Namespace.Main || title.namespace == Namespace.File || 67 | title.namespace == Namespace.Category || title.namespace == Namespace.Template) 68 | 69 | SparkUtils.setSparkLogLevels(distConfig) 70 | val sc = SparkUtils.getSparkContext(distConfig) 71 | // Generate RDD from the article source for DistRedirects to load from in parallel 72 | // Naively calls toArray on Seq, only for testing 73 | val rdd = sc.parallelize(articleSource.toSeq, 8) 74 | (distConfig, articleSource, rdd, lang, date, distFinder) 75 | } catch{ case ex:Exception => ex.printStackTrace(); (null, null,null, null,null, null)} 76 | 77 | implicit def hadoopConfiguration: Configuration = distConfig.hadoopConf 78 | 79 | test("Verify DistRedirects.loadFromRDD output") 80 | { 81 | val distRedirects = DistRedirects.loadFromRDD(rdd, language) 82 | val redirects = Redirects.loadFromSource(articleSource, language) 83 | assertEquals("Testing DistRedirects.loadFromRDD failed!", redirects.map, distRedirects.map) 84 | } 85 | 86 | test("Verify DistRedirects.load output") 87 | { 88 | val cache = distFinder.file(date, "template-redirects.obj") 89 | var distRedirects = DistRedirects.load(rdd, cache, language) 90 | var redirects = Redirects.loadFromSource(articleSource, language) 91 | assertEquals("Testing DistRedirects.loadFromRDD failed!", redirects.map, distRedirects.map) 92 | 93 | // Try again so that cache gets used 94 | distRedirects = DistRedirects.load(rdd, cache, language) 95 | redirects = Redirects.loadFromSource(articleSource, language) 96 | assertEquals("Testing DistRedirects.loadFromRDD failed!", redirects.map, distRedirects.map) 
97 | } 98 | 99 | // Taken from org.dbpedia.extraction.dump.extract.Config 100 | def latestDate(config: Config, finder: Finder[_]): String = 101 | { 102 | val isSourceRegex = config.source.startsWith("@") 103 | val source = if (isSourceRegex) config.source.substring(1) else config.source 104 | val fileName = if (config.requireComplete) Download.Complete else source 105 | finder.dates(fileName, isSuffixRegex = isSourceRegex).last 106 | } 107 | 108 | // Taken from org.dbpedia.extraction.dump.extract.Config 109 | def files(source: String, finder: Finder[File], date: String): List[File] = 110 | { 111 | 112 | val files = if (source.startsWith("@")) 113 | { 114 | // the articles source is a regex - we want to match multiple files 115 | finder.matchFiles(date, source.substring(1)) 116 | } else List(finder.file(date, source)) 117 | 118 | files 119 | } 120 | } -------------------------------------------------------------------------------- /extraction/src/test/scala/org/dbpedia/extraction/spark/io/QuadSeqWritableTest.scala: -------------------------------------------------------------------------------- 1 | package org.dbpedia.extraction.spark.io 2 | 3 | import org.junit.runner.RunWith 4 | import org.scalatest.junit.JUnitRunner 5 | import org.scalatest.FunSuiteLike 6 | import org.dbpedia.extraction.destinations.Quad 7 | import scala.util.Random 8 | import org.junit.Assert._ 9 | 10 | @RunWith(classOf[JUnitRunner]) 11 | class QuadSeqWritableTest extends WritableTest[QuadSeqWritable] with FunSuiteLike 12 | { 13 | test("Verify that serialization-deserialization works properly") 14 | { 15 | // Create random List[Quad] 16 | val sampleQuads = for (i <- (0 until 100).toList) yield new Quad(Random.nextString(10), 17 | Random.nextString(10), 18 | Random.nextString(10), 19 | Random.nextString(10), 20 | Random.nextString(10), 21 | Random.nextString(10), 22 | Random.nextString(10)) 23 | 24 | val writable1 = new QuadSeqWritable(sampleQuads) 25 | val writable2 = new QuadSeqWritable() 26 | 27 | performReadWriteRoundTrip(writable1, writable2) 28 | assertEquals(writable1.get, writable2.get) 29 | } 30 | } 31 | -------------------------------------------------------------------------------- /extraction/src/test/scala/org/dbpedia/extraction/spark/io/WikiPageWritableTest.scala: -------------------------------------------------------------------------------- 1 | package org.dbpedia.extraction.spark.io 2 | 3 | import org.junit.Assert._ 4 | import org.junit.runner.RunWith 5 | import org.scalatest.junit.JUnitRunner 6 | import org.scalatest.FunSuiteLike 7 | import org.dbpedia.extraction.sources.XMLSource 8 | import scala.xml.XML 9 | import org.dbpedia.extraction.util.Language 10 | 11 | @RunWith(classOf[JUnitRunner]) 12 | class WikiPageWritableTest extends WritableTest[WikiPageWritable] with FunSuiteLike 13 | { 14 | test("Verify that serialization-deserialization works properly") 15 | { 16 | val samplePage = 17 | """ 18 | | 19 | | Lèmburg 20 | | 0 21 | | 13 22 | | 23 | | 24 | | 196988 25 | | 5980 26 | | 2010-01-25T20:24:26Z 27 | | 28 | | PahlesBot 29 | | 458 30 | | 31 | | 32 | | Bot: automatisch tekst vervangen (-#redirect +#REDIRECT) 33 | | #REDIRECT [[Limburg]] 34 | | 2uewphqvpum37i9d7g5okf5c3m643c7 35 | | wikitext 36 | | text/x-wiki 37 | | 38 | | 39 | """.stripMargin 40 | 41 | val wikiPage = XMLSource.fromXML(XML.loadString("" + samplePage + ""), Language("li")).head 42 | val writable1 = new WikiPageWritable(wikiPage) 43 | val writable2 = new WikiPageWritable() 44 | 45 | performReadWriteRoundTrip(writable1, writable2) 46 | 
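// Compare string representations to check that the round trip preserved the page's contents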
assertEquals(writable1.get.toString, writable2.get.toString) 47 | } 48 | } 49 | -------------------------------------------------------------------------------- /extraction/src/test/scala/org/dbpedia/extraction/spark/io/WritableTest.scala: -------------------------------------------------------------------------------- 1 | package org.dbpedia.extraction.spark.io 2 | 3 | import org.apache.hadoop.io.Writable 4 | import java.io.{ByteArrayInputStream, DataInputStream, DataOutputStream, ByteArrayOutputStream} 5 | 6 | abstract class WritableTest[T <: Writable] 7 | { 8 | /** 9 | * Utility method that takes two Writables as parameters, writes the first Writable to a byte 10 | * array and reads it back into the second Writable. 11 | * 12 | * @param oldWritable Writable to be serialized and deserialized again 13 | * @param newWritable Writable that oldWritable is deserialized into after serialization. 14 | */ 15 | def performReadWriteRoundTrip(oldWritable: T, newWritable: T) = 16 | { 17 | val bos = new ByteArrayOutputStream 18 | val dos = new DataOutputStream(bos) 19 | oldWritable.write(dos) 20 | newWritable.readFields(new DataInputStream(new ByteArrayInputStream(bos.toByteArray))) 21 | } 22 | } 23 | -------------------------------------------------------------------------------- /gce/README.md: -------------------------------------------------------------------------------- 1 | Spark GCE 2 | ========= 3 | 4 | Spark GCE is like Spark EC2, but for those who run their cluster on Google Cloud. 5 | 6 | - Make sure you have installed and authenticated gcutils on the machine where you run this script. 7 | - Helps you launch a Spark cluster in the Google Cloud 8 | - Attaches a 100GB empty disk to all nodes in the cluster 9 | - Installs and configures Spark and HDFS automatically 10 | - Starts the Shark server automatically 11 | 12 | Spark GCE is a Python script that helps you launch a Spark cluster in the Google Cloud, much like the spark_ec2 script does for AWS. 13 | 14 | Usage 15 | ----- 16 | 17 | > ***spark_gce.py project-name number-of-slaves slave-type master-type identity-file zone cluster-name spark-mem workers-per-node cores-per-worker local-log-dir*** 18 | > 19 | >> 20 | >> - **project-name**: Name of the project where you are going to launch your Spark cluster. 21 | >> 22 | >> - **number-of-slaves**: Number of slaves that you want to launch. 23 | >> 24 | >> - **slave-type**: Instance type for the slave machines. 25 | >> 26 | >> - **master-type**: Instance type for the master node. 27 | >> 28 | >> - **identity-file**: Identity file to authenticate with your GCE instances. It usually resides at *~/.ssh/google_compute_engine* once you authenticate using gcutils. 29 | >> 30 | >> - **zone**: Specify the zone where you are going to launch the cluster. 31 | >> 32 | >> - **cluster-name**: Name of the cluster that you are going to launch. 33 | >> 34 | >> - **spark-mem**: Amount of memory per Spark worker (as a JVM memory string, e.g. 2500m, 2g) 35 | >> 36 | >> - **workers-per-node**: Number of workers to run on each slave node 37 | >> 38 | >> - **cores-per-worker**: Number of cores each worker should use (optional, 1 by default) 39 | >> 40 | >> - **local-log-dir**: A local directory to download nmon logs from all the nodes into (optional; empty by default, i.e. no logging) 41 | >> 42 | > 43 | > ***spark_gce.py project-name cluster-name [identity-file local-log-dir] destroy*** 44 | > 45 | >> - **project-name**: Name of the project where the Spark cluster is running.
46 | >> - **cluster-name**: Name of the cluster that you are going to destroy. 47 | >> - **NOTE**: If you had specified a local-log-dir while starting the cluster, provide it here too, along with the identity-file, else skip both. 48 | 49 | 50 | Installation 51 | -------------- 52 | 53 | ```sh 54 | git clone git@github.com:dbpedia/distributed-extraction-framework.git 55 | cd gce 56 | python spark_gce.py 57 | ``` 58 | -------------------------------------------------------------------------------- /install-run: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Shortcut for 4 | # mvn -f ../pom.xml install && mvn scala:run -Dlauncher=... -DaddArgs=... 5 | # Must be called with one of the modules (core/, dump/, ...) as current directory. 6 | # Example: 7 | # extraction_framework/core> ../install-run LAUNCHER ARG1 ARG2 ARG3 8 | # is equivalent to 9 | # extraction_framework/core> mvn -f ../pom.xml install && mvn scala:run "-Dlauncher=LAUNCHER" "-DaddArgs=ARG1|ARG2|ARG3" 10 | 11 | # if we're not on a terminal, use batch mode to avoid ugly log files 12 | [ ! -t 1 ] && BATCH="-B" 13 | mvn $BATCH -f ../pom.xml install && . ../run "$@" 14 | -------------------------------------------------------------------------------- /pom.xml: -------------------------------------------------------------------------------- 1 | 2 | 4 | 5 | 4.0.0 6 | 7 | org.dbpedia 8 | distributed-extraction 9 | pom 10 | 4.1-SNAPSHOT 11 | Parent POM of the DBpedia Distributed Extraction Framework 12 | 13 | 14 | UTF-8 15 | 1.7 16 | 2.11.4 17 | 2.2.0 18 | 1.3.0 19 | 2.11 20 | 2.2.4 21 | -Xmx1024m 22 | 23 | 24 | 25 | 26 | extraction 27 | download 28 | common 29 | 30 | 31 | 32 | 33 | 34 | 35 | net.alchim31.maven 36 | scala-maven-plugin 37 | 3.1.6 38 | 39 | 40 | org.apache.maven.plugins 41 | maven-compiler-plugin 42 | 3.1 43 | 44 | 45 | 46 | 47 | 48 | 49 | 53 | 54 | 55 | net.alchim31.maven 56 | scala-maven-plugin 57 | 58 | 59 | 60 | -unchecked 61 | -deprecation 62 | -feature 63 | 64 | 65 | ${scala.compiler.Xmx} 66 | 67 | 68 | 69 | 70 | 71 | compile 72 | 73 | compile 74 | 75 | compile 76 | 77 | 78 | 79 | test-compile 80 | 81 | testCompile 82 | 83 | test-compile 84 | 85 | 86 | 87 | process-resources 88 | 89 | compile 90 | 91 | process-resources 92 | 93 | 94 | 95 | 96 | 97 | 98 | org.apache.maven.plugins 99 | maven-compiler-plugin 100 | 101 | ${java.version} 102 | ${java.version} 103 | 104 | 105 | 106 | 107 | maven-enforcer-plugin 108 | 1.3.1 109 | 110 | 111 | 112 | enforce 113 | 114 | 115 | 116 | 117 | ${java.version} 118 | 119 | 120 | 121 | 122 | 123 | 124 | 125 | 126 | 127 | 128 | 129 | 130 | 131 | org.scala-lang 132 | scala-library 133 | ${scala.version} 134 | 135 | 136 | 137 | 138 | org.scala-lang 139 | scala-actors 140 | ${scala.version} 141 | 142 | 143 | 144 | 145 | org.scala-lang 146 | scala-reflect 147 | ${scala.version} 148 | 149 | 150 | 151 | 152 | org.scalatest 153 | scalatest_${scalatest.scala.version} 154 | ${scalatest.version} 155 | test 156 | 157 | 158 | 159 | 160 | 161 | 162 | 163 | 164 | 165 | 166 | 167 | 168 | 169 | 170 | 171 | 172 | 173 | 174 | 175 | incremental 176 | 177 | 178 | 179 | net.alchim31.maven 180 | scala-maven-plugin 181 | 182 | 183 | 184 | incremental 185 | 188 | true 189 | 190 | 191 | 192 | 193 | 194 | 195 | 196 | -------------------------------------------------------------------------------- /run: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Shortcut for mvn scala:run -Dlauncher=... 
-DaddArgs=... 4 | # Must be called with one of the modules (core/, dump/, ...) as current directory. 5 | # Example: 6 | # extraction_framework/core> ../run LAUNCHER ARG1 ARG2 ARG3 7 | # is equivalent to 8 | # extraction_framework/core> mvn scala:run "-Dlauncher=LAUNCHER" "-DaddArgs=ARG1|ARG2|ARG3" 9 | 10 | LAUNCHER="$1" 11 | 12 | ADD_ARGS="$2" 13 | for ARG in ${@:3} 14 | do 15 | ADD_ARGS="$ADD_ARGS|$ARG" 16 | done 17 | 18 | # export MAVEN_OPTS='-Xmx4096M -XX:MaxPermSize=1024M -XX:+HeapDumpOnOutOfMemoryError -XX:+PrintGC -XX:+PrintGCTimeStamps' 19 | # export MAVEN_DEBUG='-X -e' 20 | 21 | # if we're not on a terminal, use batch mode to avoid ugly log files 22 | [ ! -t 1 ] && BATCH="-B" 23 | mvn $MAVEN_DEBUG $BATCH scala:run "-Dlauncher=$LAUNCHER" "-DaddArgs=$ADD_ARGS" 24 | -------------------------------------------------------------------------------- /run-extraction-test: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Performs both normal sequential extraction and distributed extraction and compares the outputs. 4 | 5 | CONFIG_FILE="$1" 6 | SPARK_CONF_FILE="$2" 7 | 8 | echo "====================================================================" 9 | echo "Running sequential extraction" 10 | echo "====================================================================" 11 | ./run seq-extraction $CONFIG_FILE 12 | mkdir /tmp/dbpedia-test-seq-extraction 13 | mv `grep base-dir $CONFIG_FILE | sed -ne 's/^base-dir=//p'`/*wiki/*/*wiki*.gz /tmp/dbpedia-test-seq-extraction 14 | 15 | echo "====================================================================" 16 | echo "Running distributed extraction" 17 | echo "====================================================================" 18 | ./run extraction $CONFIG_FILE $SPARK_CONF_FILE 19 | mkdir /tmp/dbpedia-test-par-extraction 20 | cp -rf `grep base-dir $CONFIG_FILE | sed -ne 's/^base-dir=//p'`/*wiki/*/*wiki*.gz /tmp/dbpedia-test-par-extraction/ 21 | 22 | echo "====================================================================" 23 | echo "Computing diffs:" 24 | echo "====================================================================" 25 | diffs=`diff <(gzip -dc /tmp/dbpedia-test-seq-extraction/*.gz | grep -v "^#" | sort) <(gzip -dc /tmp/dbpedia-test-par-extraction/*wiki*.gz/part*.gz | grep -v "^#" | sort)` 26 | if [ -z "$diffs" ]; then 27 | echo "Outputs match!" 28 | else 29 | echo $diffs 30 | fi 31 | 32 | 33 | rm -rf /tmp/dbpedia-test-???-extraction 34 | --------------------------------------------------------------------------------