├── .gitignore
├── .travis.yml
├── README.md
├── clean-install-run
├── common
├── pom.xml
└── src
│ └── main
│ └── scala
│ └── org
│ └── dbpedia
│ └── extraction
│ └── util
│ ├── HadoopConfigurable.scala
│ └── RichHadoopPath.scala
├── download
├── pom.xml
└── src
│ ├── main
│ ├── resources
│ │ ├── application.conf
│ │ └── reference.conf
│ └── scala
│ │ └── org
│ │ └── dbpedia
│ │ └── extraction
│ │ ├── dump
│ │ └── download
│ │ │ ├── ActoredCounter.scala
│ │ │ ├── DistDownload.scala
│ │ │ ├── DistDownloadConfig.scala
│ │ │ ├── DumpFileSource.scala
│ │ │ └── actors
│ │ │ ├── DownloadClient.scala
│ │ │ ├── DownloadJobRunner.scala
│ │ │ ├── DownloadProgressTracker.scala
│ │ │ ├── DownloadResultConsumer.scala
│ │ │ ├── Master.scala
│ │ │ ├── Worker.scala
│ │ │ └── message
│ │ │ ├── DownloadJob.scala
│ │ │ ├── DownloaderProgressMessage.scala
│ │ │ ├── GeneralMessage.scala
│ │ │ ├── MasterWorkerMessage.scala
│ │ │ └── WorkerProgressMessage.scala
│ │ └── util
│ │ └── RemoteExecute.scala
│ └── test
│ └── resources
│ ├── dist-download.properties
│ └── download.properties
├── extraction
├── pom.xml
└── src
│ ├── main
│ └── scala
│ │ └── org
│ │ ├── apache
│ │ └── spark
│ │ │ └── ui
│ │ │ └── jobs
│ │ │ └── DBpediaJobProgressListener.scala
│ │ └── dbpedia
│ │ └── extraction
│ │ ├── destinations
│ │ ├── DistDeduplicatingWriterDestination.scala
│ │ ├── DistDestination.scala
│ │ ├── DistMarkerDestination.scala
│ │ └── DistWrapperDestination.scala
│ │ ├── dump
│ │ └── extract
│ │ │ ├── DistConfig.scala
│ │ │ ├── DistConfigLoader.scala
│ │ │ ├── DistExtraction.scala
│ │ │ ├── DistExtractionJob.scala
│ │ │ └── DumpExtractionContextWrapper.scala
│ │ ├── mappings
│ │ ├── DistDisambiguations.scala
│ │ └── DistRedirects.scala
│ │ ├── spark
│ │ ├── io
│ │ │ ├── QuadSeqWritable.scala
│ │ │ ├── WikiPageWritable.scala
│ │ │ ├── input
│ │ │ │ ├── ByteMatcher.scala
│ │ │ │ ├── DBpediaWikiPageInputFormat.scala
│ │ │ │ └── SeekableInputStream.scala
│ │ │ └── output
│ │ │ │ ├── DBpediaCompositeOutputFormat.scala
│ │ │ │ ├── DBpediaDatasetOutputFormat.scala
│ │ │ │ └── MultipleTextOutputFormat.scala
│ │ └── serialize
│ │ │ ├── KryoExtractionRegistrator.scala
│ │ │ ├── KryoSerializationWrapper.scala
│ │ │ ├── KryoSerializer.scala
│ │ │ ├── LanguageSerializer.scala
│ │ │ ├── LocaleSerializer.scala
│ │ │ ├── LoggerSerializer.scala
│ │ │ ├── ParserUtilsSerializer.scala
│ │ │ ├── WikiPageSerializer.scala
│ │ │ └── WikiTitleSerializer.scala
│ │ └── util
│ │ ├── DistIOUtils.scala
│ │ └── SparkUtils.scala
│ └── test
│ ├── resources
│ ├── config.properties
│ ├── data
│ │ └── enwiki
│ │ │ └── 20160407
│ │ │ └── enwiki-20160407-pages-articles-multistream.xml.bz2
│ └── dist-config.properties
│ └── scala
│ └── org
│ └── dbpedia
│ └── extraction
│ ├── mappings
│ └── DistRedirectsTest.scala
│ └── spark
│ └── io
│ ├── QuadSeqWritableTest.scala
│ ├── WikiPageWritableTest.scala
│ └── WritableTest.scala
├── gce
├── README.md
└── spark_gce.py
├── install-run
├── ontology.owl
├── ontology.xml
├── pom.xml
├── run
└── run-extraction-test
/.gitignore:
--------------------------------------------------------------------------------
1 | target/
2 | .idea/
3 | *~
4 | *.iml
5 | .cache
6 | *.log
7 | *.lck
8 | *.tmp
9 | java_pid*
10 |
--------------------------------------------------------------------------------
/.travis.yml:
--------------------------------------------------------------------------------
1 | sudo: false
2 | language: scala
3 | jdk:
4 | - oraclejdk8
5 | - oraclejdk7
6 | - openjdk7
7 | # branches:
8 | # only:
9 | # - master
10 | before_install:
11 | - sed -i.bak -e 's|https://nexus.codehaus.org/snapshots/|https://oss.sonatype.org/content/repositories/codehaus-snapshots/|g' ~/.m2/settings.xml
12 | script: "mvn test"
13 | notifications:
14 | email:
15 | recipients:
16 | - riteshoneinamillion@gmail.com
17 | on_success: change
18 | on_failure: change
19 | cache:
20 | directories:
21 | - $HOME/.m2
22 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | DBpedia Distributed Extraction Framework
2 | ==================================
3 |
4 | ## We are looking for maintainer(s) for this project
5 |
6 | If you want to join, express your interest!
7 |
8 | ## Description
9 |
10 | This is the distributed version of the [DBpedia Information Extraction Framework](https://github.com/dbpedia/extraction-framework/). It uses [Apache Spark](http://spark.apache.org) to extract structured data from Wikipedia in a parallel, distributed manner.
11 |
12 | This is currently a work-in-progress, and the instructions are mostly intended for developers.
13 |
14 | ## Requirements
15 | * Java 7
16 | * Maven 3
17 | * Apache Spark 0.9.1 built with Apache Hadoop 2.2.0
18 |
19 | ## Setup Apache Spark
20 |
21 | ```bash
22 | $ wget http://d3kbcqa49mib13.cloudfront.net/spark-0.9.1-bin-hadoop2.tgz
23 | $ tar xzf spark-0.9.1-bin-hadoop2.tgz
24 | $ cd spark-0.9.1-bin-hadoop2
25 | $ SCALA_HOME=/usr/share/java MAVEN_OPTS="-Xmx2g -XX:MaxPermSize=512M -XX:ReservedCodeCacheSize=512m" mvn -Dhadoop.version=2.2.0 -Dprotobuf.version=2.5.0 -DskipTests clean package
26 | ```
27 |
28 | Set SCALA_HOME according to your machine's settings. Maven needs enough memory (hence the MAVEN_OPTS above) to compile Spark successfully.
29 |
30 | After downloading Spark to all nodes, add the hostnames of your slave nodes to conf/slaves (one hostname per line; see the example below). Running on a cluster also requires some attendant configuration: ensure the firewall allows traffic on the required ports, set up passwordless SSH between the master and the slave nodes, set up HDFS and format the NameNode, etc. Usually, in a cluster of N nodes you would run the Spark Master and Hadoop's NameNode on one node and the Spark Workers and Hadoop DataNodes on the remaining N-1 nodes.
31 |
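For example, a minimal conf/slaves file for three slave nodes just lists their hostnames, one per line (the hostnames below are placeholders):

```
slave1.example.org
slave2.example.org
slave3.example.org
```
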
32 | Here's a sample `spark-env.sh` for a cluster where the slaves have 4 cores and 15G RAM each:
33 | ```bash
34 | export SCALA_HOME=/usr/share/java
35 | export SPARK_MEM=2500m
36 | export SPARK_WORKER_CORES=1
37 | export SPARK_WORKER_INSTANCES=4
38 | SPARK_JAVA_OPTS+=" -Dspark.local.dir=/mnt/spark"
39 | export SPARK_JAVA_OPTS
40 | export SPARK_MASTER_IP=192.168.0.100
41 | export JAVA_HOME=/usr/lib/jvm/java-1.7.0-openjdk-1.7.0.65.x86_64
42 | ```
43 |
44 | **Important**: Note that we have set cores (threads) per worker to 1 and set the number of workers equal to the number of cores on the machine. This is because:
45 | * The implementation that Hadoop uses to decode bzip2 files - `CBZip2InputStream` - is not thread-safe (there's a JIRA for that: https://issues.apache.org/jira/browse/HADOOP-10614). This means that allotting multiple threads to a single worker while using .bz2 input files will cause the jobs to fail.
46 | * Running multiple smaller JVMs rather than a single huge JVM often improves performance.
47 |
48 | In our tests, setting `spark.executor.memory` to 2500m-3000m works well with the above sample configuration. It is set in the sample dist-config.properties file discussed in the next section.
49 |
50 | Finally, start the cluster:
51 |
52 | ```
53 | sbin/start-all.sh
54 | ```
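If the cluster came up correctly, the master node should now be running a Spark `Master` process and each slave node a `Worker` process. A quick sanity check, assuming a standard Spark standalone setup (the process names and web UI port below are the usual Spark defaults, not something specific to this framework):

```bash
$ jps                          # look for a "Master" process here, and "Worker" processes on the slaves
$ curl -s -o /dev/null -w "%{http_code}\n" http://localhost:8080   # Spark master web UI
```
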
55 |
56 | We have added a script for setting up Spark and Hadoop on Google Compute Engine with the optimal settings for this framework. You can find it in the **gce** directory.
57 |
58 | Please refer to the [Spark official docs](http://spark.apache.org/docs/0.9.1/spark-standalone.html) for details on how to deploy Spark in standalone mode.
59 |
60 | ## How to Build
61 |
62 | Clone the latest version of the repo and switch to the stage branch:
63 |
64 | $ git clone https://github.com/dbpedia/distributed-extraction-framework.git
65 | $ cd distributed-extraction-framework
66 | $ mvn clean install -Dmaven.test.skip=true # Compiles the code without running tests
67 |
68 | ## Dump-based Distributed Extraction
69 |
70 | Follow the instructions given below to download the data needed for the extractions you want to perform. An example download.properties file is provided at `download/src/test/resources/download.properties`.
71 |
72 | In the root directory, run the following commands:
73 |
74 | $ mvn clean install -Dmaven.test.skip=true # Compiles the code without running tests
75 | $ ./run download config=download.properties # Downloads the wikipedia dumps
76 |
77 | **Points to keep in mind:**
78 |
79 | 1. Before performing extractions you will need a config.properties file for general extraction configuration and a dist-config.properties file for the distributed framework specific configuration (Spark, Hadoop, logging etc.). Examples are given at `extraction/src/test/resources/`.
80 |
81 | 2. The example `extraction/src/test/resources/dist-config.properties` file needs to be modified with a proper spark-home and spark-master (`local[N]` means N cores on the local node; change it to something like `spark://hostname:7077` to run in distributed mode).
82 |
83 | 3. Prefer pages-articles-multistream.bz2 files to pages-articles.bz2 because they are more efficient for parallel extraction. The former can be decompressed in parallel using Hadoop's splittable Bzip2Codec. Of course, this does not matter when using the pages-articlesX.xml-pXXXXXXXXXXpXXXXXXXXXX.bz2 files (which will be the files of choice for distributed downloads).
84 |
85 | 4. **Important:** Finally, when running on a distributed cluster, it is essential that you set `spark.cores.max` (in dist-config.properties) to **N** \* **M**, where N = total no. of slaves and M = `SPARK_WORKER_INSTANCES`. This ensures that Spark uses as many cores (across the entire cluster) as there are workers. See the sample fragment below.
86 |
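For illustration only, here is a sketch of the relevant dist-config.properties lines for a 4-slave cluster with `SPARK_WORKER_INSTANCES=4`; the keys are the ones mentioned above, and the values are assumptions you should adapt to your own cluster:

```
spark-home=/opt/spark-0.9.1-bin-hadoop2
spark-master=spark://192.168.0.100:7077
spark.executor.memory=2500m
spark.cores.max=16
```
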
87 | Now perform parallel extractions on your Spark cluster:
88 |
89 | $ ./run extraction extraction/src/test/resources/config.properties extraction/src/test/resources/dist-config.properties
90 |
91 |
92 | ### Testing
93 | Please see the [wiki page for Testing](https://github.com/dbpedia/distributed-extraction-framework/wiki/Testing) for detailed instructions on how to verify the outputs of the distributed extraction framework by comparing them with those of the original framework.
94 |
95 | ## Distributed Downloads
96 |
97 | This is still a work in progress and there are some issues that need to be solved.
98 |
99 | Have a look at `download/src/test/resources/dist-download.properties` and `download/src/test/resources/download.properties`. You can create your own config files based on them. Just make sure that they are present at the same path on all nodes of the cluster.
100 |
101 | After cloning and building the framework on the master node, rsync the local Maven repository and the framework directory to each slave node, and then launch the distributed download from the master:
102 | ```
103 | rsync -avhz --progress ~/.m2 $SLAVE:~/
104 | rsync -avhz --progress /path/to/distributed-extraction-framework $SLAVE:/path/to/
105 | ../run download distconfig=/path/to/distributed-extraction-framework/download/src/test/resources/dist-download.properties config=/path/to/distributed-extraction-framework/download/src/test/resources/download.properties
106 | ```
107 |
108 | You can find the worker logs at `/path/to/distributed-extraction-framework/logs` on each node.
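
To follow a worker's progress you can simply tail its log file, for example:

```bash
$ tail -f /path/to/distributed-extraction-framework/logs/*.out
```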
109 |
110 |
--------------------------------------------------------------------------------
/clean-install-run:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | # Shortcut for
4 | # mvn -f ../pom.xml clean install && mvn scala:run -Dlauncher=... -DaddArgs=...
5 | # Must be called with one of the modules (core/, dump/, ...) as current directory.
6 | # Example:
7 | # extraction_framework/core> ../clean-install-run LAUNCHER ARG1 ARG2 ARG3
8 | # is equivalent to
9 | # extraction_framework/core> mvn -f ../pom.xml clean install && mvn scala:run "-Dlauncher=LAUNCHER" "-DaddArgs=ARG1|ARG2|ARG3"
10 |
11 | # if we're not on a terminal, use batch mode to avoid ugly log files
12 | [ ! -t 1 ] && BATCH="-B"
13 | mvn $BATCH -f ../pom.xml clean && . ../install-run "$@"
14 |
--------------------------------------------------------------------------------
/common/pom.xml:
--------------------------------------------------------------------------------
1 |
2 |
5 | 4.0.0
6 |
7 |
8 | org.dbpedia
9 | distributed-extraction
10 | 4.1-SNAPSHOT
11 |
12 |
13 | org.dbpedia.distributed-extraction
14 | common
15 | 4.1-SNAPSHOT
16 | DBpedia Distributed Extraction Framework Commons
17 |
18 |
19 |
20 |
21 | net.alchim31.maven
22 | scala-maven-plugin
23 |
24 |
25 |
26 |
27 |
28 |
29 | org.dbpedia.extraction
30 | core
31 | 4.1
32 |
33 |
34 |
35 | org.apache.hadoop
36 | hadoop-client
37 | ${hadoop.version}
38 |
39 |
40 |
41 | org.apache.hadoop
42 | hadoop-common
43 | ${hadoop.version}
44 |
45 |
46 |
47 |
48 |
--------------------------------------------------------------------------------
/common/src/main/scala/org/dbpedia/extraction/util/HadoopConfigurable.scala:
--------------------------------------------------------------------------------
1 | package org.dbpedia.extraction.util
2 |
3 | import org.apache.hadoop.fs.Path
4 | import org.dbpedia.extraction.util.RichHadoopPath.wrapPath
5 | import org.apache.hadoop.conf.Configuration
6 |
7 | /**
8 | * Trait for classes that need to create a Hadoop Configuration.
9 | */
10 | trait HadoopConfigurable
11 | {
12 | /** Path to hadoop core-site.xml */
13 | protected val hadoopCoreConf: String
14 |
15 | /** Path to hadoop hdfs-site.xml */
16 | protected val hadoopHdfsConf: String
17 |
18 | /** Path to hadoop mapred-site.xml */
19 | protected val hadoopMapredConf: String
20 |
21 | /** Hadoop Configuration. This is implicit because RichHadoopPath operations need it. */
22 | implicit lazy val hadoopConf =
23 | {
24 | val hadoopConf = new Configuration()
25 |
26 | if (hadoopCoreConf != null)
27 | hadoopConf.addResource(new Path(hadoopCoreConf))
28 | if (hadoopHdfsConf != null)
29 | hadoopConf.addResource(new Path(hadoopHdfsConf))
30 | if (hadoopMapredConf != null)
31 | hadoopConf.addResource(new Path(hadoopMapredConf))
32 |
33 | hadoopConf
34 | }
35 |
36 | /**
37 | * Checks if a Path exists.
38 | *
39 | * @param path Option[Path] if this is None, pathMustExist has no effect.
40 | * @param pathMustExist Boolean to ensure that the Path, if obtained, actually exists.
41 | * @throws RuntimeException if Option[Path] is defined but the path does not exist
42 | * @return the Option[Path] given as input
43 | */
44 | def checkPathExists(path: Option[Path], pathMustExist: Boolean): Option[Path] =
45 | {
46 | // If pathMustExist is set to true, and path is defined but it does not exist, throw an error.
47 | if (pathMustExist && path.isDefined && !path.get.exists)
48 | {
49 | val hadoopHint = if (hadoopCoreConf == null || hadoopHdfsConf == null || hadoopMapredConf == null) " Make sure you configured Hadoop correctly and the directory exists on the configured file system." else ""
50 | sys.error("Dir " + path.get.getSchemeWithFileName + " does not exist." + hadoopHint)
51 | }
52 | path
53 | }
54 | }
55 |
--------------------------------------------------------------------------------
/common/src/main/scala/org/dbpedia/extraction/util/RichHadoopPath.scala:
--------------------------------------------------------------------------------
1 | package org.dbpedia.extraction.util
2 |
3 | import java.io.{IOException, FileNotFoundException,OutputStream, InputStream}
4 | import org.apache.hadoop.fs.{FileSystem, Path}
5 | import org.apache.hadoop.conf.Configuration
6 | import java.nio.file.NotDirectoryException
7 | import scala.language.implicitConversions
8 |
9 | object RichHadoopPath {
10 |
11 | implicit def wrapPath(path: Path)(implicit hadoopConf: Configuration) = new RichHadoopPath(path, hadoopConf)
12 |
13 | implicit def toPath(path: String) = new Path(path)
14 |
15 | }
16 |
17 | /**
18 | * This class lets us use org.apache.hadoop.fs.Path seamlessly wherever a FileLike is used.
19 | * Defines additional methods on Path by using an implicit Configuration.
20 | */
21 | class RichHadoopPath(path: Path, conf: Configuration) extends FileLike[Path] {
22 |
23 | private val fs: FileSystem = path.getFileSystem(conf)
24 |
25 | override def toString: String = path.toString
26 |
27 | override def name: String = path.getName
28 |
29 | /**
30 | * @throws NotDirectoryException if the path is not a directory
31 | * @throws FileNotFoundException if the path does not exist
32 | */
33 | override def hasFiles: Boolean = {
34 | isDirectory match {
35 | // Not a directory?
36 | case false => throw new NotDirectoryException(path.toString)
37 | // Contains files?
38 | case true => fs.listStatus(path).nonEmpty
39 | }
40 | }
41 |
42 | override def delete(recursive: Boolean = false): Unit = {
43 | if(!fs.delete(path, recursive))
44 | throw new IOException("failed to delete path ["+path+"]")
45 | }
46 |
47 | override def resolve(name: String): Path = new Path(path, name)
48 |
49 | override def exists: Boolean = fs.exists(path)
50 |
51 | // TODO: more efficient type than List?
52 | override def names: List[String] = names("*")
53 |
54 | // TODO: more efficient type than List?
55 | def names(glob: String): List[String] = list(glob).map(_.getName)
56 |
57 | // TODO: more efficient type than List?
58 | override def list: List[Path] = list("*")
59 |
60 | // TODO: more efficient type than List?
61 | def list(glob: String): List[Path] = {
62 | val list = fs.globStatus(new Path(path, glob)).map(_.getPath).toList
63 | if(list.isEmpty) throw new IOException("failed to list files in ["+path+"]")
64 | list
65 | }
66 |
67 | override def size: Long = fs.getContentSummary(path).getLength
68 |
69 | override def isFile: Boolean = fs.isFile(path)
70 |
71 | override def isDirectory: Boolean = fs.getFileStatus(path).isDirectory
72 |
73 | override def inputStream(): InputStream = fs.open(path)
74 |
75 | override def outputStream(append: Boolean = false): OutputStream = if(append) fs.append(path) else fs.create(path)
76 |
77 | def mkdirs(): Boolean = fs.mkdirs(path)
78 |
79 | def getSchemeWithFileName: String = fs.getScheme + "://" + path.toUri.getPath
80 | }
81 |
--------------------------------------------------------------------------------
/download/pom.xml:
--------------------------------------------------------------------------------
1 |
2 |
5 | 4.0.0
6 |
7 |
8 | org.dbpedia
9 | distributed-extraction
10 | 4.1-SNAPSHOT
11 |
12 |
13 | org.dbpedia.distributed-extraction
14 | download
15 | 4.1-SNAPSHOT
16 | DBpedia Distributed Dump Downloader
17 |
18 |
19 |
20 |
21 |
22 | org.apache.maven.plugins
23 | maven-shade-plugin
24 | 2.2
25 |
26 |
27 |
28 | *:*
29 |
30 | META-INF/*.SF
31 | META-INF/*.DSA
32 | META-INF/*.RSA
33 |
34 |
35 |
36 |
37 |
38 |
39 | downloads-jar
40 | package
41 |
42 | shade
43 |
44 |
45 |
46 |
47 |
51 |
52 | reference.conf
53 |
54 |
56 |
57 | worker.Main
58 |
59 |
60 |
61 |
62 |
63 |
64 |
65 |
66 |
67 | net.alchim31.maven
68 | scala-maven-plugin
69 |
70 |
71 |
72 |
73 | seq-download
74 | org.dbpedia.extraction.dump.download.Download
75 |
84 |
85 |
86 |
87 |
88 | download
89 | org.dbpedia.extraction.dump.download.DistDownload
90 |
99 |
100 |
101 |
102 |
103 |
104 |
105 |
106 |
107 |
108 |
109 | org.dbpedia.extraction
110 | dump
111 | 4.1
112 |
113 |
114 |
115 | org.dbpedia.distributed-extraction
116 | common
117 | 4.1-SNAPSHOT
118 |
119 |
120 |
121 | org.apache.hadoop
122 | hadoop-client
123 | ${hadoop.version}
124 |
125 |
126 |
127 | com.typesafe.akka
128 | akka-contrib_2.10
129 | 2.3.0
130 |
131 |
132 |
133 | com.typesafe.akka
134 | akka-testkit_2.10
135 | 2.3.0
136 |
137 |
138 |
139 | com.jcraft
140 | jsch
141 | 0.1.51
142 |
143 |
144 |
145 |
--------------------------------------------------------------------------------
/download/src/main/resources/application.conf:
--------------------------------------------------------------------------------
1 | akka {
2 | actor.provider = "akka.cluster.ClusterActorRefProvider"
3 |
4 | remote.netty.tcp {
5 | hostname="127.0.0.1"
6 | port=0
7 | }
8 |
9 | extensions = ["akka.contrib.pattern.ClusterReceptionistExtension"]
10 |
11 | akka.cluster.auto-down = on
12 |
13 | auto-down-unreachable-after = 10s
14 |
15 | log-dead-letters = 0
16 |
17 | log-dead-letters-during-shutdown = off
18 | }
19 |
--------------------------------------------------------------------------------
/download/src/main/resources/reference.conf:
--------------------------------------------------------------------------------
1 | akka {
2 | actor.provider = "akka.cluster.ClusterActorRefProvider"
3 |
4 | remote.netty.tcp {
5 | hostname="127.0.0.1"
6 | port=0
7 | }
8 |
9 | extensions = ["akka.contrib.pattern.ClusterReceptionistExtension"]
10 |
11 | akka.cluster.auto-down = on
12 |
13 | auto-down-unreachable-after = 10s
14 |
15 | log-dead-letters = 0
16 |
17 | log-dead-letters-during-shutdown = off
18 | }
19 |
--------------------------------------------------------------------------------
/download/src/main/scala/org/dbpedia/extraction/dump/download/ActoredCounter.scala:
--------------------------------------------------------------------------------
1 | package org.dbpedia.extraction.dump.download
2 |
3 | import akka.actor.ActorRef
4 | import java.io.InputStream
5 | import java.net.URLConnection
6 | import org.dbpedia.extraction.util.CountingInputStream
7 | import org.dbpedia.extraction.dump.download.actors.message.DownloaderProgressMessage
8 | import DownloaderProgressMessage.{Start, Read}
9 | import Counter.getContentLength
10 |
11 | /**
12 | * A Downloader mixin to be used with DownloadProgressTracker. Sends Start/Read messages to
13 | * the DownloadProgressTracker actor reference.
14 | *
15 | * @see org.dbpedia.extraction.dump.download.actors.DownloadProgressTracker
16 | */
17 | trait ActoredCounter extends Downloader
18 | {
19 | /**
20 | * Reference to a DownloadProgressTracker actor.
21 | */
22 | val progressActor: ActorRef
23 |
24 | protected abstract override def inputStream(conn: URLConnection): InputStream = {
25 | def logger(bytesRead: Long, close: Boolean): Unit = progressActor ! Read(bytesRead)
26 | progressActor ! Start(getContentLength(conn)) // Signal start of download with the total file size in bytes
27 | new CountingInputStream(super.inputStream(conn), logger)
28 | }
29 |
30 | }
31 |
--------------------------------------------------------------------------------
/download/src/main/scala/org/dbpedia/extraction/dump/download/DistDownload.scala:
--------------------------------------------------------------------------------
1 | package org.dbpedia.extraction.dump.download
2 |
3 | import akka.actor._
4 | import akka.cluster.Cluster
5 | import akka.contrib.pattern.{ClusterSingletonManager, ClusterClient}
6 | import com.typesafe.config.ConfigFactory
7 | import scala.concurrent.duration._
8 | import scala.language.postfixOps
9 | import org.dbpedia.extraction.dump.download.actors._
10 | import akka.actor.RootActorPath
11 | import scala.Some
12 | import java.util.logging.Logger
13 | import org.dbpedia.extraction.util.RemoteExecute
14 | import org.dbpedia.extraction.dump.download.actors.DownloadClient.Finished
15 |
16 | /**
17 | * Distributed Wikipedia dump downloader.
18 | *
19 | * While running this on a cluster, make sure that all configuration variables (including the paths to configuration files)
20 | * are valid on all nodes of the cluster, i.e. the configuration files need to be present on the worker nodes too.
21 | */
22 | object DistDownload extends RemoteExecute
23 | {
24 | val logger = Logger.getLogger(classOf[DistDownload].getName)
25 |
26 | def main(args: Array[String]): Unit =
27 | {
28 | val config = new DistDownloadConfig(args)
29 | if (config.isMaster)
30 | {
31 | val cluster = new ClusterStartup(config)
32 |
33 | // Start master on the driver node
34 | val joinAddress = cluster.startMaster(None, "driver")
35 | Thread.sleep(5000) // wait a few sec for master to start up
36 |
37 | (config.privateKey, config.sshPassphrase) match
38 | {
39 | case (Some(identity), Some(passphrase)) => // both private key and passphrase are provided
40 | addIdentity(identity, passphrase)
41 | case (Some(identity), None) => // passphrase is empty
42 | addIdentity(identity)
43 | case _ => // no private key provided
44 | }
45 |
46 | for (host <- config.slaves)
47 | {
48 | val session = createSession(config.userName, host)
49 | for (worker <- 1 to config.workersPerSlave)
50 | {
51 | val command = """cd %s/download;mkdir -p ../logs;nohup ../run download join=%s %s > ../logs/%s-%d.out &""".
52 | format(config.homeDir, joinAddress, args.mkString(" "), host, worker)
53 | println(command)
54 | println(execute(session, command))
55 | }
56 | session.disconnect()
57 | }
58 |
59 | // Start download client and result/progress consumer
60 | val client = cluster.startFrontend(joinAddress)
61 | val dumpFiles = new DumpFileSource(config.languages,
62 | config.baseUrl,
63 | config.baseDir,
64 | config.wikiName,
65 | config.ranges,
66 | config.dateRange,
67 | config.dumpCount)
68 | for(dumpFile <- dumpFiles)
69 | client ! dumpFile
70 |
71 | client ! Finished
72 | }
73 | else
74 | {
75 | val cluster = new ClusterStartup(config)
76 | cluster.startWorker(config.joinAddress.get)
77 | }
78 | }
79 | }
80 |
81 | class DistDownload
82 |
83 | class ClusterStartup(config: DistDownloadConfig)
84 | {
85 | def systemName = "Workers"
86 |
87 | private def progressReportTimeout = config.progressReportInterval + 2.seconds
88 |
89 | def startMaster(joinAddressOption: Option[Address], role: String): Address =
90 | {
91 | val conf = ConfigFactory.parseString( s"""akka.cluster.roles=[$role]\nakka.remote.netty.tcp.hostname="${config.master}"""").
92 | withFallback(ConfigFactory.load())
93 | val system = ActorSystem(systemName, conf)
94 | val joinAddress = joinAddressOption.getOrElse(Cluster(system).selfAddress)
95 | Cluster(system).join(joinAddress)
96 | system.actorOf(
97 | ClusterSingletonManager.props(Master.props(
98 | progressReportTimeout,
99 | config.mirrors,
100 | config.threadsPerMirror
101 | ),
102 | "active", PoisonPill, Some(role)
103 | ),
104 | "master")
105 | joinAddress
106 | }
107 |
108 | def startFrontend(joinAddress: akka.actor.Address): ActorRef =
109 | {
110 | val conf = ConfigFactory.parseString( s"""akka.remote.netty.tcp.hostname="${config.master}"""").
111 | withFallback(ConfigFactory.load())
112 | val system = ActorSystem(systemName, conf)
113 | Cluster(system).join(joinAddress)
114 |
115 | val client = system.actorOf(Props[DownloadClient], "client")
116 | system.actorOf(Props[DownloadResultConsumer], "consumer")
117 | client
118 | }
119 |
120 | def startWorker(contactAddress: akka.actor.Address) =
121 | {
122 | val conf = ConfigFactory.load()
123 | val system = ActorSystem(systemName, conf)
124 | val initialContacts = Set(system.actorSelection(RootActorPath(contactAddress) / "user" / "receptionist"))
125 | val clusterClient = system.actorOf(ClusterClient.props(initialContacts), "clusterClient")
126 | system.actorOf(
127 | Worker.props(clusterClient,
128 | DownloadJobRunner.props(config.progressReportInterval,
129 | config.hadoopConf,
130 | config.localTempDir,
131 | config.unzip
132 | ),
133 | config.maxDuplicateProgress
134 | ),
135 | "worker"
136 | )
137 | }
138 | }
--------------------------------------------------------------------------------
/download/src/main/scala/org/dbpedia/extraction/dump/download/DumpFileSource.scala:
--------------------------------------------------------------------------------
1 | package org.dbpedia.extraction.dump.download
2 |
3 | import org.dbpedia.extraction.util.{Language, WikiInfo}
4 | import scala.io.{Source, Codec}
5 | import java.net.URL
6 | import scala.collection.mutable
7 | import scala.collection.immutable.SortedSet
8 | import scala.collection.mutable.{ListBuffer, Set}
9 | import org.apache.hadoop.fs.Path
10 | import org.dbpedia.extraction.dump.download.actors.message.DumpFile
11 |
12 | /**
13 | * Generate DumpFile objects each representing a specific wiki file to download.
14 | * Most of the code was taken from LanguageDownloader (extraction-framework).
15 | *
16 | * TODO: Integrate this to LanguageDownloader and reuse it here (reduce code duplication)?
17 | */
18 | class DumpFileSource(languages: mutable.HashMap[Language, mutable.Set[(String, Boolean)]],
19 | baseUrl: URL,
20 | baseDir: Path,
21 | wikiSuffix: String,
22 | ranges: mutable.HashMap[(Int, Int), mutable.Set[(String, Boolean)]],
23 | dateRange: (String, String),
24 | dumpCount: Int)
25 | extends Traversable[DumpFile] with Iterable[DumpFile]
26 | {
27 | private val DateLink = """<a href="(\d{8})/">""".r // matches links to the dump date directories on the index page, e.g. 20140614/
28 | private val list = new ListBuffer[DumpFile]()
29 |
30 | override def iterator: Iterator[DumpFile] = list.iterator
31 |
32 | override def foreach[U](func: DumpFile => U)
33 | {
34 | if(list.isEmpty)
35 | {
36 | // resolve page count ranges to languages
37 | if (ranges.nonEmpty)
38 | {
39 | val wikis = WikiInfo.fromURL(WikiInfo.URL, Codec.UTF8)
40 |
41 | // for all wikis in one of the desired ranges...
42 | for (((from, to), files) <- ranges; wiki <- wikis; if from <= wiki.pages && wiki.pages <= to)
43 | {
44 | // ...add files for this range to files for this language
45 | languages.getOrElseUpdate(wiki.language, new mutable.HashSet[(String, Boolean)]) ++= files
46 | }
47 | }
48 |
49 | // sort them to have reproducible behavior
50 | val languageKeys = SortedSet.empty[Language] ++ languages.keys
51 | languageKeys.foreach
52 | {
53 | lang =>
54 | val done = languageKeys.until(lang)
55 | val todo = languageKeys.from(lang)
56 | println("done: " + done.size + " - " + done.map(_.wikiCode).mkString(","))
57 | println("todo: " + todo.size + " - " + languageKeys.from(lang).map(_.wikiCode).mkString(","))
58 | for(dumpFile <- LanguageDumpFileSource(lang))
59 | list += dumpFile
60 | }
61 | }
62 | list foreach func
63 | }
64 |
65 | private class LanguageDumpFileSource(language: Language) extends Traversable[DumpFile]
66 | {
67 | val wiki = language.filePrefix + wikiSuffix
68 | val mainPage = new URL(baseUrl, wiki + "/")
69 | val fileNames = languages(language)
70 |
71 | override def foreach[U](func: DumpFile => U)
72 | {
73 | forDates(dateRange, dumpCount, func)
74 | }
75 |
76 | def forDates[U](dateRange: (String, String), dumpCount: Int, func: DumpFile => U)
77 | {
78 | val (firstDate, lastDate) = dateRange
79 |
80 | var dates = SortedSet.empty(Ordering[String].reverse)
81 | for (line <- Source.fromURL(mainPage).getLines())
82 | DateLink.findAllIn(line).matchData.foreach(dates += _.group(1))
83 |
84 | if (dates.size == 0) throw new Exception("found no date - " + mainPage + " is probably broken or unreachable. check your network / proxy settings.")
85 |
86 | var count = 0
87 |
88 | // find date pages that have all files we want
89 | for (date <- dates)
90 | {
91 | if (count < dumpCount && date >= firstDate && date <= lastDate && forDate(date, func)) count += 1
92 | }
93 |
94 | if (count == 0) throw new Exception("found no date on " + mainPage + " in range " + firstDate + "-" + lastDate + " with files " + fileNames.mkString(","))
95 | }
96 |
97 | def forDate[U](date: String, func: DumpFile => U): Boolean =
98 | {
99 | val datePage = new URL(mainPage, date + "/") // here we could use index.html
100 | val datePageLines = Source.fromURL(datePage).getLines().toTraversable
101 |
102 | // Collect regexes
103 | val regexes = fileNames.filter(_._2).map(_._1)
104 | val fileNamesFromRegexes = expandFilenameRegex(date, datePageLines, regexes)
105 | val staticFileNames = fileNames.filter(!_._2).map(_._1)
106 |
107 | val allFileNames = fileNamesFromRegexes ++ staticFileNames
108 | // val urls = allFileNames.map(fileName => new URL(baseURL, wiki + "/" + date + "/" + wiki + "-" + date + "-" + fileName))
109 | val dumpFiles = allFileNames.map(fileName => DumpFile(baseDir.toUri.getPath, wikiSuffix, language.wikiCode, date, fileName))
110 |
111 |
112 | // all the links we need - only for non regexes (we have already checked regex ones)
113 | val links = new mutable.HashMap[String, String]()
114 | for (fileName <- staticFileNames) links(fileName) = ""
115 | // Here we should set ""
116 | // but "\"/"+wiki+"/"+date+"/" does not exists in incremental updates, keeping the trailing "\">" should do the trick
117 | // for (fileName <- fileNames) links(fileName) = wiki+"-"+date+"-"+fileName+"\">"
118 |
119 | for (line <- datePageLines)
120 | links.foreach
121 | {
122 | case (fileName, link) => if (line contains link) links -= fileName
123 | }
124 |
125 | // did we find them all?
126 | // Fail if:
127 | // - the user specified static file names and not all of them have been found
128 | // OR
129 | // - the user specified regular expressions and no file has been found that satisfied them
130 | if ((staticFileNames.nonEmpty && links.nonEmpty) || (regexes.nonEmpty && fileNamesFromRegexes.isEmpty))
131 | {
132 | // TODO: Fix message
133 | val staticFilesMessage = if (links.nonEmpty) " has no links to [" + links.keys.mkString(",") + "]" else ""
134 | val dynamicFilesMessage = if (fileNamesFromRegexes.isEmpty && regexes.nonEmpty) " has no links that satisfies [" + regexes.mkString(",") + "]" else ""
135 | println("date page '" + datePage + staticFilesMessage + dynamicFilesMessage)
136 | false
137 | }
138 | else
139 | {
140 | println("date page '" + datePage + "' has all files [" + allFileNames.mkString(",") + "]")
141 | // run closure over all DumpFiles
142 | for (dumpFile <- dumpFiles) func(dumpFile)
143 | true
144 | }
145 | }
146 |
147 | private def expandFilenameRegex(date: String, index: Traversable[String], filenameRegexes: mutable.Set[String]): mutable.Set[String] =
148 | {
149 | // Prepare regexes
150 | val regexes = filenameRegexes.map(regex => ("").r)
151 |
152 | // Result
153 | val filenames = Set[String]()
154 |
155 | for (line <- index)
156 | regexes.foreach(regex => regex.findAllIn(line).matchData.foreach(filenames += _.group(1)))
157 |
158 | filenames
159 | }
160 | }
161 |
162 | private object LanguageDumpFileSource
163 | {
164 | def apply(language: Language) = new LanguageDumpFileSource(language)
165 | }
166 |
167 | }
168 |
--------------------------------------------------------------------------------
/download/src/main/scala/org/dbpedia/extraction/dump/download/actors/DownloadClient.scala:
--------------------------------------------------------------------------------
1 | package org.dbpedia.extraction.dump.download.actors
2 |
3 | import akka.actor.{ActorLogging, Actor}
4 | import java.util.UUID
5 | import scala.concurrent.duration._
6 | import akka.pattern._
7 | import akka.contrib.pattern.{DistributedPubSubMediator, DistributedPubSubExtension}
8 | import akka.contrib.pattern.DistributedPubSubMediator.Send
9 | import org.dbpedia.extraction.dump.download.actors.message.GeneralMessage.{MasterQueueEmpty, ShutdownCluster}
10 | import akka.util.Timeout
11 | import org.dbpedia.extraction.dump.download.actors.message.{DumpFile, DownloadJob}
12 |
13 | /**
14 | * A client actor used to submit download jobs to the master. To submit a job, a DumpFile object is sent as message.
15 | */
16 | class DownloadClient extends Actor with ActorLogging
17 | {
18 |
19 | import DownloadClient._
20 | import context.dispatcher
21 |
22 | def scheduler = context.system.scheduler
23 |
24 | def nextDownloadId(): String = UUID.randomUUID().toString
25 |
26 | val mediator = DistributedPubSubExtension(context.system).mediator
27 | mediator ! DistributedPubSubMediator.Subscribe(Master.General, self)
28 |
29 | implicit val timeout = Timeout(10.seconds)
30 |
31 | var canShutDownCluster = false
32 |
33 | def receive =
34 | {
35 | case Finished =>
36 | // send this when no more DumpFiles are to be added - ready for shutdown
37 | canShutDownCluster = true
38 |
39 | case MasterQueueEmpty =>
40 | if (canShutDownCluster) self ! ShutdownCluster
41 |
42 | case ShutdownCluster =>
43 | mediator ! Send("/user/master/active", ShutdownCluster, localAffinity = false)
44 | context.stop(self)
45 | context.system.shutdown()
46 | context.become(shuttingDown)
47 |
48 | case file: DumpFile =>
49 | self ! DownloadJob(nextDownloadId(), file)
50 |
51 | case job: DownloadJob =>
52 | (mediator ? Send("/user/master/active", job, localAffinity = false)) map
53 | {
54 | case Master.Ack(_) =>
55 | log.info("Job accepted by master: {}", job)
56 | } recover
57 | {
58 | case _ =>
59 | log.info("Job not accepted, retry after a while")
60 | scheduler.scheduleOnce(3.seconds, self, job)
61 | }
62 | }
63 |
64 | def shuttingDown: Receive =
65 | {
66 | case _ => // ignore all messages, shutting down cluster.
67 | }
68 | }
69 |
70 | object DownloadClient
71 | {
72 | case object Finished
73 | }
74 |
--------------------------------------------------------------------------------
/download/src/main/scala/org/dbpedia/extraction/dump/download/actors/DownloadJobRunner.scala:
--------------------------------------------------------------------------------
1 | package org.dbpedia.extraction.dump.download.actors
2 |
3 | import akka.actor.{ActorLogging, Props, Actor}
4 | import akka.pattern.ask
5 | import akka.util.Timeout
6 | import org.dbpedia.extraction.dump.download.{Unzip, ActoredCounter, FileDownloader}
7 | import org.dbpedia.extraction.util.{Language, Finder}
8 | import java.net.URL
9 | import org.dbpedia.extraction.util.RichHadoopPath.wrapPath
10 | import org.apache.hadoop.fs.Path
11 | import org.apache.hadoop.conf.Configuration
12 | import java.io.File
13 | import scala.concurrent.Future
14 | import scala.concurrent.duration._
15 | import scala.language.postfixOps
16 | import org.dbpedia.extraction.dump.download.actors.Worker.DownloadComplete
17 | import org.dbpedia.extraction.dump.download.actors.message.{DumpFile, DownloadJob, MirroredDownloadJob, DownloaderProgressMessage}
18 | import DownloaderProgressMessage.{ProgressEnd, Stop}
19 | import scala.util.{Failure, Success}
20 |
21 | /**
22 | * This actor is used by Worker to run a download job.
23 | *
24 | * @param progressInterval Download progress report interval
25 | * @param hadoopConfiguration Hadoop Configuration
26 | * @param tempDir temporary directory on local file system to download to (before being moved to HDFS)
27 | * @param unzip true if file should be unzipped while downloading, false otherwise
28 | */
29 | class DownloadJobRunner(progressInterval: FiniteDuration, hadoopConfiguration: Configuration, tempDir: File, unzip: Boolean) extends Actor with ActorLogging
30 | {
31 | implicit private val _hadoopConfiguration = hadoopConfiguration
32 | implicit private val progressStopTimeout = Timeout(5 seconds)
33 |
34 | val progress = context.watch(context.actorOf(Props(classOf[DownloadProgressTracker], context.parent, progressInterval), "progress"))
35 |
36 | class Downloader extends FileDownloader with ActoredCounter
37 | {
38 | override val progressActor = progress
39 | }
40 |
41 | val downloader =
42 | if (unzip) new Downloader with Unzip
43 | else new Downloader
44 |
45 | def receive =
46 | {
47 | case job@MirroredDownloadJob(mirror, DownloadJob(_, DumpFile(base, wikiName, lang, date, fileName))) =>
48 | log.debug("Received download job from Worker: {}", job)
49 | val s = sender()
50 | import context.dispatcher
51 |
52 | val baseDir = new Path(base)
53 | val finder = new Finder[Path](baseDir, Language(lang), wikiName)
54 | val wiki = finder.wikiName
55 | val dateDir = baseDir.resolve(wiki).resolve(date)
56 | if (!dateDir.exists && !dateDir.mkdirs) throw new Exception("Target directory [" + dateDir.getSchemeWithFileName + "] does not exist and cannot be created")
57 | if (!tempDir.exists && !tempDir.mkdirs) throw new Exception("Local temporary directory [" + tempDir + "] does not exist and cannot be created")
58 |
59 | val url = new URL(mirror, s"$wiki/$date/$wiki-$date-$fileName")
60 | val targetFile = new File(tempDir, downloader.targetName(url))
61 | if(targetFile.exists) targetFile.delete() // delete file in temp dir if it already exists
62 |
63 | Future(downloader.downloadTo(url, tempDir)).
64 | onComplete
65 | {
66 | case Success(file) =>
67 | // file was downloaded to tempDir; copy it to Hadoop FS.
68 | val fs = dateDir.getFileSystem(hadoopConfiguration)
69 | val outputPath = dateDir.resolve(file.getName)
70 | fs.moveFromLocalFile(new Path(file.toURI), outputPath)
71 | progress ? Stop onSuccess
72 | {
73 | case ProgressEnd(totalBytes) =>
74 | s ! DownloadComplete(outputPath.getSchemeWithFileName, totalBytes) // Tell worker that download is finished
75 | }
76 | case Failure(t) =>
77 | log.info(t.getMessage)
78 | progress ! Stop
79 | }
80 | }
81 | }
82 |
83 | object DownloadJobRunner
84 | {
85 | def props(progressInterval: FiniteDuration, hadoopConfiguration: Configuration, tempDir: File, unzip: Boolean = false): Props =
86 | Props(classOf[DownloadJobRunner], progressInterval, hadoopConfiguration, tempDir, unzip)
87 | }
88 |
--------------------------------------------------------------------------------
/download/src/main/scala/org/dbpedia/extraction/dump/download/actors/DownloadProgressTracker.scala:
--------------------------------------------------------------------------------
1 | package org.dbpedia.extraction.dump.download.actors
2 |
3 | import akka.actor._
4 | import java.util.concurrent.atomic.AtomicLong
5 | import scala.concurrent.duration.FiniteDuration
6 | import org.dbpedia.extraction.dump.download.actors.message.{DownloaderProgressMessage, WorkerProgressMessage}
7 | import scala.Some
8 |
9 | /**
10 | * An actor that receives Start and Read messages, and relays ProgressStart and Progress messages to the client.
11 | * This is used to keep track of download progress - the number of bytes being read in real time.
12 | *
13 | * @param client The actor to send progress messages to
14 | * @param notifyInterval The time interval at which progress reports will be sent to client
15 | */
16 | class DownloadProgressTracker(client: ActorRef, notifyInterval: FiniteDuration) extends Actor with ActorLogging
17 | {
18 | import WorkerProgressMessage._
19 | import DownloaderProgressMessage._
20 | import DownloadProgressTracker._
21 | import context.dispatcher
22 |
23 | def scheduler = context.system.scheduler
24 |
25 | private val bytesRead = new AtomicLong()
26 |
27 | /** This task is used to send Progress messages to client at each interval */
28 | private var progressTaskOption: Option[Cancellable] = None
29 |
30 | override def postStop() = progressTaskOption.foreach(_.cancel())
31 |
32 | def receive =
33 | {
34 | case Start(total) => // Sent by ActoredCounter to signal start of download
35 | if (0 != bytesRead.get() || progressTaskOption.isDefined)
36 | {
37 | log.info("ProgressTracker is already started!")
38 | }
39 | else
40 | {
41 | progressTaskOption = Some(scheduler.schedule(notifyInterval, notifyInterval, self, Tick))
42 | client ! ProgressStart(total)
43 | }
44 |
45 | case Read(bytes) => // Sent by ActoredCounter to signal bytes read
46 | bytesRead.set(bytes)
47 |
48 | case Stop =>
49 | (progressTaskOption, bytesRead.get) match
50 | {
51 | case (Some(progressTask), b) if b != 0 =>
52 | sender ! ProgressEnd(bytesRead.get())
53 | bytesRead.set(0)
54 |
55 | progressTask.cancel()
56 | progressTaskOption = None
57 |
58 | case _ =>
59 | log.info("ProgressTracker is already stopped!")
60 | }
61 |
62 | case Tick =>
63 | client ! Progress(bytesRead.get())
64 | }
65 | }
66 |
67 | object DownloadProgressTracker
68 | {
69 | case object Tick
70 | }
--------------------------------------------------------------------------------
/download/src/main/scala/org/dbpedia/extraction/dump/download/actors/DownloadResultConsumer.scala:
--------------------------------------------------------------------------------
1 | package org.dbpedia.extraction.dump.download.actors
2 |
3 | import akka.actor.{ActorLogging, Actor}
4 | import org.dbpedia.extraction.dump.download.actors.message.GeneralMessage.ShutdownCluster
5 | import akka.contrib.pattern.{DistributedPubSubExtension, DistributedPubSubMediator}
6 | import org.dbpedia.extraction.dump.download.actors.message.WorkerProgressMessage.{Progress, ProgressStart}
7 | import org.dbpedia.extraction.dump.download.actors.message.{DownloadProgress, DownloadResult, DownloadJob, MirroredDownloadJob}
8 |
9 | /**
10 | * This actor is used to print download progress logging messages on the driver/master node.
11 | * Hooks into Master.ResultsTopic and consumes DownloadResult messages.
12 | *
13 | * TODO: Refactor the code to pretty-print better progress results like ByteLogger. Maintain list of jobs
14 | * and log percentage of work done etc.
15 | */
16 | class DownloadResultConsumer extends Actor with ActorLogging
17 | {
18 | var jobs = Map[String, MirroredDownloadJob]()
19 | val mediator = DistributedPubSubExtension(context.system).mediator
20 | mediator ! DistributedPubSubMediator.Subscribe(Master.General, self)
21 | mediator ! DistributedPubSubMediator.Subscribe(Master.ProgressTopic, self)
22 | mediator ! DistributedPubSubMediator.Subscribe(Master.ResultsTopic, self)
23 |
24 | def receive =
25 | {
26 | case _: DistributedPubSubMediator.SubscribeAck =>
27 |
28 | case job @ MirroredDownloadJob(_, DownloadJob(downloadId, _)) =>
29 | log.info("Starting job: {}", job)
30 | jobs += (downloadId -> job)
31 |
32 | case DownloadResult(downloadId, outputPath, bytes) =>
33 | log.info("{}: {} bytes downloaded to {}", downloadId, bytes, outputPath)
34 |
35 | case DownloadProgress(downloadId, p @ ProgressStart(bytes)) =>
36 | log.info("{}: {}", jobs(downloadId), p)
37 |
38 | case DownloadProgress(downloadId, p @ Progress(bytes)) =>
39 | log.info("{}: {}", jobs(downloadId), p)
40 |
41 | case ShutdownCluster =>
42 | context.stop(self)
43 | context.system.shutdown()
44 | }
45 | }
46 |
--------------------------------------------------------------------------------
/download/src/main/scala/org/dbpedia/extraction/dump/download/actors/Master.scala:
--------------------------------------------------------------------------------
1 | package org.dbpedia.extraction.dump.download.actors
2 |
3 | import scala.concurrent.duration.{Deadline, FiniteDuration}
4 | import akka.actor._
5 | import akka.contrib.pattern.{DistributedPubSubMediator, DistributedPubSubExtension}
6 | import scala.collection.immutable.Queue
7 | import org.dbpedia.extraction.dump.download.actors.message.GeneralMessage.{MasterQueueEmpty, ShutdownCluster}
8 | import org.dbpedia.extraction.dump.download.actors.message._
9 | import java.net.URL
10 | import scala.Some
11 | import akka.contrib.pattern.DistributedPubSubMediator.Put
12 |
13 | /**
14 | * Master/driver node actor. This is responsible for accepting download jobs from a client and dividing jobs
15 | * among the different Workers, keeping track of download jobs, handling failed jobs, shutting down the cluster etc.
16 | *
17 | * @param workTimeout Workers need to send download progress reports within this timeout
18 | * @param mirrors List of wikipedia mirror URLs
19 | * @param threadsPerMirror Number of simultaneous downloads per mirror
20 | */
21 | class Master(workTimeout: FiniteDuration, mirrors: Seq[URL], threadsPerMirror: Int) extends Actor with ActorLogging
22 | {
23 |
24 | import Master._
25 | import MasterWorkerMessage._
26 | import context.dispatcher
27 |
28 | def scheduler = context.system.scheduler
29 |
30 | // The DownloadClient and DownloadResultConsumer communicate with the Master through the DistributedPubSubMediator
31 | val mediator = DistributedPubSubExtension(context.system).mediator
32 |
33 | mediator ! Put(self)
34 |
35 | private var workers = Map[String, WorkerState]()
36 | private var pendingDownloads = Queue[DownloadJob]()
37 | private var downloadIds = Set[String]()
38 |
39 | // Keep track of the number of simultaneous downloads per mirror.
40 | private var mirrorsInUse = (mirrors zip Seq.fill(mirrors.size)(0)).toMap // Mapping mirror URL to number of simultaneous downloads
41 |
42 | val cleanupTask = scheduler.schedule(workTimeout / 2, workTimeout / 2,
43 | self, CleanupTick)
44 |
45 | override def postStop(): Unit = cleanupTask.cancel()
46 |
47 | def receive =
48 | {
49 | case ShutdownCluster =>
50 | if (pendingDownloads.isEmpty) // all downloads have finished?
51 | {
52 | if (workers.isEmpty) // all workers have been unregistered?
53 | {
54 | log.info("Stopping master!")
55 | mediator ! DistributedPubSubMediator.Publish(General, ShutdownCluster)
56 | self ! PoisonPill
57 | context.stop(self)
58 | context.system.shutdown()
59 | }
60 | else
61 | {
62 | workers.foreach // still have registered workers?
63 | {
64 | case (workerId, WorkerState(ref, Idle)) => // send shutdown signal to idle workers and remove them.
65 | ref ! ShutdownCluster
66 | workers -= workerId
67 | case _ => // come back to the busy worker after a period of workTimeout
68 | }
69 | log.debug("Some workers still busy! Cannot stop master yet!")
70 | context.system.scheduler.scheduleOnce(workTimeout, self, ShutdownCluster)
71 | }
72 | }
73 | else
74 | {
75 | log.debug("Some work pending! Cannot stop master yet!")
76 | context.system.scheduler.scheduleOnce(workTimeout, self, ShutdownCluster)
77 | }
78 |
79 | case RemoveWorker(workerId) =>
80 | workers -= workerId
81 |
82 | case p @ ProgressReport(workerId, downloadId, progress) => // Workers send download progress reports at specific intervals
83 | log.debug("Heard from worker {}: {} ", workerId, progress)
84 | mediator ! DistributedPubSubMediator.Publish(ProgressTopic, DownloadProgress(downloadId, progress))
85 | workers.get(workerId) match
86 | {
87 | case Some(s@WorkerState(_, Busy(downloadJob, deadline))) =>
88 | workers += (workerId -> WorkerState(sender, status = Busy(downloadJob, Deadline.now + workTimeout))) // Renew current job deadline
89 | case _ =>
90 | }
91 |
92 | case RegisterWorker(workerId) => // Workers register themselves to the master at specific intervals
93 | if (workers.contains(workerId))
94 | {
95 | workers += (workerId -> workers(workerId).copy(ref = sender))
96 | }
97 | else
98 | {
99 | log.info("Worker registered: {}", workerId)
100 | workers += (workerId -> WorkerState(sender, status = Idle))
101 | if (pendingDownloads.nonEmpty)
102 | sender ! DownloadIsReady
103 | }
104 |
105 | case WorkerRequestsDownload(workerId) =>
106 | if (pendingDownloads.nonEmpty)
107 | {
108 | workers.get(workerId) match
109 | {
110 | case Some(s@WorkerState(_, Idle)) => // is the requesting Worker Idle?
111 | getFreeMirror foreach
112 | {
113 | case url => // We have a free mirror!
114 | val (downloadJob, rest) = pendingDownloads.dequeue
115 | pendingDownloads = rest
116 | val downloadWithMirror = MirroredDownloadJob(url, downloadJob)
117 |
118 | // Publish new download job so that DownloadResultConsumer can keep track of it
119 | mediator ! DistributedPubSubMediator.Publish(ProgressTopic, downloadWithMirror)
120 |
121 | sender ! downloadWithMirror // send new download job back to the Worker that sent the job request
122 | log.info("Giving worker {} a download job {}", workerId, downloadWithMirror)
123 |
124 | mirrorsInUse += (url -> (mirrorsInUse(url) + 1)) // increment no. of simultaneous downloads from this mirror
125 | workers += (workerId -> s.copy(status = Busy(downloadWithMirror, Deadline.now + workTimeout))) // set worker status to Busy
126 | }
127 | case _ =>
128 | }
129 | }
130 |
131 | case DownloadIsDone(workerId, downloadId, outputPath, totalBytes) =>
132 | workers.get(workerId) match
133 | {
134 | case Some(s@WorkerState(_, Busy(downloadJob, _))) if downloadJob.job.downloadId == downloadId =>
135 | log.debug("Download is done: {} => {} bytes written to {} by worker {}", downloadJob, totalBytes, outputPath, workerId)
136 |
137 | val mirror = downloadJob.baseUrl
138 | mirrorsInUse += (mirror -> (mirrorsInUse(mirror) - 1)) // decrement no. of threads to mirror
139 | workers += (workerId -> s.copy(status = Idle)) // set worker status to Idle
140 |
141 | // publish download result for DownloadResultConsumer to read
142 | mediator ! DistributedPubSubMediator.Publish(ResultsTopic, DownloadResult(downloadJob, outputPath, totalBytes))
143 |
144 | sender ! MasterWorkerMessage.Ack(downloadId) // Ack to worker
145 | case _ =>
146 | if (downloadIds.contains(downloadId))
147 | {
148 | // previous Ack was lost, confirm again that this is done
149 | sender ! MasterWorkerMessage.Ack(downloadId)
150 | }
151 | }
152 |
153 | case DownloadFailed(workerId, downloadId) =>
154 | workers.get(workerId) match
155 | {
156 | case Some(s@WorkerState(_, Busy(downloadJob, _))) if downloadJob.job.downloadId == downloadId =>
157 | log.info("Download failed: {}", downloadJob)
158 |
159 | val mirror = downloadJob.baseUrl
160 | mirrorsInUse += (mirror -> (mirrorsInUse(mirror) - 1))
161 | workers += (workerId -> s.copy(status = Idle))
162 |
163 | pendingDownloads = pendingDownloads enqueue downloadJob.job // put the download back into queue
164 | notifyWorkers()
165 | case _ =>
166 | }
167 |
168 | case job: DownloadJob => // client sent a new DownloadJob
169 | // idempotent
170 | if (downloadIds.contains(job.downloadId))
171 | {
172 | sender ! Master.Ack(job.downloadId)
173 | }
174 | else
175 | {
176 | log.info("Accepted download: {}", job)
177 | pendingDownloads = pendingDownloads enqueue job
178 | downloadIds += job.downloadId
179 | sender ! Master.Ack(job.downloadId)
180 | notifyWorkers()
181 | }
182 |
183 | case CleanupTick => // runs at fixed intervals, removes timed out jobs
184 | var hasBusy = false
185 | for ((workerId, s@WorkerState(_, Busy(downloadJob, timeout))) <- workers)
186 | {
187 | hasBusy = true
188 | if (timeout.isOverdue)
189 | {
190 | log.info("Download timed out: {}", downloadJob)
191 | workers -= workerId
192 | pendingDownloads = pendingDownloads enqueue downloadJob.job
193 | notifyWorkers()
194 | }
195 | }
196 | // publish MasterQueueEmpty if there are no pending downloads AND no workers are busy
197 | if(!hasBusy && pendingDownloads.isEmpty) mediator ! DistributedPubSubMediator.Publish(General, MasterQueueEmpty)
198 | }
199 |
200 | def getFreeMirror: Option[URL] =
201 | mirrorsInUse.find(_._2 < threadsPerMirror) match
202 | {
203 | case Some((url, _)) => Some(url)
204 | case _ => None
205 | }
206 |
207 | /** Tell idle workers that download is ready */
208 | def notifyWorkers(): Unit =
209 | if (pendingDownloads.nonEmpty)
210 | {
211 | // TODO: Pick workers more intelligently, according to number of bytes downloaded by each worker
212 | // to encourage better spreading out of downloads over the cluster - better for distributed processing too.
213 | workers.foreach
214 | {
215 | case (_, WorkerState(ref, Idle)) => ref ! DownloadIsReady
216 | case _ => // busy
217 | }
218 | }
219 |
220 | // TODO cleanup old workers
221 | // TODO cleanup old downloadIds
222 | }
223 |
224 | object Master
225 | {
226 | val ResultsTopic = "results"
227 | val ProgressTopic = "progress"
228 | val General = "general"
229 |
230 | def props(workTimeout: FiniteDuration, mirrors: Seq[URL], threadsPerMirror: Int): Props =
231 | Props(classOf[Master], workTimeout, mirrors, threadsPerMirror)
232 |
233 | case class Ack(downloadId: String)
234 |
235 | private sealed trait WorkerStatus
236 | private case object Idle extends WorkerStatus
237 | private case class Busy(job: MirroredDownloadJob, deadline: Deadline) extends WorkerStatus
238 | private case class WorkerState(ref: ActorRef, status: WorkerStatus)
239 |
240 | private case object CleanupTick
241 |
242 | }
--------------------------------------------------------------------------------
/download/src/main/scala/org/dbpedia/extraction/dump/download/actors/Worker.scala:
--------------------------------------------------------------------------------
1 | package org.dbpedia.extraction.dump.download.actors
2 |
3 | import akka.actor._
4 | import scala.concurrent.duration._
5 | import java.util.UUID
6 | import akka.actor.SupervisorStrategy.{Stop, Restart}
7 | import org.dbpedia.extraction.dump.download.actors.message._
8 | import GeneralMessage.ShutdownCluster
9 | import scala.language.postfixOps
10 | import org.dbpedia.extraction.dump.download.actors.Worker.DownloadComplete
11 | import scala.Some
12 | import akka.actor.OneForOneStrategy
13 | import akka.contrib.pattern.ClusterClient.SendToAll
14 | import org.dbpedia.extraction.dump.download.actors.message.DownloadJob
15 | import akka.actor.Terminated
16 | import akka.actor.DeathPactException
17 |
18 | /**
19 | * Worker actor that runs on each worker node. This dispatches a download job to a child DownloadJobRunner actor
20 | * which manages download and a DownloadProgressTracker to send progress reports back to the Worker.
21 | *
22 | * @param clusterClient Akka ClusterClient that acts as a proxy to the master
23 | * @param downloadRunnerProps Props for the downloadRunner actor. See Worker.props()
24 | * @param maxDuplicateProgress Maximum number of consecutive duplicate progress read bytes to tolerate
25 | * @param registerInterval The worker registers itself with the master every registerInterval
26 | */
27 | class Worker(clusterClient: ActorRef, downloadRunnerProps: Props, maxDuplicateProgress: Int, registerInterval: FiniteDuration)
28 | extends Actor with ActorLogging
29 | {
30 |
31 | import MasterWorkerMessage._
32 | import WorkerProgressMessage._
33 | import context.dispatcher
34 |
35 | def scheduler = context.system.scheduler
36 |
37 | val workerId = UUID.randomUUID().toString
38 |
39 | // Register with the master at regular intervals.
40 | val registerTask = context.system.scheduler.schedule(0.seconds, registerInterval, clusterClient,
41 | SendToAll("/user/master/active", RegisterWorker(workerId)))
42 |
43 | val downloadRunner = context.watch(context.actorOf(downloadRunnerProps, "runner"))
44 |
45 | var currentDownloadId: Option[String] = None
46 |
47 | private var totalBytes = 0L
48 | private var currentBytes = 0L
49 | private var progressDelays = 0
50 |
51 | def downloadId: String = currentDownloadId match
52 | {
53 | case Some(workId) => workId
54 | case None => throw new IllegalStateException("Not working")
55 | }
56 |
57 | override def supervisorStrategy =
58 | OneForOneStrategy()
59 | {
60 | case _: ActorInitializationException => Stop
61 | case _: DeathPactException => Stop
62 | case _: Exception =>
63 | currentDownloadId foreach (workId => sendToMaster(DownloadFailed(workerId, workId)))
64 | context.become(idle)
65 | Restart
66 | }
67 |
68 | override def postStop(): Unit = registerTask.cancel()
69 |
70 | def receive = idle
71 |
72 | def idle: Receive =
73 | {
74 | case ShutdownCluster => // Master sends ShutdownCluster
75 | sendToMaster(RemoveWorker(workerId))
76 | scheduler.scheduleOnce(5 seconds)
77 | {
78 | registerTask.cancel()
79 | context.stop(downloadRunner)
80 | context.stop(self)
81 | context.system.shutdown()
82 | }
83 |
84 | case DownloadIsReady => // begin 3-way handshake to get download job from master
85 | sendToMaster(WorkerRequestsDownload(workerId))
86 |
87 | case job @ MirroredDownloadJob(_, DownloadJob(downloadId, _)) => // receive new download job
88 | log.info("Got download job: {}", job)
89 | currentDownloadId = Some(downloadId)
90 |
91 | // reset state variables for new download job
92 | currentBytes = 0
93 | totalBytes = 0
94 | progressDelays = 0
95 |
96 | downloadRunner ! job
97 | context.become(working)
98 | }
99 |
100 | def working: Receive =
101 | {
102 | case p @ ProgressStart(total) =>
103 | sendToMaster(ProgressReport(workerId, downloadId, p))
104 | if(totalBytes == 0) totalBytes = total
105 |
106 | case p @ Progress(bytes) =>
107 | sendToMaster(ProgressReport(workerId, downloadId, p))
108 |
109 | // check if number of bytes downloaded has increased.
110 | if(bytes > currentBytes)
111 | {
112 | currentBytes = bytes
113 | progressDelays = 0
114 | }
115 | else
116 | {
117 | progressDelays += 1
118 | }
119 |
120 | if(progressDelays > maxDuplicateProgress && totalBytes != bytes) // too many progress delays?
121 | {
122 | val delay = progressDelays * downloadRunnerProps.args(0).asInstanceOf[FiniteDuration].toSeconds
123 | log.info(s"Download progress of $currentDownloadId has stagnated. No update occurred in $delay seconds!")
124 | sendToMaster(DownloadFailed(workerId, currentDownloadId.get))
125 | }
126 |
127 | case DownloadComplete(output, bytes) => // DownloadJobRunner sends this upon completion
128 | log.info("Download is complete. Output file: {}. Total bytes: {}", output, bytes)
129 | sendToMaster(DownloadIsDone(workerId, downloadId, output, bytes))
130 | context.setReceiveTimeout(10.seconds)
131 | context.become(waitForDownloadIsDoneAck(output, bytes)) // Send news of finished download to Master and wait for ACK.
132 |
133 | case ShutdownCluster =>
134 | log.info("Yikes. Master told me to shutdown, while I'm downloading.")
135 |
136 | case _: MirroredDownloadJob =>
137 | log.info("Yikes. Master gave me a download job, while I'm downloading.")
138 | }
139 |
140 | def waitForDownloadIsDoneAck(outputFilePath: String, bytes: Long): Receive =
141 | {
142 | case Ack(id) if id == downloadId =>
143 | sendToMaster(WorkerRequestsDownload(workerId))
144 | context.setReceiveTimeout(Duration.Undefined)
145 | context.become(idle)
146 | case ReceiveTimeout =>
147 | log.info("No ACK from master, retrying")
148 | sendToMaster(DownloadIsDone(workerId, downloadId, outputFilePath, bytes))
149 | }
150 |
151 | override def unhandled(message: Any): Unit = message match
152 | {
153 | case Terminated(`downloadRunner`) => context.stop(self)
154 | case DownloadIsReady =>
155 | case _ => super.unhandled(message)
156 | }
157 |
158 | def sendToMaster(msg: Any): Unit =
159 | {
160 | clusterClient ! SendToAll("/user/master/active", msg)
161 | }
162 | }
163 |
164 | object Worker
165 | {
166 | def props(clusterClient: ActorRef, downloadRunnerProps: Props, maxDuplicateProgress: Int, registerInterval: FiniteDuration = 10.seconds): Props =
167 | Props(classOf[Worker], clusterClient, downloadRunnerProps, maxDuplicateProgress, registerInterval)
168 |
169 | case class DownloadComplete(outputFilePath: String, bytes: Long)
170 |
171 | }
--------------------------------------------------------------------------------
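A minimal sketch of starting a Worker on a slave node, assuming an Akka cluster-client setup in the style of this module; the receptionist path, host/port and the DownloadJobRunner props are placeholders, and the real wiring lives elsewhere in this module.

    import akka.actor.{ActorSelection, ActorSystem, Props}
    import akka.contrib.pattern.ClusterClient
    import org.dbpedia.extraction.dump.download.actors.Worker

    object WorkerSketch
    {
      def main(args: Array[String]): Unit =
      {
        val system = ActorSystem("Workers")
        // Contact point of the driver's actor system; host and port are placeholders.
        val initialContacts: Set[ActorSelection] =
          Set(system.actorSelection("akka.tcp://Workers@127.0.0.1:2552/user/receptionist"))
        val clusterClient = system.actorOf(ClusterClient.props(initialContacts), "clusterClient")

        val runnerProps: Props = ??? // Props of the DownloadJobRunner child actor (constructor not shown here)
        // max-duplicate-progress-reports=30 matches the sample configuration; registerInterval keeps its 10s default.
        system.actorOf(Worker.props(clusterClient, runnerProps, maxDuplicateProgress = 30), "worker")
      }
    }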
/download/src/main/scala/org/dbpedia/extraction/dump/download/actors/message/DownloadJob.scala:
--------------------------------------------------------------------------------
1 | package org.dbpedia.extraction.dump.download.actors.message
2 |
3 | import java.net.URL
4 | import org.dbpedia.extraction.dump.download.actors.message.WorkerProgressMessage.ProgressMessage
5 |
6 | /**
7 | * Download job used by the actor framework.
8 | *
9 | * @param downloadId Unique job ID
10 | * @param file URL information
11 | */
12 | case class DownloadJob(downloadId: String, file: DumpFile)
13 |
14 | /**
15 | * Download job wrapped along with the mirror to use for downloading.
16 | * This contains all the information needed by DownloadJobRunner to perform the job.
17 | *
18 | * @param baseUrl URL of the mirror to download from
19 | * @param job download job
20 | */
21 | case class MirroredDownloadJob(baseUrl: URL, job: DownloadJob)
22 |
23 | /**
24 | * Download information for single wiki dump file.
25 | *
26 | * @param baseDir Base directory on Hadoop file system (HDFS for distributed downloads)
27 | * @param wikiSuffix Wiki name suffix (e.g. wiki)
28 | * @param language Language wikiCode
29 | * @param date YYYYMMDD date string
30 | * @param fileName URL file name
31 | */
32 | case class DumpFile(baseDir: String, wikiSuffix: String, language: String, date: String, fileName: String)
33 |
34 | /**
35 | * Result of a completed download job.
36 | *
37 | * @param job MirroredDownloadJob
38 | * @param outputPath Output path name in scheme://path/fileName format
39 | * @param bytes Total bytes downloaded
40 | */
41 | case class DownloadResult(job: MirroredDownloadJob, outputPath: String, bytes: Long)
42 |
43 | /**
44 | * Progress reports published by Master.
45 | *
46 | * @param downloadId Unique job ID
47 | * @param progress Progress message
48 | */
49 | case class DownloadProgress(downloadId: String, progress: ProgressMessage)
--------------------------------------------------------------------------------
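A short illustration of how these case classes fit together: a DumpFile describes one dump file, a DownloadJob gives it a unique ID, and a MirroredDownloadJob pins it to a mirror. All values below are illustrative.

    import java.net.URL
    import java.util.UUID
    import org.dbpedia.extraction.dump.download.actors.message.{DownloadJob, DumpFile, MirroredDownloadJob}

    // Base directory, wiki suffix, language, date and file name are illustrative values.
    val file = DumpFile("/user/dbpedia/dumps", "wiki", "en", "20131120",
      "enwiki-20131120-pages-articles1.xml-p000000010p000010000.bz2")
    val job = DownloadJob(UUID.randomUUID().toString, file)
    val mirrored = MirroredDownloadJob(new URL("http://dumps.wikimedia.org/"), job)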
/download/src/main/scala/org/dbpedia/extraction/dump/download/actors/message/DownloaderProgressMessage.scala:
--------------------------------------------------------------------------------
1 | package org.dbpedia.extraction.dump.download.actors.message
2 |
3 | object DownloaderProgressMessage
4 | {
5 | // From Downloader or DownloadJobRunner to DownloadProgressTracker
6 | case class Read(bytesRead: Long)
7 | case class Start(totalBytes: Long) // totalBytes = total content length
8 | case object Stop
9 |
10 | // From DownloadProgressTracker to DownloadJobRunner
11 | case class ProgressEnd(bytes: Long)
12 | }
13 |
--------------------------------------------------------------------------------
/download/src/main/scala/org/dbpedia/extraction/dump/download/actors/message/GeneralMessage.scala:
--------------------------------------------------------------------------------
1 | package org.dbpedia.extraction.dump.download.actors.message
2 |
3 | object GeneralMessage
4 | {
5 | // This message is used by different actors to propagate a cluster shutdown.
6 | case object ShutdownCluster
7 |
8 | // This message is published by the master when the pending download queue is empty.
9 | case object MasterQueueEmpty
10 | }
11 |
--------------------------------------------------------------------------------
/download/src/main/scala/org/dbpedia/extraction/dump/download/actors/message/MasterWorkerMessage.scala:
--------------------------------------------------------------------------------
1 | package org.dbpedia.extraction.dump.download.actors.message
2 |
3 | import org.dbpedia.extraction.dump.download.actors.message.WorkerProgressMessage.ProgressMessage
4 |
5 | object MasterWorkerMessage
6 | {
7 | // Messages from Workers
8 | case class RegisterWorker(workerId: String)
9 | case class WorkerRequestsDownload(workerId: String)
10 | case class DownloadIsDone(workerId: String, downloadId: String, outputPath: String, bytes: Long)
11 | case class DownloadFailed(workerId: String, downloadId: String)
12 | case class ProgressReport(workerId: String, downloadId: String, progress: ProgressMessage) // progress = number of bytes read till now
13 | case class RemoveWorker(workerId: String)
14 |
15 | // Messages to Workers
16 | case object DownloadIsReady
17 | case class Ack(id: String)
18 | }
19 |
--------------------------------------------------------------------------------
/download/src/main/scala/org/dbpedia/extraction/dump/download/actors/message/WorkerProgressMessage.scala:
--------------------------------------------------------------------------------
1 | package org.dbpedia.extraction.dump.download.actors.message
2 |
3 | object WorkerProgressMessage
4 | {
5 | // DownloadProgressTracker to Worker
6 | trait ProgressMessage
7 | case class Progress(bytes: Long) extends ProgressMessage
8 | case class ProgressStart(bytes: Long) extends ProgressMessage
9 | }
10 |
--------------------------------------------------------------------------------
/download/src/main/scala/org/dbpedia/extraction/util/RemoteExecute.scala:
--------------------------------------------------------------------------------
1 | package org.dbpedia.extraction.util
2 |
3 | import com.jcraft.jsch.{JSch, JSchException, ChannelExec, Session}
4 | import java.io.IOException
5 |
6 | /**
7 | * Utility trait for creating an SSH session and executing remote commands.
8 | */
9 | trait RemoteExecute
10 | {
11 | val jsch = new JSch()
12 |
13 | def addIdentity(privateKeyPath: String, passphrase: String) = jsch.addIdentity(privateKeyPath, passphrase)
14 |
15 | def addIdentity(privateKeyPath: String) = jsch.addIdentity(privateKeyPath)
16 |
17 | def createSession(userName: String, host: String): Session =
18 | {
19 | val session = jsch.getSession(userName, host)
20 | session.setConfig("UserKnownHostsFile", "/dev/null")
21 | session.setConfig("CheckHostIP", "no")
22 | session.setConfig("StrictHostKeyChecking", "no")
23 | session.connect()
24 | session
25 | }
26 |
27 | def execute(session: Session, command: String): String =
28 | {
29 | val outputBuffer = new StringBuilder()
30 |
31 | val channel = session.openChannel("exec").asInstanceOf[ChannelExec]
32 | channel.setCommand(command)
33 | channel.setErrStream(System.err)
34 | channel.connect()
35 |
36 | val commandOutput = channel.getInputStream
37 | var readByte = commandOutput.read()
38 |
39 | while (readByte != -1) // InputStream.read() returns -1 at end of stream
40 | {
41 | outputBuffer.append(readByte.toChar) // append the character, not its integer value
42 | readByte = commandOutput.read()
43 | }
44 |
45 | channel.disconnect()
46 | outputBuffer.toString()
47 | }
48 | }
49 |
--------------------------------------------------------------------------------
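A minimal usage sketch of this trait, assuming key-based SSH access to the target host; the user, host and key path are placeholders.

    import org.dbpedia.extraction.util.RemoteExecute

    object RemoteUptime extends RemoteExecute
    {
      def main(args: Array[String]): Unit =
      {
        addIdentity("/home/user/.ssh/id_rsa")
        val session = createSession("user", "node1")
        try println(execute(session, "uptime"))
        finally session.disconnect()
      }
    }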
/download/src/test/resources/dist-download.properties:
--------------------------------------------------------------------------------
1 | # NOTE: format is not java.util.Properties, but org.dbpedia.extraction.dump.download.DownloadConfig
2 |
3 | #distconfig=/example/path/file.cfg
4 | # Path to existing distributed download configuration text file (UTF-8) whose lines contain arguments
5 | # in the format given here. Absolute or relative path. File paths in that config file will be interpreted
6 | # relative to the config file.
7 |
8 | #extraction-framework-home=/path/to/distributed-extraction-framework
9 | # This must be set to the absolute path to the distributed extraction framework (containing this module)
10 | # in all nodes. No default value is set.
11 |
12 | mirrors=http://dumps.wikimedia.org/,http://wikipedia.c3sl.ufpr.br/,http://ftp.fi.muni.cz/pub/wikimedia/,http://dumps.wikimedia.your.org/
13 | # List of mirrors to download from in the form of comma-separated URLs. Choose from the list of mirrors at:
14 | # http://meta.wikimedia.org/wiki/Mirroring_Wikimedia_project_XML_dumps#Current_Mirrors
15 | # Example: mirrors=http://dumps.wikimedia.org/,http://wikipedia.c3sl.ufpr.br,http://ftp.fi.muni.cz/pub/wikimedia/,http://dumps.wikimedia.your.org/
16 |
17 | threads-per-mirror=2
18 | # Number of simultaneous downloads from each mirror per slave node. Set to 2 by default.
19 |
20 | workers-per-slave=2
21 | # Number of workers to run per slave. This is set to 2 by default.
22 | # Setting it to (no. of mirrors) * threads-per-mirror is recommended for maximum parallelism (e.g. 4 mirrors * 2 threads = 8 workers).
23 | # On the other hand, if your whole cluster has only one public-facing IP it is better to set this to a low number like 1.
24 |
25 | progress-interval=2
26 | # Progress report time interval in secs - the driver node receives real-time progress reports for running downloads from the workers.
27 | # If a worker fails to send a progress report for the current download within the given timeout (the timeout is set to roughly
28 | # progress-interval + 2 seconds to be safe), the download job will be marked as failed and inserted back into the pending
29 | # download queue. This is 2 seconds by default.
30 |
31 | max-duplicate-progress-reports=30
32 | # Maximum number of consecutive duplicate progress read bytes to tolerate. The workers keep track of download progress;
33 | # if a download gets stuck consecutive progress reports will contain the same number of bytes downloaded. If this is set
34 | # to 30 (not recommended to go below that), the worker will declare a job as failed only after getting the same progress
35 | # report for 30 times. By default set to 30.
36 |
37 | local-temp-dir=/tmp
38 | # Local temporary directory on worker nodes. Each dump file/chunk is downloaded to this directory before being moved to
39 | # the configured Hadoop file system. This is /tmp by default.
40 |
41 | #private-key=/path/to/id_rsa
42 | # Optional identity file to connect to cluster nodes via SSH.
43 |
44 | #ssh-passphrase=passphrase
45 | # Optional passphrase for SSH private key.
46 |
47 | sequential-languages=false
48 | # If each language consists of multiple dump files (e.g. enwiki-latest-pages-articles1.xml-p000000010p000010000.bz2)
49 | # they are downloaded in parallel. Multiple languages are downloaded in parallel too, giving us 2 levels of
50 | # parallelism. If sequential-languages is set to true, one language is downloaded at a time; otherwise
51 | # all languages are downloaded in parallel.
52 |
53 | #hadoop-coresite-xml-path=/path/to/core-site.xml
54 | # Path to hadoop core-site.xml configuration file.
55 |
56 | #hadoop-hdfssite-xml-path=/path/to/hdfs-site.xml
57 | # Path to hadoop hdfs-site.xml configuration file.
58 |
59 | #hadoop-mapredsite-xml-path=/path/to/mapred-site.xml
60 | # Path to hadoop mapred-site.xml configuration file.
61 |
62 | master=127.0.0.1
63 | # Master node host.
64 |
65 | slaves=127.0.0.1
66 | # List of comma-separated slave hosts. Example: slaves=node1,node2,node3
67 |
68 | base-dir=/tmp/basedir
69 | # Replace by your target folder. If this is omitted here, it is read from the general configuration file if there is any.
70 |
71 | #join=akka.tcp://Workers@hostname:port
72 | # This variable needs to be specified when starting up a worker manually. Do not use this variable unless you know what you're
73 | # doing. The driver node automatically starts up workers on the slaves and takes care of this variable. Never set this variable
74 | # when starting up the master/driver.
--------------------------------------------------------------------------------
/download/src/test/resources/download.properties:
--------------------------------------------------------------------------------
1 | # NOTE: format is not java.util.Properties, but org.dbpedia.extraction.dump.download.DownloadConfig
2 |
3 | # Default download server. It lists mirrors which may be faster.
4 | base-url=http://dumps.wikimedia.org/
5 |
6 | # Replace by your target folder.
7 | base-dir=/home/gonephishing/dbpedia-extraction/distributed-extraction-framework/dumps/files
8 |
9 | # This setting is recommended for large languages that have part files (e.g. en, fr). See below. Replace xx/yy by your language.
10 | #download=xx,yy:@pages-articles\d+\.xml.*\.bz2
11 | download=en:pages-articles1.xml-p000000010p000010000.bz2
12 |
13 | # This setting should be provided for small languages that have no part files (e.g. li)
14 | #download=xx,yy:pages-articles.xml.bz2
15 |
16 | # You may provide multiple "download=" lines for different types of languages, just like above.
17 |
18 | ###### Download part files ######
19 | #
20 | # Please make sure that the regex actually matches the format used for xx dumps
21 | # by checking http://dumps.wikimedia.org/xxwiki/yyyymmdd
22 | #
23 | # Example:
24 | # enwiki => enwiki-20131120-pages-articles1.xml-p000000010p000010000.bz2 hence @pages-articles\d+\.xml-p\d+p\d+\.bz2 matches
25 | # frwiki => frwiki-20131120-pages-articles1.xml.bz2 hence @pages-articles\d+\.xml\.bz2 matches (the previous regex does not!)
26 | #
27 | # NOTE: @pages-articles\d+\.xml.*\.bz2 is especially recommended when using the distributed downloader because it captures both
28 | # the above types and exploits maximum parallelism by allowing multiple part files to be downloaded and processed simultaneously.
29 | #
30 | # Remember that certain languages have small dumps and therefore no part files at all. They need to be handled with only
31 | # pages-articles.xml.bz2. Example with both small and large languages (setting download multiple times works like appending; so
32 | # adding both download's below is perfectly valid):
33 | #
34 | # download=en,fr:@pages-articles\d+\.xml.*\.bz2
35 | # download=li,bn,ilo:pages-articles.xml.bz2
36 | #
37 | # commonswiki => it does not have part files! This is true for other wikis as well. In this case xx:pages-articles.xml.bz2
38 | # should be used (e.g. commons:pages-articles.xml.bz2 or cowiki:pages-articles.xml.bz2)
39 | #
40 | # download=xx:@pages-articles\d+\.xml-p\d+p\d+\.bz2
41 | # download=xx:@pages-articles\d+\.xml.*\.bz2
42 |
43 | # Only needed for the ImageExtractor
44 | # download=commons:pages-articles.xml.bz2
45 |
46 | # Unzip files while downloading? Not necessary, extraction will unzip on the fly. Let's save space.
47 | unzip=false
48 |
49 | # Sometimes connecting to the server fails, so we try up to five times with one-second pauses.
50 | retry-max=5
51 | retry-millis=1000
52 |
--------------------------------------------------------------------------------
/extraction/pom.xml:
--------------------------------------------------------------------------------
1 |
2 |
5 |
6 | distributed-extraction
7 | org.dbpedia
8 | 4.1-SNAPSHOT
9 |
10 | 4.0.0
11 |
12 | org.dbpedia.distributed-extraction
13 | extraction
14 | 4.1-SNAPSHOT
15 | DBpedia Distributed Dump Extractor
16 |
17 |
18 |
19 |
20 |
21 | org.apache.maven.plugins
22 | maven-shade-plugin
23 | 1.7
24 |
25 |
26 | package
27 |
28 | shade
29 |
30 |
31 |
32 |
33 |
34 |
35 | net.alchim31.maven
36 | scala-maven-plugin
37 |
38 |
39 |
40 |
41 | seq-extraction
42 | org.dbpedia.extraction.dump.extract.Extraction
43 |
44 |
45 | -server
46 |
58 |
59 |
60 |
61 |
62 | extraction
63 | org.dbpedia.extraction.dump.extract.DistExtraction
64 |
65 |
66 | -server
67 |
79 |
80 |
81 |
82 |
83 |
84 |
85 |
86 |
87 |
88 |
89 | org.dbpedia.distributed-extraction
90 | common
91 | 4.1-SNAPSHOT
92 |
93 |
94 |
95 | org.dbpedia.extraction
96 | core
97 | 4.1
98 |
99 |
100 |
101 | org.dbpedia.extraction
102 | dump
103 | 4.1
104 |
105 |
106 |
107 | org.dbpedia.extraction
108 | scripts
109 | 4.1
110 |
111 |
112 |
113 | org.apache.spark
114 | spark-core_2.11
115 | ${spark.version}
116 | provided
117 |
118 |
119 |
120 | org.apache.hadoop
121 | hadoop-client
122 | ${hadoop.version}
123 |
124 |
125 |
126 | org.apache.hadoop
127 | hadoop-common
128 | ${hadoop.version}
129 |
130 |
131 |
132 | org.scalatest
133 | scalatest_2.11
134 | test
135 |
136 |
137 |
138 | junit
139 | junit
140 | 4.8.2
141 | test
142 |
143 |
144 |
145 |
146 |
--------------------------------------------------------------------------------
/extraction/src/main/scala/org/apache/spark/ui/jobs/DBpediaJobProgressListener.scala:
--------------------------------------------------------------------------------
1 | package org.apache.spark.ui.jobs
2 |
3 | import org.apache.spark.{Logging, SparkConf}
4 | import org.apache.spark.scheduler._
5 | import org.apache.spark.scheduler.SparkListenerTaskEnd
6 | import org.apache.spark.scheduler.SparkListenerJobEnd
7 | import org.apache.spark.scheduler.SparkListenerStageSubmitted
8 | import org.apache.spark.scheduler.SparkListenerStageCompleted
9 | import org.apache.spark.scheduler.SparkListenerTaskStart
10 | import org.apache.spark.scheduler.SparkListenerJobStart
11 | import org.dbpedia.extraction.util.StringUtils
12 | import scala.collection.mutable
13 |
14 | /**
15 | * SparkListener implementation that provides real-time logging for jobs, tasks and stages in a
16 | * friendly way, omitting most of the details available through Spark's default logging
17 | * system.
18 | *
19 | * This is in the org.apache.spark.ui.jobs package because it needs to extend
20 | * org.apache.spark.ui.jobs.JobProgressListener which is private[spark].
21 | */
22 | class DBpediaJobProgressListener(sc: SparkConf) extends JobProgressListener(sc) with Logging
23 | {
24 | /**
25 | * The time when this class was created (usually along with the SparkContext).
26 | * Milliseconds since midnight, January 1, 1970 UTC.
27 | */
28 | val startTime = System.currentTimeMillis()
29 |
30 | val stageNumTasks = mutable.Map[Int, Int]() // Maps stageId to number of tasks
31 |
32 | override def onStageSubmitted(stageSubmitted: SparkListenerStageSubmitted): Unit =
33 | {
34 | super.onStageSubmitted(stageSubmitted)
35 | val stage = stageSubmitted.stageInfo
36 | val numTasks = stage.numTasks
37 | stageNumTasks.synchronized(stageNumTasks(stage.stageId) = numTasks)
38 | val time = prettyTime(stage.submissionTime.getOrElse(startTime))
39 | logInfo("Stage #%d: Starting stage %s with %d tasks at %s".format(stage.stageId, stage.name, numTasks, time))
40 | }
41 |
42 | override def onStageCompleted(stageCompleted: SparkListenerStageCompleted): Unit =
43 | {
44 | super.onStageCompleted(stageCompleted)
45 | val stage = stageCompleted.stageInfo
46 | val time = prettyTime(stage.completionTime.getOrElse(startTime))
47 | logInfo("Stage #%d: Finished stage %s at %s".format(stage.stageId, stage.name, time))
48 | }
49 |
50 | override def onTaskStart(taskStart: SparkListenerTaskStart): Unit =
51 | {
52 | super.onTaskStart(taskStart)
53 | val executor = taskStart.taskInfo.executorId
54 | val host = taskStart.taskInfo.host
55 | val time = prettyTime(taskStart.taskInfo.launchTime)
56 | val taskId = taskStart.taskInfo.taskId
57 | val stageId = taskStart.stageId
58 | // Approximation: number of stages tracked so far (per-stage TaskInfos are not directly exposed here)
59 | val numTasks = this.stageIdToInfo.size
60 | //val numTasks = this.stageIdToTaskInfos(stageId).size
61 | logInfo("Stage #%d: Started task #%d on host %s, executor %s at %s. Total tasks submitted: %d".format(stageId, taskId, host, executor, time, numTasks))
62 | }
63 |
64 | override def onTaskEnd(taskEnd: SparkListenerTaskEnd): Unit =
65 | {
66 | super.onTaskEnd(taskEnd)
67 | val time = prettyTime(taskEnd.taskInfo.finishTime)
68 | val taskId = taskEnd.taskInfo.taskId
69 | val stageId = taskEnd.stageId
70 | val totalNumTasks = stageNumTasks(taskEnd.stageId)
72 | // Approximation: number of stages tracked so far (per-stage TaskInfos are not directly exposed here)
73 | val numTasks = this.stageIdToInfo.size
74 | //val numTasks = this.stageIdToTaskInfos(stageId).size
75 | // try/catch kept from the per-stage lookups below, which could throw NoSuchElementException.
76 | val finished = try { this.numCompletedStages } catch { case ex: NoSuchElementException => 0 }
77 | val failed = try { this.numFailedStages } catch { case ex: NoSuchElementException => 0 }
77 | //val finished = try { this.stageIdToTasksComplete(stageId) } catch { case ex: NoSuchElementException => 0 }
78 | //val failed = try { this.stageIdToTasksFailed(stageId) } catch { case ex: NoSuchElementException => 0 }
79 | logInfo("Stage #%d: Finished task #%d at %s. Completed: %d/%d Failed: %d/%d Total Progress: %d/%d".format(stageId, taskId, time, finished, numTasks, failed, numTasks, finished, totalNumTasks))
80 | }
81 |
82 | override def onJobStart(jobStart: SparkListenerJobStart): Unit =
83 | {
84 | super.onJobStart(jobStart)
85 | logInfo("Started job #" + jobStart.jobId)
86 | }
87 |
88 | override def onJobEnd(jobEnd: SparkListenerJobEnd): Unit =
89 | {
90 | super.onJobEnd(jobEnd)
91 | logInfo("Finished job #" + jobEnd.jobId)
92 | }
93 |
94 | override def onTaskGettingResult(taskGettingResult: SparkListenerTaskGettingResult): Unit =
95 | {
96 | super.onTaskGettingResult(taskGettingResult)
97 | }
98 |
99 | private def prettyTime(time: Long) = StringUtils.prettyMillis(time - startTime)
100 | }
--------------------------------------------------------------------------------
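The framework's SparkUtils presumably registers this listener when it builds the SparkContext; a standalone registration might look like the sketch below (Spark 1.x API assumed; the app name and master are placeholders).

    import org.apache.spark.{SparkConf, SparkContext}
    import org.apache.spark.ui.jobs.DBpediaJobProgressListener

    val conf = new SparkConf().setAppName("dbpedia-distributed-extraction-framework").setMaster("local[4]")
    val sc = new SparkContext(conf)
    // Attach the listener so stage/task/job events are logged in the condensed format above.
    sc.addSparkListener(new DBpediaJobProgressListener(conf))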
/extraction/src/main/scala/org/dbpedia/extraction/destinations/DistDeduplicatingWriterDestination.scala:
--------------------------------------------------------------------------------
1 | package org.dbpedia.extraction.destinations
2 |
3 | import org.apache.hadoop.fs.Path
4 | import org.apache.hadoop.conf.Configuration
5 | import org.apache.spark.rdd.RDD
6 | import org.apache.hadoop.io.Text
7 | import org.dbpedia.extraction.spark.io.QuadSeqWritable
8 | import org.dbpedia.extraction.spark.io.output.DBpediaCompositeOutputFormat
9 | import org.apache.spark.SparkContext._
10 |
11 | /**
12 | * Destination where RDF graphs are deduplicated and written to a Hadoop Path.
13 | *
14 | * @param path Path used by DBpediaCompositeOutputFormat to write outputs
15 | * @param hadoopConfiguration Hadoop Configuration object
16 | */
17 | class DistDeduplicatingWriterDestination(path: Path, hadoopConfiguration: Configuration) extends DistDestination
18 | {
19 | override def open() = ()
20 |
21 | /**
22 | * Writes RDD of quads (after extracting unique quads) to path using DBpediaCompositeOutputFormat.
23 | *
24 | * @param rdd RDD[ Seq[Quad] ]
25 | */
26 | override def write(rdd: RDD[Seq[Quad]])
27 | {
28 | rdd.flatMap
29 | {
30 | quads =>
31 | quads.distinct.groupBy(quad => new Text(quad.dataset)).toSeq.map
32 | {
33 | case (key: Text, quads: Seq[Quad]) => (key, new QuadSeqWritable(quads))
34 | }
35 | }.saveAsNewAPIHadoopFile(path.toString,
36 | classOf[Text],
37 | classOf[QuadSeqWritable],
38 | classOf[DBpediaCompositeOutputFormat],
39 | hadoopConfiguration)
40 | }
41 |
42 | override def close() = ()
43 | }
44 |
--------------------------------------------------------------------------------
/extraction/src/main/scala/org/dbpedia/extraction/destinations/DistDestination.scala:
--------------------------------------------------------------------------------
1 | package org.dbpedia.extraction.destinations
2 |
3 | import org.apache.spark.rdd.RDD
4 |
5 | /**
6 | * A distributed destination for RDF quads.
7 | */
8 | trait DistDestination
9 | {
10 | /**
11 | * Opens this destination. This method should only be called once during the lifetime
12 | * of a destination, and it should not be called concurrently with other methods of this class.
13 | */
14 | def open(): Unit
15 |
16 | /**
17 | * Writes RDD of quads to this destination.
18 | *
19 | * @param rdd RDD[ Seq[Quad] ]
20 | */
21 | def write(rdd: RDD[Seq[Quad]]): Unit
22 |
23 | /**
24 | * Closes this destination. This method should only be called once during the lifetime
25 | * of a destination, and it should not be called concurrently with other methods of this class.
26 | */
27 | def close(): Unit
28 | }
29 |
--------------------------------------------------------------------------------
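As an illustration of the open/write/close contract, a toy DistDestination that only counts quads instead of writing them (not part of the framework):

    import org.apache.spark.rdd.RDD
    import org.dbpedia.extraction.destinations.{DistDestination, Quad}

    class QuadCountingDestination extends DistDestination
    {
      private var count = 0L

      override def open(): Unit = count = 0L

      // Triggers a Spark action; each element of the RDD is the Seq[Quad] extracted from one page.
      override def write(rdd: RDD[Seq[Quad]]): Unit =
        count = rdd.map(_.size.toLong).fold(0L)(_ + _)

      override def close(): Unit = println("Total quads extracted: " + count)
    }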
/extraction/src/main/scala/org/dbpedia/extraction/destinations/DistMarkerDestination.scala:
--------------------------------------------------------------------------------
1 | package org.dbpedia.extraction.destinations
2 |
3 | import org.dbpedia.extraction.util.FileLike
4 | import java.io.IOException
5 |
6 | /**
7 | * MarkerDestination that wraps a DistDestination. The code has been adapted from MarkerDestination.
8 | *
9 | * Handles a marker file that signals that the extraction is either running ('start mode')
10 | * or finished ('end mode').
11 | *
12 | * In 'start mode', the file is created before the extraction starts (it must not already exist)
13 | * and deleted after the extraction ends.
14 | *
15 | * In 'end mode', the file is deleted before the extraction starts (if it already exists)
16 | * and re-created after the extraction ends.
17 | *
18 | * @param file marker file
19 | * @param start 'start mode' if true, 'end mode' if false.
20 | */
21 | class DistMarkerDestination(destination: DistDestination, file: FileLike[_], start: Boolean)
22 | extends DistWrapperDestination(destination)
23 | {
24 | override def open(): Unit =
25 | {
26 | if (start) create() else delete()
27 | super.open()
28 | }
29 |
30 | override def close(): Unit =
31 | {
32 | super.close()
33 | if (!start) create() else delete()
34 | }
35 |
36 | private def create(): Unit =
37 | {
38 | if (file.exists) throw new IOException("file '" + file + "' already exists")
39 | file.outputStream().close()
40 | }
41 |
42 | private def delete(): Unit =
43 | {
44 | if (file.exists) file.delete()
45 | }
46 | }
--------------------------------------------------------------------------------
/extraction/src/main/scala/org/dbpedia/extraction/destinations/DistWrapperDestination.scala:
--------------------------------------------------------------------------------
1 | package org.dbpedia.extraction.destinations
2 |
3 | import org.apache.spark.rdd.RDD
4 |
5 | /**
6 | * Base class for DistDestination objects that forward most calls to another destination.
7 | */
8 | abstract class DistWrapperDestination(destination: DistDestination) extends DistDestination
9 | {
10 | override def open() = destination.open()
11 |
12 | def write(rdd: RDD[Seq[Quad]]) = destination.write(rdd)
13 |
14 | override def close() = destination.close()
15 | }
--------------------------------------------------------------------------------
/extraction/src/main/scala/org/dbpedia/extraction/dump/extract/DistConfig.scala:
--------------------------------------------------------------------------------
1 | package org.dbpedia.extraction.dump.extract
2 |
3 | import java.util.Properties
4 | import scala.collection.JavaConversions.asScalaSet
5 | import org.dbpedia.extraction.util.ConfigUtils.getValue
6 | import java.io.File
7 | import org.apache.spark.storage.StorageLevel
8 | import java.net.URI
9 | import org.apache.log4j.Level
10 | import org.dbpedia.extraction.util.HadoopConfigurable
11 | import org.apache.hadoop.fs.Path
12 |
13 | /**
14 | * Class for distributed configuration. Delegates general stuff except directory/file properties to Config.
15 | *
16 | * Note that dumpDir, ontologyFile and mappingsDir are first checked in distConfigProps;
17 | * if not found they're checked in extractionConfigProps.
18 | *
19 | * @param distConfigProps Distributed extraction configuration properties
20 | * @param extractionConfigProps General extraction framework configuration properties
21 | * @see Config
22 | */
23 | class DistConfig(distConfigProps: Properties, extractionConfigProps: Properties, val extractionConfigFile: URI) extends HadoopConfigurable
24 | {
25 | private val extractionConfig = new ExtractionConfig()
26 |
27 | /** It is recommended that spark-home and spark-master are explicitly provided. */
28 | val sparkHome = distConfigProps.getProperty("spark-home", sys.env.get("SPARK_HOME").getOrElse(""))
29 |
30 | /** By default assume the master is running locally; use 4 cores */
31 | val sparkMaster = distConfigProps.getProperty("spark-master", "local[4]")
32 |
33 | /** Shows up on Spark Web UI */
34 | val sparkAppName = distConfigProps.getProperty("spark-appname", "dbpedia-distributed-extraction-framework")
35 |
36 | /**
37 | * The StorageLevel to be used when calling RDD.persist() unless otherwise specified. Choose any of these:
38 | * MEMORY_ONLY
39 | * MEMORY_AND_DISK
40 | * MEMORY_ONLY_SER
41 | * MEMORY_AND_DISK_SER
42 | * DISK_ONLY
43 | * MEMORY_ONLY_2, MEMORY_AND_DISK_2 etc.
44 | *
45 | * By default it is set to MEMORY_AND_DISK_SER
46 | *
47 | * @see org.apache.spark.storage.StorageLevel
48 | */
49 | val sparkStorageLevel = Option(
50 | getValue(distConfigProps, "spark-storage-level", required = false)
51 | {
52 | level => StorageLevel.getClass.getDeclaredMethod(level).invoke(StorageLevel).asInstanceOf[StorageLevel]
53 | }
54 | ).getOrElse(StorageLevel.MEMORY_AND_DISK_SER)
55 |
56 | /** Map of optional spark configuration properties. See http://spark.apache.org/docs/latest/configuration.html */
57 | val sparkProperties = distConfigProps.stringPropertyNames().filter(_.startsWith("spark.")).map(x => (x, distConfigProps.getProperty(x))).toMap
58 |
59 | /** Path to hadoop core-site.xml */
60 | override protected val hadoopCoreConf = distConfigProps.getProperty("hadoop-coresite-xml-path")
61 |
62 | /** Path to hadoop hdfs-site.xml */
63 | override protected val hadoopHdfsConf = distConfigProps.getProperty("hadoop-hdfssite-xml-path")
64 |
65 | /** Path to hadoop mapred-site.xml */
66 | override protected val hadoopMapredConf = distConfigProps.getProperty("hadoop-mapredsite-xml-path")
67 |
68 | /** This is used for setting log levels for "org.apache", "spark", "org.eclipse.jetty" and "akka" using
69 | * SparkUtils.setLogLevels(). It is WARN by default.
70 | */
71 | val sparkLogLevel = Level.toLevel(distConfigProps.getProperty("logging-level"), Level.WARN)
72 |
73 | /**
74 | * Number of threads to use in the ExecutionContext while calling DistExtractionJob.run() on multiple
75 | * extraction jobs in parallel.
76 | *
77 | * Note that these threads on the driver node do not perform any heavy work except for executing
78 | * DistExtractionJob.run() which submits the respective Spark job to the Spark master and waits
79 | * for the job to finish.
80 | *
81 | * By default it is set to Integer.MAX_VALUE so that all extraction jobs are submitted to Spark master
82 | * simultaneously, which uses the configured scheduling mechanism to execute the jobs on the cluster.
83 | */
84 | val extractionJobThreads = distConfigProps.getProperty("extraction-job-threads", Integer.MAX_VALUE.toString).toInt
85 |
86 | /** Whether output files should be overwritten or not (true/false). This is true by default. */
87 | val overwriteOutput = distConfigProps.getProperty("overwrite-output", "true").toBoolean
88 |
89 | /**
90 | * Whether the intermediate RDD[WikiPage] should be cached to Hadoop's filesystem (true/false).
91 | * This is false by default.
92 | *
93 | * Performance implications:
94 | * 1. Caching will make further extractions over the same dump much faster.
95 | * 2. Caching will force early evaluation of the RDD and will cause some delay before extraction.
96 | *
97 | * If you are not planning on repeated extractions over the same dump it is best to leave this as it is.
98 | */
99 | val cacheWikiPageRDD = distConfigProps.getProperty("cache-wikipages", "false").toBoolean
100 |
101 | /** Dump directory */
102 | val dumpDir = getPath("base-dir", pathMustExist = true)
103 |
104 | /** Local ontology file, downloaded for speed and reproducibility */
105 | val ontologyFile = getPath("ontology", pathMustExist = false)
106 |
107 | /** Local mappings files, downloaded for speed and reproducibility */
108 | val mappingsDir = getPath("mappings", pathMustExist = false)
109 |
110 | val requireComplete = extractionConfig.requireComplete
111 |
112 | val source = extractionConfig.source
113 |
114 | val disambiguations = extractionConfig.disambiguations
115 |
116 | val wikiName = extractionConfig.wikiName
117 |
118 | val parser = extractionConfig.parser
119 |
120 | val formats = extractionConfig.formats
121 |
122 | val extractorClasses = extractionConfig.extractorClasses
123 |
124 | val namespaces = extractionConfig.namespaces
125 |
126 | /**
127 | * Creates a Path from the given property (null if the property is absent) and wraps it in an Option.
128 | * This method first checks the distributed config properties, then the general extraction config properties.
129 | *
130 | * @param property String property key
131 | * @param pathMustExist Boolean to ensure that the Path, if obtained, actually exists.
132 | * @throws RuntimeException if the property is defined but the path does not exist
133 | * @return Option wrapping the obtained Path
134 | */
135 | def getPath(property: String, pathMustExist: Boolean): Option[Path] =
136 | {
137 | val somePath = Option({
138 | val distProp = getValue(distConfigProps, property, required = false)(new Path(_))
139 | if(distProp != null)
140 | {
141 | // If property exists in distributed config file return it.
142 | distProp
143 | }
144 | else
145 | {
146 | // Or else, try the extraction config file - returns either null or a Path.
147 | getValue(extractionConfigProps, property, required = false)(new Path(_))
148 | }
149 | })
150 |
151 | checkPathExists(somePath, pathMustExist)
152 | }
153 |
154 | /**
155 | * Custom Config subclass that makes the File-based variables null.
156 | *
157 | * The distributed extraction framework should only work with Paths. Initialization operations on non-existent
158 | * Files may cause errors, and are not required anyway.
159 | */
160 | private class ExtractionConfig extends Config(extractionConfigProps)
161 | {
162 | override lazy val dumpDir: File = null
163 | override lazy val ontologyFile: File = null
164 | override lazy val mappingsDir: File = null
165 | }
166 |
167 | }
168 |
--------------------------------------------------------------------------------
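A minimal sketch of building a DistConfig, mirroring what DistExtraction.main does further below; the two property file names are placeholders.

    import java.io.File
    import org.dbpedia.extraction.dump.extract.DistConfig
    import org.dbpedia.extraction.util.ConfigUtils

    // Load the general extraction config and the distributed config, then combine them.
    val extractionConfigProps = ConfigUtils.loadConfig("config.properties", "UTF-8")
    val distConfigProps = ConfigUtils.loadConfig("dist-config.properties", "UTF-8")
    val distConfig = new DistConfig(distConfigProps, extractionConfigProps, new File("config.properties").toURI)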
/extraction/src/main/scala/org/dbpedia/extraction/dump/extract/DistConfigLoader.scala:
--------------------------------------------------------------------------------
1 | package org.dbpedia.extraction.dump.extract
2 |
3 | import org.dbpedia.extraction.destinations._
4 | import org.dbpedia.extraction.mappings._
5 | import org.dbpedia.extraction.ontology.io.OntologyReader
6 | import org.dbpedia.extraction.sources.{Source, WikiPage, XMLSource, WikiSource}
7 | import org.dbpedia.extraction.util._
8 | import org.dbpedia.extraction.util.RichHadoopPath.wrapPath
9 | import org.dbpedia.extraction.wikiparser.Namespace
10 | import java.io._
11 | import java.net.URL
12 | import java.util.logging.{Level, Logger}
13 | import org.apache.spark.rdd.RDD
14 | import org.dbpedia.extraction.dump.download.Download
15 | import org.apache.hadoop.conf.Configuration
16 | import org.apache.hadoop.io.LongWritable
17 | import org.dbpedia.extraction.spark.io.WikiPageWritable
18 | import org.apache.hadoop.mapreduce.Job
19 | import org.apache.hadoop.mapreduce.lib.input.FileInputFormat
20 | import org.apache.hadoop.fs.Path
21 | import org.apache.spark.SparkContext
22 | import org.dbpedia.extraction.spark.io.input.DBpediaWikiPageInputFormat
23 |
24 | /**
25 | * Loads the dump extraction configuration.
26 | *
27 | * This class configures Spark and sets up the extractors to run using Spark
28 | *
29 | * TODO: get rid of all config file parsers, use Spring
30 | * TODO: Inherit ConfigLoader methods and get rid of redundant code
31 | *
32 | * @param config DistConfig
33 | */
34 | class DistConfigLoader(config: DistConfig, sparkContext: SparkContext)
35 | {
36 | private val logger = Logger.getLogger(classOf[DistConfigLoader].getName)
37 | private val CONFIG_PROPERTIES = "config.properties"
38 |
39 | /**
40 | * Loads the configuration and creates extraction jobs for all configured languages.
41 | *
42 | * @return Non-strict Traversable over all configured extraction jobs, i.e. an extraction job will not be created until it is explicitly requested.
43 | */
44 | def getExtractionJobs(): Traversable[DistExtractionJob] =
45 | {
46 | // Create a non-strict view of the extraction jobs
47 | // non-strict because we want to create the extraction job when it is needed, not earlier
48 | config.extractorClasses.view.map(e => createExtractionJob(e._1, e._2))
49 | }
50 |
51 | /**
52 | * Creates an extraction job for a specific language.
53 | */
54 | private def createExtractionJob(lang: Language, extractorClasses: Seq[Class[_ <: Extractor[_]]]): DistExtractionJob =
55 | {
56 | val dumpDir = config.dumpDir.get
57 |
58 | // Finder[Path] works with Hadoop's FileSystem class - operates on HDFS, or the local file system depending
59 | // upon whether we are running in local mode or distributed/cluster mode.
60 | val finder = new Finder[Path](dumpDir, lang, config.wikiName)
61 | val date = latestDate(finder)
62 |
63 | // Add input sources
64 | val job = Job.getInstance(hadoopConfiguration)
65 | for (file <- files(config.source, finder, date))
66 | FileInputFormat.addInputPath(job, file)
67 | hadoopConfiguration = job.getConfiguration // update Configuration
68 |
69 | // Add the extraction configuration file to distributed cache.
70 | // It will be needed in DBpediaCompositeOutputFormat for getting the Formatters.
71 | val configPropertiesDCPath = finder.wikiDir.resolve(CONFIG_PROPERTIES) // Path to which the config properties file is copied
72 | val fs = configPropertiesDCPath.getFileSystem(hadoopConfiguration)
73 | fs.copyFromLocalFile(false, true, new Path(config.extractionConfigFile), configPropertiesDCPath) // Copy local file to Hadoop file system
74 | job.addCacheFile(configPropertiesDCPath.toUri) // Add to distributed cache
75 |
76 | // Setup config variables needed by DBpediaWikiPageInputFormat and DBpediaCompositeOutputFormat.
77 | hadoopConfiguration.set("dbpedia.config.properties", configPropertiesDCPath.toString)
78 | hadoopConfiguration.set("dbpedia.wiki.name", config.wikiName)
79 | hadoopConfiguration.set("dbpedia.wiki.language.wikicode", lang.wikiCode)
80 | hadoopConfiguration.set("dbpedia.wiki.date", date)
81 | hadoopConfiguration.setBoolean("dbpedia.output.overwrite", config.overwriteOutput)
82 |
83 | // Getting the WikiPages from local on-disk cache saves processing time.
84 | val cache = finder.file(date, "articles-rdd")
85 | lazy val articlesRDD: RDD[WikiPage] = try
86 | {
87 | if (!cache.exists)
88 | throw new IOException("Cache file " + cache.getSchemeWithFileName + " does not exist.")
89 | logger.info("Loading articles from cache file " + cache.getSchemeWithFileName)
90 | val loaded = DistIOUtils.loadRDD(sparkContext, classOf[WikiPage], cache)
91 | logger.info("WikiPages loaded from cache file " + cache.getSchemeWithFileName)
92 | loaded
93 | }
94 | catch
95 | {
96 | case ex: Exception =>
97 | {
98 | logger.log(Level.INFO, "Will read from wiki dump file for " + lang.wikiCode + " wiki, could not load cache file '" + cache.getSchemeWithFileName + "': " + ex)
99 |
100 | // Create RDD with WikiPageWritable elements.
101 | val rawArticlesRDD: RDD[(LongWritable, WikiPageWritable)] =
102 | sparkContext.newAPIHadoopRDD(hadoopConfiguration, classOf[DBpediaWikiPageInputFormat], classOf[LongWritable], classOf[WikiPageWritable])
103 |
104 | // Unwrap WikiPages and filter unnecessary pages
105 | val newRdd = rawArticlesRDD.map(_._2.get).filter
106 | {
107 | page =>
108 | page.title.namespace == Namespace.Main ||
109 | page.title.namespace == Namespace.File ||
110 | page.title.namespace == Namespace.Category ||
111 | page.title.namespace == Namespace.Template
112 | }.persist(config.sparkStorageLevel)
113 |
114 | if (config.cacheWikiPageRDD)
115 | {
116 | DistIOUtils.saveRDD(newRdd, cache)
117 | logger.info("Parsed WikiPages written to cache file " + cache.getSchemeWithFileName)
118 | }
119 |
120 | newRdd
121 | }
122 | }
123 |
124 | val _ontology =
125 | {
126 | val ontologySource = config.ontologyFile match
127 | {
128 | case Some(ontologyFile) if ontologyFile.isFile =>
129 | // Is ontologyFile defined and it is indeed a file?
130 | XMLSource.fromReader(reader(ontologyFile), Language.Mappings)
131 | case _ =>
132 | val namespaces = Set(Namespace.OntologyClass, Namespace.OntologyProperty)
133 | val url = new URL(Language.Mappings.apiUri)
134 | val language = Language.Mappings
135 | WikiSource.fromNamespaces(namespaces, url, language)
136 | }
137 |
138 | new OntologyReader().read(ontologySource)
139 | }
140 |
141 | val _commonsSource =
142 | {
143 | try
144 | {
145 | val finder = new Finder[Path](config.dumpDir.get, Language("commons"), config.wikiName)
146 | val date = latestDate(finder)
147 | XMLSource.fromReaders(readers(config.source, finder, date), Language.Commons, _.namespace == Namespace.File)
148 | }
149 | catch
150 | {
151 | case ex: Exception =>
152 | logger.info("Could not load commons source - error: " + ex.getMessage)
153 | null
154 | }
155 | }
156 |
157 | val _disambiguations =
158 | {
159 | val cache = finder.file(date, "disambiguations-ids.obj")
160 | try
161 | {
162 | DistDisambiguations.load(reader(finder.file(date, config.disambiguations)), cache, lang)
163 | } catch
164 | {
165 | case ex: Exception =>
166 | logger.info("Could not load disambiguations - error: " + ex.getMessage)
167 | Disambiguations.empty()
168 | }
169 | }
170 |
171 | val redirectsCache = finder.file(date, "template-redirects.obj")
172 | lazy val _redirects = DistRedirects.load(articlesRDD, redirectsCache, lang) // lazy because it will be evaluated in DistExtractionJob.run()
173 |
174 | lazy val context = new DumpExtractionContext
175 | {
176 | def ontology = _ontology
177 |
178 | def commonsSource = _commonsSource
179 |
180 | def language = lang
181 |
182 | private lazy val _mappingPageSource =
183 | {
184 | val namespace = Namespace.mappings(language)
185 |
186 | config.mappingsDir match
187 | {
188 | case Some(mappingsDir) if mappingsDir.isDirectory =>
189 | // Is mappingsDir defined and it is indeed a directory?
190 | val path = new Path(mappingsDir, namespace.name(Language.Mappings).replace(' ', '_') + ".xml")
191 | XMLSource.fromReader(reader(path), Language.Mappings)
192 | case _ =>
193 | val namespaces = Set(namespace)
194 | val url = new URL(Language.Mappings.apiUri)
195 | WikiSource.fromNamespaces(namespaces, url, Language.Mappings)
196 | }
197 | }
198 |
199 | def mappingPageSource: Traversable[WikiPage] = _mappingPageSource
200 |
201 | private lazy val _mappings =
202 | {
203 | MappingsLoader.load(this)
204 | }
205 |
206 | def mappings: Mappings = _mappings
207 |
208 | def articlesSource: Source = null // Not needing raw article source
209 |
210 | def redirects: Redirects = _redirects
211 |
212 | def disambiguations: Disambiguations = if (_disambiguations != null) _disambiguations else new Disambiguations(Set[Long]())
213 | }
214 |
215 | // Extractors - this is lazily evaluated in DistExtractionJob.run() so that the distributed redirect extraction happens inside run()
216 | // NOTE: All subsequent references to this val need to be lazy!
217 | lazy val extractor =
218 | {
219 | val _redirects = context.redirects // Trigger evaluation of lazy redirects and load the updated context into extractors.
220 | val updatedContext = new DumpExtractionContextWrapper(context)
221 | {
222 | override def redirects: Redirects = _redirects
223 | }
224 | CompositeParseExtractor.load(extractorClasses, updatedContext)
225 | }
226 |
227 | lazy val destination =
228 | {
229 | // Create empty directories for all datasets. This is not strictly necessary because Hadoop would create the directories
230 | // it needs by itself, though in that case the directories for unused datasets will obviously be absent.
231 | val datasets = extractor.datasets
232 | val outputPath = finder.directory(date)
233 |
234 | for ((suffix, format) <- config.formats; dataset <- datasets)
235 | {
236 | new Path(outputPath, s"${finder.wikiName}-$date-${dataset.name.replace('_', '-')}.$suffix").mkdirs()
237 | }
238 | new DistMarkerDestination(new DistDeduplicatingWriterDestination(outputPath, hadoopConfiguration), finder.file(date, Extraction.Complete), false)
239 | }
240 |
241 | lazy val description =
242 | {
243 | val datasets = extractor.datasets
244 | lang.wikiCode + ": " + extractorClasses.size + " extractors (" + extractorClasses.map(_.getSimpleName).mkString(",") + "), " + datasets.size + " datasets (" + datasets.mkString(",") + ")"
245 | }
246 |
247 | new DistExtractionJob(new RootExtractor(extractor), articlesRDD, config.namespaces, destination, lang.wikiCode, description)
248 | }
249 |
250 | implicit var hadoopConfiguration: Configuration = config.hadoopConf
251 |
252 | private def writer[T <% FileLike[_]](file: T): () => Writer =
253 | {
254 | () => IOUtils.writer(file)
255 | }
256 |
257 | private def reader[T <% FileLike[_]](file: T): () => Reader =
258 | {
259 | () => IOUtils.reader(file)
260 | }
261 |
262 | private def readers[T <% FileLike[_]](source: String, finder: Finder[T], date: String): List[() => Reader] =
263 | {
264 | files(source, finder, date).map(reader(_))
265 | }
266 |
267 | private def files[T <% FileLike[_]](source: String, finder: Finder[T], date: String): List[T] =
268 | {
269 |
270 | val files = if (source.startsWith("@"))
271 | {
272 | // the articles source is a regex - we want to match multiple files
273 | finder.matchFiles(date, source.substring(1))
274 | } else List(finder.file(date, source))
275 |
276 | logger.info(s"Source is ${source} - ${files.size} file(s) matched")
277 |
278 | files
279 | }
280 |
281 | private def latestDate(finder: Finder[_]): String =
282 | {
283 | val isSourceRegex = config.source.startsWith("@")
284 | val source = if (isSourceRegex) config.source.substring(1) else config.source
285 | val fileName = if (config.requireComplete) Download.Complete else source
286 | finder.dates(fileName, isSuffixRegex = isSourceRegex).last
287 | }
288 | }
--------------------------------------------------------------------------------
/extraction/src/main/scala/org/dbpedia/extraction/dump/extract/DistExtraction.scala:
--------------------------------------------------------------------------------
1 | package org.dbpedia.extraction.dump.extract
2 |
3 | import org.dbpedia.extraction.util.{SparkUtils, ProxyAuthenticator, ConfigUtils}
4 | import java.net.Authenticator
5 | import scala.concurrent.{ExecutionContext, Await, Future, future}
6 | import scala.concurrent.duration.Duration
7 | import java.io.File
8 | import java.util.concurrent.Executors
9 |
10 | /**
11 | * Dump extraction script.
12 | */
13 | object DistExtraction
14 | {
15 |
16 | val Started = "extraction-started"
17 |
18 | val Complete = "extraction-complete"
19 |
20 | def main(args: Array[String]): Unit =
21 | {
22 | require(args != null && args.length >= 2 && args(0).nonEmpty && args(1).nonEmpty, "missing required arguments: <extraction config file> <distributed config file>")
23 | Authenticator.setDefault(new ProxyAuthenticator())
24 |
25 | // Load properties
26 | val extractionConfigProps = ConfigUtils.loadConfig(args(0), "UTF-8")
27 | val distConfigProps = ConfigUtils.loadConfig(args(1), "UTF-8")
28 | val distConfig = new DistConfig(distConfigProps, extractionConfigProps, new File(args(0)).toURI)
29 |
30 | // overwrite properties with CLI args
31 | // TODO arguments could be of the format a=b and then property a can be overwritten with "b"
32 |
33 | // Create SparkContext
34 | SparkUtils.setSparkLogLevels(distConfig)
35 | val sparkContext = SparkUtils.getSparkContext(distConfig)
36 |
37 | // Load extraction jobs from configuration
38 | val jobs = new DistConfigLoader(distConfig, sparkContext).getExtractionJobs()
39 |
40 | val executor = Executors.newFixedThreadPool(distConfig.extractionJobThreads)
41 | implicit val ec = ExecutionContext.fromExecutor(executor)
42 | val futures = for (job <- jobs) yield future
43 | {
44 | job.run()
45 | }
46 | Await.result(Future.sequence(futures), Duration.Inf)
47 |
48 | sparkContext.stop()
49 | executor.shutdown()
50 | }
51 | }
52 |
--------------------------------------------------------------------------------
/extraction/src/main/scala/org/dbpedia/extraction/dump/extract/DistExtractionJob.scala:
--------------------------------------------------------------------------------
1 | package org.dbpedia.extraction.dump.extract
2 |
3 | import java.util.logging.{Level, Logger}
4 | import org.dbpedia.extraction.destinations.{Quad, DistDestination}
5 | import org.dbpedia.extraction.mappings.RootExtractor
6 | import org.dbpedia.extraction.sources.WikiPage
7 | import org.dbpedia.extraction.spark.serialize.KryoSerializationWrapper
8 | import org.dbpedia.extraction.wikiparser.Namespace
9 | import org.apache.spark.rdd.RDD
10 | import org.dbpedia.extraction.util.StringUtils
11 | import org.apache.spark.SparkContext._
12 | import org.dbpedia.util.Exceptions
13 |
14 | /**
15 | * Executes an extraction using Spark.
16 | *
17 | * @param extractor The Extractor
18 | * @param rdd The RDD of WikiPages
19 | * @param namespaces Only extract pages in these namespaces
20 | * @param destination The extraction destination. Will be closed after the extraction has been finished.
21 | * @param label user readable label of this extraction job.
22 | */
23 | class DistExtractionJob(extractor: => RootExtractor, rdd: => RDD[WikiPage], namespaces: Set[Namespace], destination: => DistDestination, label: String, description: => String)
24 | {
25 | private val logger = Logger.getLogger(getClass.getName)
26 |
27 | def run(): Unit =
28 | {
29 | val sc = rdd.sparkContext
30 | val allPages = sc.accumulator(0)
31 | val failedPages = sc.accumulator(0)
32 |
33 | val loggerBC = sc.broadcast(logger)
34 | val extractorBC = sc.broadcast(KryoSerializationWrapper(extractor))
35 | val namespacesBC = sc.broadcast(namespaces)
36 |
37 | val startTime = System.currentTimeMillis
38 |
39 | val results: RDD[Seq[Quad]] =
40 | rdd.map
41 | {
42 | page =>
43 | // Take a WikiPage, perform the extraction with a set of extractors and return the results as a Seq[Quad].
44 | val (success, graph) = try
45 | {
46 | (true, if (namespacesBC.value.contains(page.title.namespace)) Some(extractorBC.value.value.apply(page)) else None)
47 | }
48 | catch
49 | {
50 | case ex: Exception =>
51 | loggerBC.value.log(Level.WARNING, "error processing page '" + page.title + "': " + Exceptions.toString(ex, 200))
52 | (false, None)
53 | }
54 |
55 | if (success) allPages += 1 else failedPages += 1
56 |
57 | graph.getOrElse(Nil)
58 | }
59 |
60 | logger.info(description+" started")
61 |
62 | destination.open()
63 |
64 | logger.info("Writing outputs to destination...")
65 |
66 | destination.write(results)
67 |
68 | destination.close()
69 |
70 | val time = System.currentTimeMillis - startTime
71 | println("%s: extracted %d pages in %s (per page: %f ms; failed pages: %d).".format(label,
72 | allPages.value,
73 | StringUtils.prettyMillis(time),
74 | time.toDouble / allPages.value,
75 | failedPages.value))
76 |
77 | logger.info(description+" finished")
78 | }
79 | }
80 |
--------------------------------------------------------------------------------
/extraction/src/main/scala/org/dbpedia/extraction/dump/extract/DumpExtractionContextWrapper.scala:
--------------------------------------------------------------------------------
1 | package org.dbpedia.extraction.dump.extract
2 |
3 | import org.dbpedia.extraction.ontology.Ontology
4 | import org.dbpedia.extraction.sources.{WikiPage, Source}
5 | import org.dbpedia.extraction.util.Language
6 | import org.dbpedia.extraction.mappings.{Disambiguations, Redirects, Mappings}
7 |
8 | /**
9 | * A simple wrapper for a DumpExtractionContext object
10 | *
11 | * @param context
12 | */
13 | class DumpExtractionContextWrapper(context: DumpExtractionContext) extends DumpExtractionContext
14 | {
15 | override def ontology: Ontology = context.ontology
16 |
17 | override def commonsSource: Source = context.commonsSource
18 |
19 | override def language: Language = context.language
20 |
21 | override def mappingPageSource: Traversable[WikiPage] = context.mappingPageSource
22 |
23 | override def mappings: Mappings = context.mappings
24 |
25 | override def articlesSource: Source = context.articlesSource
26 |
27 | override def redirects: Redirects = context.redirects
28 |
29 | override def disambiguations: Disambiguations = context.disambiguations
30 | }
31 |
--------------------------------------------------------------------------------
/extraction/src/main/scala/org/dbpedia/extraction/mappings/DistDisambiguations.scala:
--------------------------------------------------------------------------------
1 | package org.dbpedia.extraction.mappings
2 |
3 | import java.util.logging.{Level, Logger}
4 | import java.io._
5 | import org.apache.hadoop.fs.Path
6 | import org.dbpedia.extraction.util.RichHadoopPath.wrapPath
7 | import org.dbpedia.extraction.util.{DistIOUtils, Language}
8 | import org.apache.hadoop.conf.Configuration
9 | import com.esotericsoftware.kryo.io.{Input, Output}
10 |
11 | /**
12 | * A version of Disambiguations that works with org.apache.hadoop.fs.Path.
13 | *
14 | * @see Disambiguations
15 | */
16 | class DistDisambiguations(override val set : Set[Long]) extends Disambiguations(set)
17 |
18 | object DistDisambiguations
19 | {
20 | private val logger = Logger.getLogger(classOf[DistDisambiguations].getName)
21 |
22 | /**
23 | * Loads disambiguations from cache/source reader.
24 | *
25 | * @param reader Reader to load disambiguations from
26 | * @param cache Path to cache file
27 | * @param lang Language
28 | * @param hadoopConf Configuration
29 | * @return Disambiguations object
30 | */
31 | def load(reader : () => Reader, cache : Path, lang : Language)(implicit hadoopConf: Configuration) : Disambiguations =
32 | {
33 | try
34 | {
35 | return loadFromCache(cache)
36 | }
37 | catch
38 | {
39 | case ex : Exception => logger.log(Level.INFO, "Will extract disambiguations from source for "+lang.wikiCode+" wiki, could not load cache file '"+cache.getSchemeWithFileName+"': "+ex)
40 | }
41 |
42 | val disambiguations = Disambiguations.loadFromFile(reader, lang)
43 |
44 | val dir = cache.getParent
45 | if (!dir.exists && !dir.mkdirs()) throw new IOException("cache dir [" + dir.getSchemeWithFileName + "] does not exist and cannot be created")
46 | val output = new Output(new BufferedOutputStream(cache.outputStream()))
47 |
48 | try
49 | {
50 | DistIOUtils.getKryoInstance.writeClassAndObject(output, disambiguations.set)
51 | logger.info(disambiguations.set.size + " disambiguations written to cache file " + cache.getSchemeWithFileName)
52 | disambiguations
53 | }
54 | finally
55 | {
56 | output.close()
57 | }
58 | }
59 |
60 | /**
61 | * Loads the disambiguations from a cache file.
62 | */
63 | private def loadFromCache(cache : Path)(implicit hadoopConf: Configuration) : Disambiguations =
64 | {
65 | logger.info("Loading disambiguations from cache file " + cache.getSchemeWithFileName)
66 | val input = new Input(new BufferedInputStream(cache.inputStream()))
67 | try
68 | {
69 | val disambiguations = new Disambiguations(DistIOUtils.getKryoInstance.readClassAndObject(input).asInstanceOf[Set[Long]])
70 | logger.info(disambiguations.set.size + " disambiguations loaded from cache file " + cache.getSchemeWithFileName)
71 | disambiguations
72 | }
73 | finally
74 | {
75 | input.close()
76 | }
77 | }
78 | }
79 |
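
As a rough illustration, the cache handling above boils down to a Kryo writeClassAndObject/readClassAndObject round-trip over a buffered stream. The sketch below reproduces that round-trip against a local file with a plain Kryo instance and an Array[Long]; the real code uses DistIOUtils.getKryoInstance, a Set[Long] and Hadoop Path streams, so treat this only as the shape of the idea:

import java.io.{BufferedInputStream, BufferedOutputStream, FileInputStream, FileOutputStream}
import com.esotericsoftware.kryo.Kryo
import com.esotericsoftware.kryo.io.{Input, Output}

object DisambiguationCacheSketch {
  def main(args: Array[String]): Unit = {
    val kryo = new Kryo()              // the framework uses DistIOUtils.getKryoInstance instead
    val ids = Array(1L, 2L, 3L)        // stand-in for the Set[Long] of disambiguation page ids

    val output = new Output(new BufferedOutputStream(new FileOutputStream("disambiguations.cache")))
    try kryo.writeClassAndObject(output, ids) finally output.close()

    val input = new Input(new BufferedInputStream(new FileInputStream("disambiguations.cache")))
    try println(kryo.readClassAndObject(input).asInstanceOf[Array[Long]].length + " ids loaded from cache")
    finally input.close()
  }
}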
--------------------------------------------------------------------------------
/extraction/src/main/scala/org/dbpedia/extraction/mappings/DistRedirects.scala:
--------------------------------------------------------------------------------
1 | package org.dbpedia.extraction.mappings
2 |
3 | import java.util.logging.{Level, Logger}
4 | import org.dbpedia.extraction.sources.WikiPage
5 | import java.io._
6 | import org.dbpedia.extraction.wikiparser._
7 | import org.dbpedia.extraction.util.{DistIOUtils, Language}
8 | import org.dbpedia.extraction.wikiparser.impl.wikipedia.Redirect
9 | import org.apache.spark.rdd.RDD
10 | import com.esotericsoftware.kryo.io.{Input, Output}
11 | import org.dbpedia.extraction.util.RichHadoopPath.wrapPath
12 | import org.apache.hadoop.fs.Path
13 | import org.apache.hadoop.conf.Configuration
14 | import org.apache.spark.SparkContext._
15 |
16 | /**
17 | * Distributed version of Redirects; uses Spark to compute redirects.
18 | *
19 | * Holds the redirects between wiki pages
20 | * At the moment, only redirects between Templates are considered
21 | *
22 | * @param map Redirect map. Contains decoded template titles.
23 | *
24 | * @see Redirects
25 | */
26 | class DistRedirects(override val map: Map[String, String]) extends Redirects(map)
27 |
28 | /**
29 | * Loads redirects from a cache file or source of Wiki pages.
30 | * At the moment, only redirects between Templates are considered
31 | */
32 | object DistRedirects
33 | {
34 | private val logger = Logger.getLogger(classOf[DistRedirects].getName)
35 |
36 | /**
37 | * Tries to load the redirects from a cache file.
38 | * If not successful, loads the redirects from an RDD.
39 | * Updates the cache after loading the redirects from the source.
40 | *
41 | * @param rdd RDD of WikiPages
42 | * @param cache Path to cache file
43 | * @param lang Language
44 | * @param hadoopConf Configuration
45 | * @return Redirects object
46 | */
47 | def load(rdd: RDD[WikiPage], cache: Path, lang: Language)(implicit hadoopConf: Configuration): Redirects =
48 | {
49 | //Try to load redirects from the cache
50 | try
51 | {
52 | return loadFromCache(cache)
53 | }
54 | catch
55 | {
56 | case ex: Exception => logger.log(Level.INFO, "Will extract redirects from source for " + lang.wikiCode + " wiki, could not load cache file '" + cache.getSchemeWithFileName + "': " + ex)
57 | }
58 |
59 | //Load redirects from RDD
60 | val redirects = loadFromRDD(rdd, lang)
61 |
62 | val dir = cache.getParent
63 | if (!dir.exists && !dir.mkdirs()) throw new IOException("cache dir [" + dir.getSchemeWithFileName + "] does not exist and cannot be created")
64 | val output = new Output(new BufferedOutputStream(cache.outputStream()))
65 | try
66 | {
67 | DistIOUtils.getKryoInstance.writeClassAndObject(output, redirects.map)
68 | logger.info(redirects.map.size + " redirects written to cache file " + cache.getSchemeWithFileName)
69 | redirects
70 | }
71 | finally
72 | {
73 | output.close()
74 | }
75 | }
76 |
77 | /**
78 | * Loads the redirects from a cache file.
79 | */
80 | private def loadFromCache(cache: Path)(implicit hadoopConf: Configuration): Redirects =
81 | {
82 | logger.info("Loading redirects from cache file " + cache.getSchemeWithFileName)
83 | val input = new Input(new BufferedInputStream(cache.inputStream()))
84 | try
85 | {
86 | val redirects = new Redirects(DistIOUtils.getKryoInstance.readClassAndObject(input).asInstanceOf[Map[String, String]])
87 | logger.info(redirects.map.size + " redirects loaded from cache file " + cache.getSchemeWithFileName)
88 | redirects
89 | }
90 | finally
91 | {
92 | input.close()
93 | }
94 | }
95 |
96 | /**
97 | * Loads the redirects from a source.
98 | *
99 | * @param rdd RDD of WikiPages
100 | * @param lang Language
101 | * @return Redirects object
102 | */
103 | def loadFromRDD(rdd: RDD[WikiPage], lang: Language): Redirects =
104 | {
105 | logger.info("Loading redirects from source (" + lang.wikiCode + ")")
106 |
107 | val regexBC = rdd.sparkContext.broadcast(buildRegex(lang))
108 |
109 | // Wrap the map function inside a KryoSerializationWrapper
110 | // val mapper = SparkUtils.kryoWrapFunction(new RedirectFinder(langBC))
111 | // val redirects = new Redirects(rdd.flatMap(mapper).collectAsMap().toMap)
112 |
113 | val redirectsRDD = rdd.flatMap
114 | {
115 | case page: WikiPage =>
116 | val regex = regexBC.value
117 |
118 | val destinationTitle = page.source match
119 | {
120 | case regex(destination) =>
121 | try
122 | {
123 | WikiTitle.parse(destination, page.title.language)
124 | }
125 | catch
126 | {
127 | case ex: WikiParserException =>
128 | Logger.getLogger(Redirects.getClass.getName).log(Level.WARNING, "Couldn't parse redirect destination", ex)
129 | null
130 | }
131 | case _ => null
132 | }
133 |
134 | if (destinationTitle != page.redirect)
135 | {
136 | Logger.getLogger(Redirects.getClass.getName).log(Level.WARNING, "wrong redirect. page: [" + page.title + "].\nfound by dbpedia: [" + destinationTitle + "].\nfound by wikipedia: [" + page.redirect + "]")
137 | }
138 |
139 | if (destinationTitle != null && page.title.namespace == Namespace.Template && destinationTitle.namespace == Namespace.Template)
140 | {
141 | List((page.title.decoded, destinationTitle.decoded))
142 | }
143 | else
144 | {
145 | Nil
146 | }
147 | }
148 |
149 | val redirects = new Redirects(redirectsRDD.collectAsMap().toMap)
150 |
151 | logger.info("Redirects loaded from source (" + lang.wikiCode + ")")
152 | redirects
153 | }
154 |
155 | private def buildRegex(lang: Language) =
156 | {
157 | val redirects = Redirect(lang).mkString("|")
158 | // (?ius) enables CASE_INSENSITIVE UNICODE_CASE DOTALL
159 | // case insensitive and unicode are important - that's what mediawiki does.
160 | // Note: Although we do not specify a Locale, UNICODE_CASE does mostly the right thing.
161 | // DOTALL means that '.' also matches line terminators.
162 | // Reminder: (?:...) are non-capturing groups, '*?' is a reluctant quantifier.
163 | // (?:#[^\n]*?)? is an optional (the last '?') non-capturing group meaning: there may
164 | // be a '#' after which everything but line breaks is allowed ('[]{}|<>' are not allowed
165 | // before the '#'). The match is reluctant ('*?'), which means that we recognize ']]'
166 | // as early as possible.
167 | // (?:\|[^\n]*?)? is another optional non-capturing group that reluctantly consumes
168 | // a '|' character and everything but line breaks after it.
169 | ("""(?ius)\s*(?:""" + redirects + """)\s*:?\s*\[\[([^\[\]{}|<>\n]+(?:#[^\n]*?)?)(?:\|[^\n]*?)?\]\].*""").r
170 | }
171 | }
172 |
173 |
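
For illustration, the sketch below hard-codes the English redirect keyword and applies a regex built the same way as buildRegex above to a redirect page's wiki source; in the framework the keywords come from Redirect(lang), so this is only an approximation:

object RedirectRegexSketch {
  def main(args: Array[String]): Unit = {
    val redirects = "#REDIRECT" // hard-coded here; buildRegex uses Redirect(lang).mkString("|")
    val regex = ("""(?ius)\s*(?:""" + redirects + """)\s*:?\s*\[\[([^\[\]{}|<>\n]+(?:#[^\n]*?)?)(?:\|[^\n]*?)?\]\].*""").r

    "#REDIRECT [[Template:Infobox settlement]]" match {
      case regex(destination) => println(destination) // prints: Template:Infobox settlement
      case _                  => println("no redirect found")
    }
  }
}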
--------------------------------------------------------------------------------
/extraction/src/main/scala/org/dbpedia/extraction/spark/io/QuadSeqWritable.scala:
--------------------------------------------------------------------------------
1 | package org.dbpedia.extraction.spark.io
2 |
3 | import org.dbpedia.extraction.destinations.Quad
4 | import org.apache.hadoop.io.Writable
5 | import org.dbpedia.extraction.util.DistIOUtils
6 | import java.io.{DataOutput, ByteArrayOutputStream, DataInput}
7 | import com.esotericsoftware.kryo.io.{Input, Output}
8 |
9 | /**
10 | * Writable wrapping Seq[Quad] - used by custom OutputFormat
11 | */
12 | class QuadSeqWritable(quads: Seq[Quad]) extends Writable
13 | {
14 | var _quads = quads
15 |
16 | def this() = this(null)
17 |
18 | def set(quads: Seq[Quad])
19 | {
20 | _quads = quads
21 | }
22 |
23 | def get = _quads
24 |
25 | override def write(output: DataOutput)
26 | {
27 | val out = new ByteArrayOutputStream()
28 | val o = new Output(out)
29 | DistIOUtils.getKryoInstance.writeClassAndObject(o, get)
30 | o.close()
31 | val bytes = out.toByteArray
32 | output.writeInt(bytes.size)
33 | output.write(bytes)
34 | }
35 |
36 | override def readFields(input: DataInput)
37 | {
38 | val size = input.readInt()
39 | val bytes = new Array[Byte](size)
40 | input.readFully(bytes)
41 | val i = new Input()
42 | i.setBuffer(bytes)
43 | set(DistIOUtils.getKryoInstance.readClassAndObject(i).asInstanceOf[Seq[Quad]])
44 | i.close()
45 | }
46 | }
47 |
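
QuadSeqWritable follows Hadoop's usual Writable contract: write(DataOutput) must produce exactly the bytes that readFields(DataInput) consumes. The sketch below demonstrates that round-trip with Hadoop's built-in Text, since constructing Quad instances is project-specific and omitted here:

import java.io.{ByteArrayInputStream, ByteArrayOutputStream, DataInputStream, DataOutputStream}
import org.apache.hadoop.io.Text

object WritableRoundTripSketch {
  def main(args: Array[String]): Unit = {
    val original = new Text("article_categories")

    val bytes = new ByteArrayOutputStream()
    original.write(new DataOutputStream(bytes))       // serialize: Writable.write(DataOutput)

    val copy = new Text()
    copy.readFields(new DataInputStream(new ByteArrayInputStream(bytes.toByteArray))) // deserialize

    assert(copy == original)
    println(copy)
  }
}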
--------------------------------------------------------------------------------
/extraction/src/main/scala/org/dbpedia/extraction/spark/io/WikiPageWritable.scala:
--------------------------------------------------------------------------------
1 | package org.dbpedia.extraction.spark.io
2 |
3 | import org.apache.hadoop.io.Writable
4 | import java.io.{ByteArrayOutputStream, DataOutput, DataInput}
5 | import org.dbpedia.extraction.sources.WikiPage
6 | import com.esotericsoftware.kryo.io.{Input, Output}
7 | import org.dbpedia.extraction.spark.serialize.WikiPageSerializer
8 | import org.dbpedia.extraction.util.DistIOUtils
9 |
10 | /**
11 | * DBpediaWikiPageInputFormat emits values of type WikiPageWritable. This class holds a single WikiPage instance.
12 | * @see DBpediaWikiPageInputFormat
13 | */
14 | class WikiPageWritable(wikiPage: WikiPage) extends Writable
15 | {
16 | var _wikiPage = wikiPage
17 |
18 | def this() = this(null)
19 |
20 | def set(wikiPage: WikiPage)
21 | {
22 | _wikiPage = wikiPage
23 | }
24 |
25 | def get = _wikiPage
26 |
27 | val wps = new WikiPageSerializer
28 |
29 | override def write(output: DataOutput)
30 | {
31 | val out = new ByteArrayOutputStream()
32 | val o = new Output(out)
33 | wps.write(DistIOUtils.getKryoInstance, o, get)
34 | o.close()
35 | val bytes = out.toByteArray
36 | output.writeInt(bytes.size)
37 | output.write(bytes)
38 | }
39 |
40 | override def readFields(input: DataInput)
41 | {
42 | val size = input.readInt()
43 | val bytes = new Array[Byte](size)
44 | input.readFully(bytes)
45 | val i = new Input()
46 | i.setBuffer(bytes)
47 | set(wps.read(DistIOUtils.getKryoInstance, i, classOf[WikiPage]))
48 | i.close()
49 | }
50 | }
--------------------------------------------------------------------------------
/extraction/src/main/scala/org/dbpedia/extraction/spark/io/input/ByteMatcher.scala:
--------------------------------------------------------------------------------
1 | package org.dbpedia.extraction.spark.io.input
2 |
3 | import java.io.InputStream
4 | import org.apache.hadoop.fs.Seekable
5 | import org.apache.hadoop.io.DataOutputBuffer
6 | import scala.annotation.tailrec
7 |
8 | /**
9 | * A class that operates mainly on SeekableInputStreams, iteratively reading chunks of data from an InputStream
10 | * depending upon a match pattern, through the method readUntilMatch().
11 | *
12 | * @param in InputStream to read binary data from
13 | * @param seeker Seekable for the InputStream "in" - used for keeping track of position in the InputStream
14 | */
15 | class ByteMatcher(in: InputStream, seeker: Seekable)
16 | {
17 | private var bytesRead: Long = 0
18 | private var lastMatchedPos: Long = -1
19 | private var currentPos: Long = -1
20 |
21 | def this(is: SeekableInputStream) = this(is, is)
22 |
23 | /**
24 | * @return number of bytes read
25 | */
26 | def getReadBytes: Long = bytesRead
27 |
28 | /**
29 | * @return current position in seeker
30 | */
31 | def getPos: Long = seeker.getPos
32 |
33 | /**
34 | * @return last position when a match was found
35 | */
36 | def getLastMatchedPos: Long = lastMatchedPos
37 |
38 | /**
39 | * @param len number of bytes to skip
40 | */
41 | def skip(len: Long)
42 | {
43 | in.skip(len)
44 | bytesRead += len
45 | }
46 |
47 | /**
48 | * Reads the InputStream until a match is found or "end" number of bytes is reached.
49 | *
50 | * @param textPattern String to match against
51 | * @param end number of bytes to read till - checked against seeker
52 | * @return Boolean true if a match was found, false if EOF was found or stopping point "end" was crossed.
53 | */
54 | def readUntilMatch(textPattern: String, end: Long): Boolean =
55 | {
56 | readUntilMatch(textPattern.getBytes("UTF-8"), 0, end)
57 | }
58 |
59 | /**
60 | * Reads the InputStream while writing to a buffer, until a match is found or "end" number of bytes is reached.
61 | *
62 | * @param textPattern String to match against
63 | * @param end number of bytes to read till - checked against seeker
64 | * @param outputBuffer DataOutputBuffer where the data being read is written to
65 | * @return Boolean true if a match was found, false if EOF was found or stopping point "end" was crossed.
66 | */
67 | def readUntilMatch(textPattern: String, end: Long, outputBuffer: Option[DataOutputBuffer]): Boolean =
68 | {
69 | readUntilMatch(textPattern.getBytes("UTF-8"), 0, end, outputBuffer)
70 | }
71 |
72 | /**
73 | * Reads the InputStream until a match is found or "end" number of bytes is reached.
74 | *
75 | * @param bytePattern Byte array to match against
76 | * @param end number of bytes to read till - checked against seeker
77 | * @return Boolean true if a match was found, false if EOF was found or stopping point "end" was crossed.
78 | */
79 | def readUntilMatch(bytePattern: Array[Byte], end: Long): Boolean =
80 | {
81 | readUntilMatch(bytePattern, 0, end)
82 | }
83 |
84 | /**
85 | * Reads the InputStream while writing to a buffer, until a match is found or "end" number of bytes is reached.
86 | *
87 | * @param bytePattern Byte array to match against
88 | * @param end number of bytes to read till - checked against seeker
89 | * @param outputBuffer DataOutputBuffer where the data being read is written to
90 | * @return Boolean true if a match was found, false if EOF was found or stopping point "end" was crossed.
91 | */
92 | def readUntilMatch(bytePattern: Array[Byte], end: Long, outputBuffer: Option[DataOutputBuffer]): Boolean =
93 | {
94 | readUntilMatch(bytePattern, 0, end, outputBuffer)
95 | }
96 |
97 | @tailrec private def readUntilMatch(matchBytes: Array[Byte], matchIter: Int, end: Long, outputBuffer: Option[DataOutputBuffer] = None): Boolean =
98 | {
99 | var i = matchIter
100 | val b: Int = this.in.read
101 | // EOF at the beginning
102 | if (b == -1) return false
103 |
104 | this.bytesRead += 1
105 |
106 | // Save to the buffer, if any provided
107 | outputBuffer.foreach(_.write(b))
108 |
109 | // Check if we're matching
110 | if (b == matchBytes(i))
111 | {
112 | i += 1
113 | // Whole of matchBytes matched successfully?
114 | if (i >= matchBytes.length) return true
115 | }
116 | else
117 | {
118 | // If not matched, start afresh and increment position.
119 | i = 0
120 | if (this.currentPos != this.getPos)
121 | {
122 | this.lastMatchedPos = this.currentPos
123 | this.currentPos = this.getPos
124 | }
125 | }
126 |
127 | // See if we've passed the stop point
128 | if (i == 0 && this.seeker.getPos >= end) return false
129 |
130 | // Keep reading
131 | readUntilMatch(matchBytes, i, end, outputBuffer)
132 | }
133 | }
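
A small usage sketch: the Seekable stub below simply reports how many bytes have been consumed from an in-memory stream, which is enough for ByteMatcher to scan for a pattern up to a stop offset. The stub and sample data are illustrative only; in the framework the Seekable comes from a SeekableInputStream over a file split:

import java.io.ByteArrayInputStream
import org.apache.hadoop.fs.Seekable
import org.dbpedia.extraction.spark.io.input.ByteMatcher

object ByteMatcherSketch {
  def main(args: Array[String]): Unit = {
    val data = "<mediawiki><page>...</page></mediawiki>".getBytes("UTF-8")
    val in = new ByteArrayInputStream(data)

    // Minimal Seekable that reports how many bytes have been consumed so far.
    val seeker = new Seekable {
      override def getPos: Long = data.length - in.available()
      override def seek(pos: Long): Unit = throw new UnsupportedOperationException
      override def seekToNewSource(targetPos: Long): Boolean = false
    }

    val matcher = new ByteMatcher(in, seeker)
    if (matcher.readUntilMatch("<page>", data.length))
      println(s"found <page>, stream position is now ${matcher.getPos}")
  }
}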
--------------------------------------------------------------------------------
/extraction/src/main/scala/org/dbpedia/extraction/spark/io/input/DBpediaWikiPageInputFormat.scala:
--------------------------------------------------------------------------------
1 | package org.dbpedia.extraction.spark.io.input
2 |
3 | import org.apache.hadoop.io.{DataOutputBuffer, LongWritable}
4 | import org.apache.hadoop.io.compress.{CompressionCodecFactory, SplittableCompressionCodec}
5 | import org.apache.hadoop.fs.Path
6 | import scala.xml.XML
7 | import org.dbpedia.extraction.sources.XMLSource
8 | import org.apache.hadoop.mapreduce.lib.input.{FileSplit, FileInputFormat}
9 | import org.apache.hadoop.mapreduce.{JobContext, RecordReader, InputSplit, TaskAttemptContext}
10 | import org.apache.commons.logging.LogFactory
11 | import org.dbpedia.extraction.util.Language
12 | import org.dbpedia.extraction.spark.io.WikiPageWritable
13 |
14 | /**
15 | * Hadoop InputFormat that splits a Wikipedia dump file into WikiPageWritable (representing a single
16 | * org.dbpedia.extraction.sources.WikiPage) chunks.
17 | *
18 | * The WikiPageRecordReader class inside outputs a WikiPageWritable as value and the starting position (byte) as key.
19 | *
20 | * Note that dbpedia.wiki.language.wikicode needs to be set in Hadoop's Configuration.
21 | */
22 | class DBpediaWikiPageInputFormat extends FileInputFormat[LongWritable, WikiPageWritable]
23 | {
24 | private val LOG = LogFactory.getLog(classOf[DBpediaWikiPageInputFormat])
25 | private val LANGUAGE = "dbpedia.wiki.language.wikicode"
26 |
27 | protected override def isSplitable(context: JobContext, file: Path): Boolean =
28 | {
29 | val codec = new CompressionCodecFactory(context.getConfiguration).getCodec(file)
30 | if (null == codec) true else codec.isInstanceOf[SplittableCompressionCodec]
31 | }
32 |
33 | override def createRecordReader(genericSplit: InputSplit, context: TaskAttemptContext): RecordReader[LongWritable, WikiPageWritable] =
34 | {
35 | val split = genericSplit.asInstanceOf[FileSplit]
36 | LOG.info("getRecordReader start.....split=" + split)
37 | context.setStatus(split.toString)
38 | new WikiPageRecordReader(split, context)
39 | }
40 |
41 | private class WikiPageRecordReader(split: FileSplit, context: TaskAttemptContext) extends RecordReader[LongWritable, WikiPageWritable]
42 | {
43 | private var key: LongWritable = null
44 | private var value: WikiPageWritable = null
45 |
46 | private val conf = context.getConfiguration
47 |
48 | // Language code for this data dump
49 | private val language = Language(conf.get(LANGUAGE))
50 | private val page = new DataOutputBuffer()
51 | private val inputStream = SeekableInputStream(split,
52 | split.getPath.getFileSystem(conf),
53 | new CompressionCodecFactory(conf))
54 | private val matcher = new ByteMatcher(inputStream)
55 |
56 | private val (start, end) =
57 | {
58 | inputStream match
59 | {
60 | case SeekableSplitCompressedInputStream(sin) =>
61 | (sin.getAdjustedStart, sin.getAdjustedEnd + 1)
62 | case _ =>
63 | (split.getStart, split.getStart + split.getLength)
64 | }
65 | }
66 |
67 | private val pageBeginPattern = "<page>".getBytes("UTF-8")
68 | private val pageEndPattern = "</page>".getBytes("UTF-8")
69 |
70 | override def close() = inputStream.close()
71 |
72 | override def getProgress: Float =
73 | {
74 | if (end == start) 1.0f else (getPos - start).asInstanceOf[Float] / (end - start).asInstanceOf[Float]
75 | }
76 |
77 | def getPos: Long = matcher.getPos
78 |
79 | override def initialize(genericInputSplit: InputSplit, context: TaskAttemptContext) = ()
80 |
81 | override def nextKeyValue(): Boolean =
82 | {
83 | // Initialize key and value
84 | if (key == null) key = new LongWritable()
85 | if (value == null) value = new WikiPageWritable()
86 |
87 | if (matcher.getPos < end && matcher.readUntilMatch(pageBeginPattern, end))
88 | {
89 | try
90 | {
91 | page.write(pageBeginPattern)
92 | if (matcher.readUntilMatch(pageEndPattern, end, Some(page)))
93 | {
94 | // Key is set to the position (bytes) where the page is found
95 | key.set(matcher.getPos)
96 |
97 | // Set value to the WikiPage created from the parsed <page> element
98 | val elem = XML.loadString("<mediawiki>" + new String(page.getData.take(page.getLength), "UTF-8") + "</mediawiki>")
99 | value.set(XMLSource.fromXML(elem, language).head)
100 |
101 | return true
102 | }
103 | }
104 | finally
105 | {
106 | page.reset()
107 | }
108 | }
109 | false
110 | }
111 |
112 | override def getCurrentKey: LongWritable = key
113 |
114 | override def getCurrentValue: WikiPageWritable = value
115 | }
116 |
117 | }
118 |
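
A sketch of how this InputFormat might be driven from Spark: the language key is set on a Hadoop Configuration and the dump is read via newAPIHadoopFile. The dump path and language code are placeholders:

import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.io.LongWritable
import org.apache.spark.{SparkConf, SparkContext}
import org.dbpedia.extraction.spark.io.WikiPageWritable
import org.dbpedia.extraction.spark.io.input.DBpediaWikiPageInputFormat

object InputFormatSketch {
  def main(args: Array[String]): Unit = {
    val sc = new SparkContext(new SparkConf().setAppName("sketch").setMaster("local[2]"))

    val hadoopConf = new Configuration()
    hadoopConf.set("dbpedia.wiki.language.wikicode", "en") // required by the InputFormat

    val pages = sc.newAPIHadoopFile(
      "enwiki-20140614-pages-articles.xml.bz2",            // placeholder dump path
      classOf[DBpediaWikiPageInputFormat],
      classOf[LongWritable],
      classOf[WikiPageWritable],
      hadoopConf
    ).map(_._2.get)                                        // unwrap WikiPageWritable into WikiPage

    println(pages.count())
    sc.stop()
  }
}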
--------------------------------------------------------------------------------
/extraction/src/main/scala/org/dbpedia/extraction/spark/io/input/SeekableInputStream.scala:
--------------------------------------------------------------------------------
1 | package org.dbpedia.extraction.spark.io.input
2 |
3 | import org.apache.hadoop.io.compress._
4 | import org.apache.hadoop.fs.{FileSystem, Seekable, FSDataInputStream}
5 | import java.io.{InputStream, FilterInputStream}
6 | import org.apache.hadoop.mapreduce.lib.input.FileSplit
7 |
8 | object SeekableInputStream
9 | {
10 | /**
11 | * Examines a FileSplit and returns the appropriate SeekableInputStream generated from it.
12 | *
13 | * @param split FileSplit to generate the SeekableInputStream from
14 | * @param fs FileSystem
15 | * @param compressionCodecs CompressionCodecFactory
16 | * @return SeekableInputStream to read from split
17 | */
18 | def apply(split: FileSplit, fs: FileSystem, compressionCodecs: CompressionCodecFactory): SeekableInputStream =
19 | {
20 | val path = split.getPath
21 | val start = split.getStart
22 | val end = start + split.getLength
23 |
24 | val codec = compressionCodecs.getCodec(path)
25 | val dataInputStream = fs.open(path)
26 |
27 | codec match
28 | {
29 | case splitableCodec: SplittableCompressionCodec =>
30 | // Is it a splittable compression input stream?
31 | val compressionInputStream = splitableCodec.createInputStream(dataInputStream,
32 | CodecPool.getDecompressor(codec),
33 | start,
34 | end,
35 | SplittableCompressionCodec.READ_MODE.BYBLOCK)
36 | SeekableSplitCompressedInputStream(compressionInputStream)
37 | case null =>
38 | // Input stream not compressed?
39 | dataInputStream.seek(start)
40 | SeekableUncompressedInputStream(dataInputStream)
41 | case _ =>
42 | // Non-splittable compression input stream? No seeking or offsetting is needed
43 | assert(start == 0)
44 | val compressionInputStream = codec.createInputStream(dataInputStream, CodecPool.getDecompressor(codec))
45 | SeekableCompressedInputStream(compressionInputStream, dataInputStream)
46 | }
47 | }
48 | }
49 |
50 | /**
51 | * A SeekableInputStream internally using a SplitCompressionInputStream, ie. compressed by a splittable compression method.
52 | */
53 | case class SeekableSplitCompressedInputStream(sin: SplitCompressionInputStream) extends SeekableInputStream(sin, sin)
54 |
55 | /**
56 | * A compressed SeekableInputStream using a non-splittable compression input stream
57 | */
58 | case class SeekableCompressedInputStream(cin: CompressionInputStream, fsin: FSDataInputStream) extends SeekableInputStream(cin, fsin)
59 |
60 | /**
61 | * SeekableInputStream without compression.
62 | */
63 | case class SeekableUncompressedInputStream(fsin: FSDataInputStream) extends SeekableInputStream(fsin, fsin)
64 |
65 | /**
66 | * Wraps an InputStream and a corresponding Seekable to track its position.
67 | *
68 | * @param in InputStream to read binary data from
69 | * @param seeker Seekable for the InputStream "in" - used for keeping track of position in the InputStream
70 | */
71 | sealed class SeekableInputStream(in: InputStream, seeker: Seekable) extends FilterInputStream(in) with Seekable
72 | {
73 | override def getPos: Long = seeker.getPos
74 |
75 | override def seek(pos: Long) = seeker.seek(pos)
76 |
77 | override def seekToNewSource(targetPos: Long): Boolean = seeker.seekToNewSource(targetPos)
78 |
79 | override def toString: String = in.toString
80 | }
--------------------------------------------------------------------------------
/extraction/src/main/scala/org/dbpedia/extraction/spark/io/output/DBpediaCompositeOutputFormat.scala:
--------------------------------------------------------------------------------
1 | package org.dbpedia.extraction.spark.io.output
2 |
3 | import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat
4 | import org.dbpedia.extraction.spark.io.QuadSeqWritable
5 | import org.apache.hadoop.io.Text
6 | import org.apache.hadoop.mapreduce.{JobContext, RecordWriter, TaskAttemptContext}
7 | import scala.collection.mutable
8 | import org.dbpedia.extraction.destinations.formatters.UriPolicy
9 | import org.dbpedia.extraction.util.ConfigUtils
10 | import org.apache.commons.io.FilenameUtils
11 | import java.io.File
12 | import org.apache.hadoop.fs.{Path, FileSystem}
13 |
14 | /**
15 | * OutputFormat implementation that uses the configured Formatters to write Quads to respective datasets
16 | * through the DBpediaDatasetOutputFormat class. This class uses as many DBpediaDatasetOutputFormat objects
17 | * as there are configured formats. Formats are read in from the provided extraction config properties file.
18 | * This class handles configuration and Formatters, while DBpediaDatasetOutputFormat handles dividing the Quads
19 | * into datasets.
20 | *
21 | * 1. To use this OutputFormat the following keys need to be set in Hadoop's Configuration:
22 | * dbpedia.wiki.name - Config.wikiName, the wiki suffix (eg. wiki)
23 | * dbpedia.wiki.language.wikicode - Language wiki code of the input wiki dump
24 | * dbpedia.wiki.date - Wiki dump date in YYYYMMDD format
25 | * dbpedia.output.overwrite - Boolean, if set to true, existing output files will be overwritten;
26 | * otherwise an IOException is thrown if they already exist (the default behaviour) - this key is read by MultipleTextOutputFormat
27 | * dbpedia.config.properties - HDFS Path at which the extraction config properties file is stored
28 | *
29 | * 2. The extraction config properties file needs to be added to the distributed cache - the HDFS location should be
30 | * configured using dbpedia.config.properties.
31 | *
32 | * 3. Also, the output needs to be grouped by dataset such that each key is a Text representing the dataset
33 | * to which the Quads in the value belong to. Example key: article_categories
34 | *
35 | * NOTE: When using this with Spark, set only one core per worker.
36 | *
37 | * Output will look like Hadoop leaf files (eg. part-r-00000) inside directories like enwiki-20140614-article-categories.tql.
38 | * The files will be compressed using the specified compression codec.
39 | *
40 | * @see DBpediaDatasetOutputFormat
41 | */
42 | class DBpediaCompositeOutputFormat extends TextOutputFormat[Text, QuadSeqWritable]
43 | {
44 | private val CONFIG_PROPERTIES = "dbpedia.config.properties"
45 | private val WIKI = "dbpedia.wiki.name"
46 | private val LANGUAGE = "dbpedia.wiki.language.wikicode"
47 | private val DATE = "dbpedia.wiki.date"
48 |
49 | private class DBpediaCompositeRecordWriter(context: TaskAttemptContext) extends RecordWriter[Text, QuadSeqWritable]
50 | {
51 | private val recordWriters = mutable.Map[String, RecordWriter[Text, QuadSeqWritable]]()
52 | private val conf = context.getConfiguration
53 | private val configPropertiesDCPath = conf.get(CONFIG_PROPERTIES)
54 | private val wikiName = conf.get(WIKI)
55 | private val langCode = conf.get(LANGUAGE)
56 | private val date = conf.get(DATE)
57 | private val localConfigPropertiesFile = new Path("./config.properties")
58 | private val formatters =
59 | {
60 | // Deserialize the config Properties object to get the Formatters
61 | println(context.getCacheFiles.mkString("\n"))
62 | val configProperties = context.getCacheFiles.find(_.getPath == configPropertiesDCPath).get
63 |
64 | val fs = FileSystem.get(conf)
65 | // copy config file from distributed cache to raw local FS
66 | fs.copyToLocalFile(false, new Path(configProperties), localConfigPropertiesFile, true)
67 |
68 | val config = ConfigUtils.loadConfig(localConfigPropertiesFile.toString, "UTF-8")
69 | UriPolicy.parseFormats(config, "uri-policy", "format")
70 | }
71 |
72 | /**
73 | * Note: This method is not synchronized, keeping with the rest of the Hadoop code in this framework.
74 | * When using this with Spark set only one core per worker to ensure that only one thread accesses
75 | * this method per JVM.
76 | */
77 | override def write(key: Text, value: QuadSeqWritable)
78 | {
79 | for ((suffix, format) <- formatters)
80 | {
81 | // Each RecordWriter writes Quads to the corresponding datasets depending upon the Text key.
82 | // See DBpediaDatasetOutputFormat and MultipleTextOutputFormat for details.
83 | val writer = recordWriters.getOrElseUpdate(suffix, new DBpediaDatasetOutputFormat(
84 | langCode,
85 | wikiName,
86 | date,
87 | suffix,
88 | format
89 | ).getRecordWriter(context))
90 | writer.write(key, value)
91 | }
92 | }
93 |
94 | override def close(context: TaskAttemptContext) = recordWriters.foreach(_._2.close(context))
95 | }
96 |
97 | override def getRecordWriter(context: TaskAttemptContext): RecordWriter[Text, QuadSeqWritable] = new DBpediaCompositeRecordWriter(context)
98 |
99 | override def checkOutputSpecs(job: JobContext) = () // allow overwriting output directory
100 | }
101 |
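
A sketch of the Hadoop Configuration this OutputFormat expects, with placeholder values, plus registration of the extraction config in the distributed cache via a Hadoop Job:

import java.net.URI
import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.mapreduce.Job

object CompositeOutputConfSketch {
  def main(args: Array[String]): Unit = {
    val conf = new Configuration()
    conf.set("dbpedia.wiki.name", "wiki")                      // Config.wikiName suffix
    conf.set("dbpedia.wiki.language.wikicode", "en")           // language of the input dump
    conf.set("dbpedia.wiki.date", "20140614")                  // dump date, YYYYMMDD
    conf.setBoolean("dbpedia.output.overwrite", true)          // overwrite existing part files
    conf.set("dbpedia.config.properties", "hdfs:///dbpedia/config.properties") // placeholder HDFS path

    val job = Job.getInstance(conf)
    // the extraction config properties file must also be available via the distributed cache
    job.addCacheFile(new URI("hdfs:///dbpedia/config.properties"))
  }
}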
--------------------------------------------------------------------------------
/extraction/src/main/scala/org/dbpedia/extraction/spark/io/output/DBpediaDatasetOutputFormat.scala:
--------------------------------------------------------------------------------
1 | package org.dbpedia.extraction.spark.io.output
2 |
3 | import org.apache.hadoop.io.{Text, NullWritable}
4 | import org.dbpedia.extraction.destinations.formatters.Formatter
5 | import org.apache.hadoop.mapreduce.{TaskAttemptContext, RecordWriter}
6 | import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat.LineRecordWriter
7 | import org.dbpedia.extraction.spark.io.QuadSeqWritable
8 | import java.io.DataOutputStream
9 | import org.apache.hadoop.io.compress.CompressionCodec
10 |
11 | /**
12 | * OutputFormat implementation that writes Quads to respective datasets depending upon the key, after applying
13 | * a given Formatter. This class extends MultipleTextOutputFormat which allows it to write to multiple locations
14 | * (for multiple datasets) depending upon custom criteria.
15 | *
16 | * The output needs to be grouped by dataset such that each key is a Text representing the dataset to which
17 | * the Quads in the value belong to. Example key: article_categories
18 | *
19 | * @param langWikiCode Language wiki code of the input wiki dump
20 | * @param wikiNameSuffix Config.wikiName (eg. wiki)
21 | * @param date Wiki dump date in YYYYMMDD format
22 | * @param outputSuffix Output suffix corresponding to formatter (eg. tql)
23 | * @param formatter Formatter object used to render the Quad objects according to a specific format
24 | */
25 | class DBpediaDatasetOutputFormat(langWikiCode: String,
26 | wikiNameSuffix: String,
27 | date: String,
28 | outputSuffix: String,
29 | formatter: Formatter) extends MultipleTextOutputFormat[Text, QuadSeqWritable]
30 | {
31 | /**
32 | * Construct the underlying RecordWriter. By default this creates the LineRecordWriter
33 | * that TextOutputFormat itself uses.
34 | *
35 | * @param context TaskAttemptContext
36 | * @param out DataOutputStream where output data is written to
37 | * @param keyValueSeparator String separator between output key and value
38 | * @param codec Option[CompressionCodec] for handling compression
39 | * @return A RecordWriter object over the given DataOutputStream
40 | */
41 | override protected def getBaseRecordWriter(context: TaskAttemptContext,
42 | out: DataOutputStream,
43 | keyValueSeparator: String,
44 | codec: Option[CompressionCodec] = None): RecordWriter[Text, QuadSeqWritable] =
45 | {
46 | // Get a LineRecordWriter (the usual RecordWriter used by TextOutputFormat) that ignores keys and writes Text outputs.
47 | val lineWriter = codec match
48 | {
49 | case Some(c) =>
50 | // Have we an output compression codec?
51 | new LineRecordWriter[NullWritable, Text](
52 | new DataOutputStream(c.createOutputStream(out)),
53 | keyValueSeparator
54 | )
55 | case _ =>
56 | new LineRecordWriter[NullWritable, Text](out, keyValueSeparator)
57 | }
58 |
59 | new DBpediaDatasetRecordWriter(lineWriter)
60 | }
61 |
62 | /**
63 | * If inferCodecFromPathName is set to true, the output compression codec will be inferred from the suffix/extension
64 | * in pathName (eg. tql.gz implies GzipCodec is used), otherwise it uses Hadoop configuration settings.
65 | */
66 | override protected val inferCodecFromPathName = true
67 |
68 | /**
69 | * Generate the output file name (the directory where the leaf part-* files will be written to)
70 | * based on the given key and value. The default behavior is that the file name does not depend on them.
71 | * That is, by default this method returns an empty String.
72 | *
73 | * @param key the key of the output data
74 | * @return generated file name
75 | */
76 | override protected def generateFileNameForKeyValue(key: Text, value: QuadSeqWritable): String =
77 | {
78 | val datasetName = key.toString
79 | // eg. enwiki-20140614-article-categories.tql
80 | s"$langWikiCode$wikiNameSuffix-$date-${datasetName.replace('_', '-')}.$outputSuffix"
81 | }
82 |
83 | /**
84 | * RecordWriter that wraps a LineRecordWriter, applies the given Formatter on a Seq[Quad] and writes to
85 | * the LineRecordWriter.
86 | */
87 | private class DBpediaDatasetRecordWriter(lineWriter: LineRecordWriter[NullWritable, Text]) extends RecordWriter[Text, QuadSeqWritable]
88 | {
89 | private val text = new Text("")
90 | private val nullKey = NullWritable.get()
91 |
92 | // Begin writing split with formatter header
93 | text.set(formatter.header.dropRight(1)) // remove newline from header
94 | lineWriter.write(nullKey, text)
95 |
96 | /**
97 | * Note: This method is not synchronized, keeping with the rest of the Hadoop code in this framework.
98 | * When using this with Spark, set only one core per worker to ensure that only one thread accesses
99 | * this method per JVM.
100 | */
101 | override def write(key: Text, value: QuadSeqWritable) =
102 | {
103 | for (quad <- value.get)
104 | {
105 | text.set(formatter.render(quad).dropRight(1)) // remove newline from rendered output
106 | lineWriter.write(nullKey, text)
107 | }
108 | }
109 |
110 | override def close(context: TaskAttemptContext) =
111 | {
112 | text.set(formatter.footer.dropRight(1)) // remove newline from footer
113 | lineWriter.write(nullKey, text)
114 | lineWriter.close(context)
115 | }
116 | }
117 |
118 | }
119 |
--------------------------------------------------------------------------------
/extraction/src/main/scala/org/dbpedia/extraction/spark/io/output/MultipleTextOutputFormat.scala:
--------------------------------------------------------------------------------
1 | package org.dbpedia.extraction.spark.io.output
2 |
3 | import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat
4 | import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat._
5 | import org.apache.hadoop.mapreduce.{TaskAttemptContext, RecordWriter}
6 | import scala.collection.mutable
7 | import org.apache.hadoop.fs.Path
8 | import org.apache.hadoop.util.ReflectionUtils
9 | import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat.LineRecordWriter
10 | import java.io.DataOutputStream
11 | import org.apache.hadoop.io.compress.{CompressionCodecFactory, CompressionCodec}
12 |
13 | /**
14 | * This class extends TextOutputFormat and allows writing output to multiple files depending upon custom criteria.
15 | * It filters every key-value pair and routes it to the corresponding location.
16 | *
17 | * Configuration variables:
18 | * dbpedia.output.overwrite - Boolean, if set to true, output files will be overwritten if they already exist,
19 | * or else an IOException will be thrown (which is also the default behaviour)
20 | */
21 | class MultipleTextOutputFormat[K, V] extends TextOutputFormat[K, V]
22 | {
23 | private val OVERWRITE = "dbpedia.output.overwrite"
24 |
25 | private class MultipleTextRecordWriter(context: TaskAttemptContext) extends RecordWriter[K, V]
26 | {
27 | private val recordWriters = mutable.Map[String, RecordWriter[K, V]]()
28 |
29 | /**
30 | * Note: This method is not synchronized, keeping with the rest of the Hadoop code in this framework.
31 | * When using this with Spark, set only one core per worker to ensure that only one thread accesses
32 | * this method per JVM.
33 | */
34 | override def write(key: K, value: V)
35 | {
36 | // Generate the path depending upon key-value pair
37 | val finalPath = generateFileNameForKeyValue(key, value)
38 |
39 | // Extract the actual key and value
40 | val actualKey = generateActualKey(key, value)
41 | val actualValue = generateActualValue(key, value)
42 |
43 | // Get the RecordWriter for finalPath or create one if needed
44 | val writer = recordWriters.getOrElseUpdate(finalPath, createRecordWriter(finalPath, context))
45 | writer.write(actualKey, actualValue)
46 | }
47 |
48 | override def close(context: TaskAttemptContext) = recordWriters.foreach(_._2.close(context))
49 | }
50 |
51 | override def getRecordWriter(context: TaskAttemptContext): RecordWriter[K, V] = new MultipleTextRecordWriter(context)
52 |
53 | /**
54 | * Create a new RecordWriter based on the modified output path and the RecordWriter implementation
55 | * returned by getBaseRecordWriter().
56 | */
57 | private def createRecordWriter(pathName: String, context: TaskAttemptContext): RecordWriter[K, V] =
58 | {
59 | val conf = context.getConfiguration
60 | val keyValueSeparator = conf.get(TextOutputFormat.SEPERATOR, "\t")
61 | // If overwriteOutput is set to true, output files will be overwritten if they already exist,
62 | // or else an IOException will be thrown (which is also the default behaviour)
63 | val overwriteOutput = conf.getBoolean(OVERWRITE, false)
64 |
65 | val (codec, file) = if (inferCodecFromPathName)
66 | {
67 | val extension = pathName.substring(pathName.lastIndexOf('.'))
68 | // Get modified suffixed path
69 | val file = getModifiedWorkFile(pathName, context, extension)
70 | // Returns Option[CompressionCodec] or None depending on file extension
71 | val codec = Option(new CompressionCodecFactory(conf).getCodec(file))
72 | (codec, file)
73 | }
74 | else
75 | {
76 | val isCompressed = getCompressOutput(context)
77 | if (isCompressed)
78 | {
79 | // Get the CompressionCodec from job configuration
80 | val codecClass = getOutputCompressorClass(context, classOf[CompressionCodec])
81 | val codec = ReflectionUtils.newInstance(codecClass, conf)
82 | val file = getModifiedWorkFile(pathName, context, codec.getDefaultExtension)
83 | (Some(codec), file)
84 | }
85 | else
86 | {
87 | val file = getModifiedWorkFile(pathName, context, "")
88 | (None, file)
89 | }
90 | }
91 |
92 | val fs = file.getFileSystem(conf)
93 | val fileOutputStream = fs.create(file, overwriteOutput)
94 |
95 | getBaseRecordWriter(context, fileOutputStream, keyValueSeparator, codec)
96 | }
97 |
98 | /**
99 | * Gets the default output path and inserts directoryName between the parent directory and leaf file (part-*).
100 | */
101 | private def getModifiedWorkFile(directoryName: String,
102 | context: TaskAttemptContext,
103 | extension: String): Path =
104 | {
105 | val path = super.getDefaultWorkFile(context, extension)
106 | new Path(new Path(path.getParent, directoryName), path.getName)
107 | }
108 |
109 | /**
110 | * If inferCodecFromPathName is set to true, the output compression codec will be inferred from the suffix/extension
111 | * in pathName (eg. foobar.gz implies GzipCodec is used), otherwise it uses Hadoop configuration settings.
112 | *
113 | * The default behaviour is to use Hadoop configuration settings.
114 | */
115 | protected val inferCodecFromPathName: Boolean = false
116 |
117 | /**
118 | * Construct the underlying RecordWriter. By default this creates the LineRecordWriter
119 | * that TextOutputFormat itself uses.
120 | *
121 | * @param context TaskAttemptContext
122 | * @param out DataOutputStream where output data is written to
123 | * @param keyValueSeparator String separator between output key and value
124 | * @param codec Option[CompressionCodec] for handling compression
125 | * @return A RecordWriter object over the given DataOutputStream
126 | */
127 | protected def getBaseRecordWriter(context: TaskAttemptContext,
128 | out: DataOutputStream,
129 | keyValueSeparator: String,
130 | codec: Option[CompressionCodec] = None): RecordWriter[K, V] =
131 | {
132 | codec match
133 | {
134 | case Some(c) =>
135 | // Have we an output compression codec?
136 | new LineRecordWriter[K, V](
137 | new DataOutputStream(c.createOutputStream(out)),
138 | keyValueSeparator
139 | )
140 | case _ =>
141 | new LineRecordWriter[K, V](out, keyValueSeparator)
142 | }
143 | }
144 |
145 | /**
146 | * Generate the output file name (the directory where the leaf part-* files will be written to)
147 | * based on the given key and value. The default behavior is that the file name does not depend on them.
148 | * That is, by default this method returns an empty String.
149 | *
150 | * @param key the key of the output data
151 | * @return generated file name
152 | */
153 | protected def generateFileNameForKeyValue(key: K, value: V): String = ""
154 |
155 | /**
156 | * Generate the actual key from the given key/value. The default behavior is that
157 | * the actual key is equal to the given key.
158 | *
159 | * @param key the key of the output data
160 | * @param value the value of the output data
161 | * @return the actual key derived from the given key/value
162 | */
163 | protected def generateActualKey(key: K, value: V): K = key
164 |
165 | /**
166 | * Generate the actual value from the given key and value. The default behavior is that
167 | * the actual value is equal to the given value.
168 | *
169 | * @param key the key of the output data
170 | * @param value the value of the output data
171 | * @return the actual value derived from the given key/value
172 | */
173 | protected def generateActualValue(key: K, value: V): V = value
174 | }
175 |
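
A minimal subclass sketch: overriding generateFileNameForKeyValue is all that is needed to route records into per-key directories. The subclass below is hypothetical and only illustrates the extension point:

import org.apache.hadoop.io.Text
import org.dbpedia.extraction.spark.io.output.MultipleTextOutputFormat

class PerKeyTextOutputFormat extends MultipleTextOutputFormat[Text, Text] {
  // every distinct key gets its own sub-directory of part-* files
  override protected def generateFileNameForKeyValue(key: Text, value: Text): String =
    key.toString
}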
--------------------------------------------------------------------------------
/extraction/src/main/scala/org/dbpedia/extraction/spark/serialize/KryoExtractionRegistrator.scala:
--------------------------------------------------------------------------------
1 | package org.dbpedia.extraction.spark.serialize
2 |
3 | import org.apache.spark.serializer.KryoRegistrator
4 | import com.esotericsoftware.kryo.Kryo
5 | import scala.Console._
6 | import org.dbpedia.extraction.sources.WikiPage
7 | import org.dbpedia.extraction.wikiparser.{Namespace, WikiTitle}
8 | import org.dbpedia.extraction.util.Language
9 | import java.util.logging.Logger
10 | import org.dbpedia.extraction.dataparser.ParserUtils
11 |
12 | /**
13 | * It's best to register the classes that will be serialized/deserialized with Kryo.
14 | */
15 | class KryoExtractionRegistrator extends KryoRegistrator
16 | {
17 | override def registerClasses(kryo: Kryo)
18 | {
19 | kryo.register(classOf[Array[Object]])
20 | kryo.register(classOf[org.dbpedia.extraction.dataparser.GeoCoordinateParser])
21 | kryo.register(classOf[org.dbpedia.extraction.dataparser.SingleGeoCoordinateParser])
22 | kryo.register(classOf[org.dbpedia.extraction.destinations.Dataset])
23 | kryo.register(classOf[org.dbpedia.extraction.destinations.Quad])
24 | kryo.register(classOf[org.dbpedia.extraction.dump.extract.DistConfigLoader])
25 | kryo.register(classOf[org.dbpedia.extraction.dump.extract.DumpExtractionContext])
26 | kryo.register(classOf[org.dbpedia.extraction.dump.extract.DumpExtractionContextWrapper])
27 | kryo.register(classOf[org.dbpedia.extraction.mappings.ArticleCategoriesExtractor])
28 | kryo.register(classOf[org.dbpedia.extraction.mappings.ArticlePageExtractor])
29 | kryo.register(classOf[org.dbpedia.extraction.mappings.ArticleTemplatesExtractor])
30 | kryo.register(classOf[org.dbpedia.extraction.mappings.CategoryLabelExtractor])
31 | kryo.register(classOf[org.dbpedia.extraction.mappings.CompositeParseExtractor])
32 | kryo.register(classOf[org.dbpedia.extraction.mappings.DistRedirects])
33 | kryo.register(classOf[org.dbpedia.extraction.mappings.ExternalLinksExtractor])
34 | kryo.register(classOf[org.dbpedia.extraction.mappings.GeoExtractor])
35 | kryo.register(classOf[org.dbpedia.extraction.mappings.InfoboxExtractor])
36 | kryo.register(classOf[org.dbpedia.extraction.mappings.InterLanguageLinksExtractor])
37 | kryo.register(classOf[org.dbpedia.extraction.mappings.LabelExtractor])
38 | kryo.register(classOf[org.dbpedia.extraction.mappings.PageIdExtractor])
39 | kryo.register(classOf[org.dbpedia.extraction.mappings.PageLinksExtractor])
40 | kryo.register(classOf[org.dbpedia.extraction.mappings.ProvenanceExtractor])
41 | kryo.register(classOf[org.dbpedia.extraction.mappings.RedirectExtractor])
42 | kryo.register(classOf[org.dbpedia.extraction.mappings.Redirects])
43 | kryo.register(classOf[org.dbpedia.extraction.mappings.RevisionIdExtractor])
44 | kryo.register(classOf[org.dbpedia.extraction.mappings.RootExtractor])
45 | kryo.register(classOf[org.dbpedia.extraction.mappings.SkosCategoriesExtractor])
46 | kryo.register(classOf[org.dbpedia.extraction.dataparser.ParserUtils])
47 | kryo.register(classOf[org.dbpedia.extraction.ontology.datatypes.Datatype])
48 | kryo.register(classOf[org.dbpedia.extraction.ontology.OntologyClass])
49 | kryo.register(classOf[org.dbpedia.extraction.ontology.OntologyDatatypeProperty])
50 | kryo.register(classOf[org.dbpedia.extraction.ontology.OntologyObjectProperty])
51 | kryo.register(classOf[org.dbpedia.extraction.ontology.OntologyProperty])
52 | kryo.register(Class.forName("scala.collection.immutable.$colon$colon"))
53 | kryo.register(Class.forName("scala.collection.immutable.Map$EmptyMap$"))
54 | kryo.register(Class.forName("scala.collection.immutable.Nil$"))
55 | kryo.register(Class.forName("scala.collection.immutable.Set$EmptySet$"))
56 | kryo.register(classOf[scala.collection.mutable.ArrayBuffer[_]])
57 | kryo.register(classOf[Array[scala.collection.Seq[_]]])
58 | kryo.register(classOf[scala.runtime.BoxedUnit])
59 | kryo.register(classOf[Array[scala.Tuple2[_,_]]])
60 | kryo.register(classOf[scala.util.matching.Regex])
61 | kryo.register(classOf[WikiPage], new WikiPageSerializer)
62 | kryo.register(classOf[WikiTitle], new WikiTitleSerializer)
63 | kryo.register(classOf[Namespace])
64 | kryo.register(classOf[Language], new LanguageSerializer)
65 | kryo.register(classOf[Logger], new LoggerSerializer)
66 | kryo.register(classOf[ParserUtils], new ParserUtilsSerializer)
67 | }
68 | }
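
For Spark to actually use this registrator, the standard Kryo settings have to be present in the SparkConf. A minimal sketch (the application name is a placeholder):

import org.apache.spark.SparkConf

object KryoConfSketch {
  val conf = new SparkConf()
    .setAppName("dbpedia-distributed-extraction")
    .set("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
    .set("spark.kryo.registrator", "org.dbpedia.extraction.spark.serialize.KryoExtractionRegistrator")
}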
--------------------------------------------------------------------------------
/extraction/src/main/scala/org/dbpedia/extraction/spark/serialize/KryoSerializationWrapper.scala:
--------------------------------------------------------------------------------
1 | package org.dbpedia.extraction.spark.serialize
2 |
3 | import scala.reflect.ClassTag
4 |
5 | /**
6 | * A wrapper around an otherwise unserializable object that makes it Java
7 | * serializable. Internally, Kryo is used for serialization.
8 | *
9 | * Use KryoSerializationWrapper(value) to create a wrapper.
10 | */
11 | class KryoSerializationWrapper[T: ClassTag] extends Serializable
12 | {
13 |
14 | @transient var value: T = _
15 |
16 | private var valueSerialized: Array[Byte] = _
17 |
18 | // The getter and setter for valueSerialized are used for XML serialization.
19 | def getValueSerialized(): Array[Byte] =
20 | {
21 | valueSerialized = KryoSerializer.serialize(value)
22 | valueSerialized
23 | }
24 |
25 | def setValueSerialized(bytes: Array[Byte]) =
26 | {
27 | valueSerialized = bytes
28 | value = KryoSerializer.deserialize[T](valueSerialized)
29 | }
30 |
31 | // Used for Java serialization.
32 | private def writeObject(out: java.io.ObjectOutputStream)
33 | {
34 | getValueSerialized()
35 | out.defaultWriteObject()
36 | }
37 |
38 | private def readObject(in: java.io.ObjectInputStream)
39 | {
40 | in.defaultReadObject()
41 | setValueSerialized(valueSerialized)
42 | }
43 | }
44 |
45 |
46 | object KryoSerializationWrapper
47 | {
48 | def apply[T: ClassTag](value: T): KryoSerializationWrapper[T] =
49 | {
50 | val wrapper = new KryoSerializationWrapper[T]
51 | wrapper.value = value
52 | wrapper
53 | }
54 | }
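
A usage sketch of the wrapper: a value that is not java.io.Serializable survives plain Java serialization once wrapped, which is the same trick DistExtractionJob uses when broadcasting the extractor. The Counter class is made up for illustration, and the sketch assumes Kryo's default field serialization can handle it:

import java.io.{ByteArrayInputStream, ByteArrayOutputStream, ObjectInputStream, ObjectOutputStream}
import org.dbpedia.extraction.spark.serialize.KryoSerializationWrapper

object WrapperSketch {
  // deliberately NOT java.io.Serializable
  class Counter { var n = 0; def inc(): Unit = n += 1 }

  def main(args: Array[String]): Unit = {
    val counter = new Counter
    counter.n = 41
    val wrapped = KryoSerializationWrapper(counter)

    // Java-serialize the wrapper; internally it Kryo-serializes the wrapped value.
    val bos = new ByteArrayOutputStream()
    val oos = new ObjectOutputStream(bos)
    oos.writeObject(wrapped)
    oos.close()

    val ois = new ObjectInputStream(new ByteArrayInputStream(bos.toByteArray))
    val copy = ois.readObject().asInstanceOf[KryoSerializationWrapper[Counter]]
    copy.value.inc()
    println(copy.value.n) // 42
  }
}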
--------------------------------------------------------------------------------
/extraction/src/main/scala/org/dbpedia/extraction/spark/serialize/KryoSerializer.scala:
--------------------------------------------------------------------------------
1 | package org.dbpedia.extraction.spark.serialize
2 |
3 | import java.nio.ByteBuffer
4 |
5 | import org.apache.spark.{SparkConf, SparkEnv}
6 | import org.apache.spark.serializer.{KryoSerializer => SparkKryoSerializer}
7 | import scala.reflect.ClassTag
8 |
9 |
10 | /**
11 | * Java object serialization using Kryo. This is much more efficient than plain
12 | * Java serialization, but Kryo can sometimes be tricky to use. We use this mainly
13 | * to serialize objects wrapped by KryoSerializationWrapper.
14 | */
15 | object KryoSerializer
16 | {
17 |
18 | @transient lazy val ser: SparkKryoSerializer =
19 | {
20 | val sparkConf = Option(SparkEnv.get).map(_.conf).getOrElse(new SparkConf())
21 | new SparkKryoSerializer(sparkConf)
22 | }
23 |
24 | def serialize[T: ClassTag](o: T): Array[Byte] =
25 | {
26 | ser.newInstance().serialize(o).array()
27 | }
28 |
29 | def deserialize[T: ClassTag](bytes: Array[Byte]): T =
30 | {
31 | ser.newInstance().deserialize[T](ByteBuffer.wrap(bytes))
32 | }
33 | }
--------------------------------------------------------------------------------
/extraction/src/main/scala/org/dbpedia/extraction/spark/serialize/LanguageSerializer.scala:
--------------------------------------------------------------------------------
1 | package org.dbpedia.extraction.spark.serialize
2 |
3 | import com.esotericsoftware.kryo.{Kryo, Serializer}
4 | import com.esotericsoftware.kryo.io.{Input, Output}
5 | import org.dbpedia.extraction.util.Language
6 |
7 | /**
8 | * Kryo serializer for org.dbpedia.extraction.util.Language
9 | */
10 | class LanguageSerializer extends Serializer[Language]
11 | {
12 | override def write(kryo: Kryo, output: Output, language: Language)
13 | {
14 | output.writeString(language.wikiCode)
15 | }
16 |
17 | override def read(kryo: Kryo, input: Input, languageClass: Class[Language]): Language =
18 | {
19 | val wikiCode = input.readString()
20 | Language(wikiCode)
21 | }
22 | }
23 |
--------------------------------------------------------------------------------
/extraction/src/main/scala/org/dbpedia/extraction/spark/serialize/LocaleSerializer.scala:
--------------------------------------------------------------------------------
1 | package org.dbpedia.extraction.spark.serialize
2 |
3 | import com.esotericsoftware.kryo.{Kryo, Serializer}
4 | import com.esotericsoftware.kryo.io.{Output, Input}
5 | import java.util.Locale
6 |
7 | /**
8 | * Kryo serializer for java.util.Locale
9 | */
10 | class LocaleSerializer extends Serializer[Locale]
11 | {
12 | override def write(kryo: Kryo, output: Output, locale: Locale)
13 | {
14 | output.writeAscii(locale.getLanguage)
15 | output.writeAscii(locale.getCountry)
16 | output.writeAscii(locale.getVariant)
17 | }
18 |
19 | override def read(kryo: Kryo, input: Input, localeClass: Class[Locale]): Locale =
20 | {
21 | new Locale(input.readString(), input.readString(), input.readString())
22 | }
23 | }
24 |
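
A round-trip sketch for one of these custom serializers, using LocaleSerializer because it needs nothing beyond the JDK and Kryo. The Kryo instance and registration here are illustrative; the framework wires serializers up through KryoExtractionRegistrator:

import java.io.{ByteArrayInputStream, ByteArrayOutputStream}
import java.util.Locale
import com.esotericsoftware.kryo.Kryo
import com.esotericsoftware.kryo.io.{Input, Output}
import org.dbpedia.extraction.spark.serialize.LocaleSerializer

object LocaleSerializerSketch {
  def main(args: Array[String]): Unit = {
    val kryo = new Kryo()
    kryo.register(classOf[Locale], new LocaleSerializer)

    val bos = new ByteArrayOutputStream()
    val output = new Output(bos)
    kryo.writeObject(output, Locale.GERMANY)
    output.close()

    val input = new Input(new ByteArrayInputStream(bos.toByteArray))
    val copy = kryo.readObject(input, classOf[Locale])
    input.close()
    println(copy) // de_DE
  }
}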
--------------------------------------------------------------------------------
/extraction/src/main/scala/org/dbpedia/extraction/spark/serialize/LoggerSerializer.scala:
--------------------------------------------------------------------------------
1 | package org.dbpedia.extraction.spark.serialize
2 |
3 | import com.esotericsoftware.kryo.{Kryo, Serializer}
4 | import com.esotericsoftware.kryo.io.{Input, Output}
5 | import org.dbpedia.extraction.util.Language
6 | import java.util.logging.Logger
7 |
8 | /**
9 | * Kryo serializer for java.util.logging.Logger
10 | */
11 | class LoggerSerializer extends Serializer[Logger]
12 | {
13 | override def write(kryo: Kryo, output: Output, logger: Logger)
14 | {
15 | output.writeString(logger.getName)
16 | }
17 |
18 | override def read(kryo: Kryo, input: Input, loggerClass: Class[Logger]): Logger =
19 | {
20 | val className = input.readString()
21 | Logger.getLogger(className)
22 | }
23 | }
24 |
--------------------------------------------------------------------------------
/extraction/src/main/scala/org/dbpedia/extraction/spark/serialize/ParserUtilsSerializer.scala:
--------------------------------------------------------------------------------
1 | package org.dbpedia.extraction.spark.serialize
2 |
3 | import com.esotericsoftware.kryo.{Kryo, Serializer}
4 | import com.esotericsoftware.kryo.io.{Input, Output}
5 | import org.dbpedia.extraction.dataparser.ParserUtils
6 | import org.dbpedia.extraction.util.Language
7 |
8 | /**
9 | * Kryo serializer for org.dbpedia.extraction.dataparser.ParserUtils
10 | */
11 | class ParserUtilsSerializer extends Serializer[ParserUtils]
12 | {
13 | override def write(kryo: Kryo, output: Output, parserUtils: ParserUtils) {
14 | kryo.writeObjectOrNull(output, parserUtils.context.language, new LanguageSerializer)
15 | }
16 |
17 | override def read(kryo: Kryo, input: Input, parserUtilsClass: Class[ParserUtils]): ParserUtils = {
18 | val lang = kryo.readObjectOrNull(input, classOf[Language], new LanguageSerializer)
19 | new ParserUtils(new {def language: Language = lang})
20 | }
21 | }
--------------------------------------------------------------------------------
/extraction/src/main/scala/org/dbpedia/extraction/spark/serialize/WikiPageSerializer.scala:
--------------------------------------------------------------------------------
1 | package org.dbpedia.extraction.spark.serialize
2 |
3 | import com.esotericsoftware.kryo.{Kryo, Serializer}
4 | import org.dbpedia.extraction.sources.WikiPage
5 | import com.esotericsoftware.kryo.io.{Output, Input}
6 | import org.dbpedia.extraction.wikiparser.WikiTitle
7 |
8 | /**
9 | * Kryo serializer for org.dbpedia.extraction.sources.WikiPage
10 | */
11 | class WikiPageSerializer extends Serializer[WikiPage]
12 | {
13 | override def write(kryo: Kryo, output: Output, wikiPage: WikiPage)
14 | {
15 | kryo.writeObjectOrNull(output, wikiPage.title, new WikiTitleSerializer)
16 | kryo.writeObjectOrNull(output, wikiPage.redirect, new WikiTitleSerializer)
17 | output.writeLong(wikiPage.id)
18 | output.writeLong(wikiPage.revision)
19 | output.writeLong(wikiPage.timestamp)
20 | output.writeLong(wikiPage.contributorID)
21 | output.writeString(wikiPage.contributorName)
22 | output.writeString(wikiPage.source)
23 | output.writeString(wikiPage.format)
24 | }
25 |
26 | override def read(kryo: Kryo, input: Input, wikiPageClass: Class[WikiPage]): WikiPage =
27 | {
28 | val title = kryo.readObjectOrNull(input, classOf[WikiTitle], new WikiTitleSerializer)
29 | val redirect = kryo.readObjectOrNull(input, classOf[WikiTitle], new WikiTitleSerializer)
30 | val id = input.readLong()
31 | val revision = input.readLong()
32 | val timestamp = input.readLong()
33 | val contributorID = input.readLong()
34 | val contributorName = input.readString()
35 | val source = input.readString()
36 | val format = input.readString()
37 | new WikiPage(title, redirect, id, revision, timestamp, contributorID, contributorName, source, format)
38 | }
39 | }
40 |
--------------------------------------------------------------------------------
/extraction/src/main/scala/org/dbpedia/extraction/spark/serialize/WikiTitleSerializer.scala:
--------------------------------------------------------------------------------
1 | package org.dbpedia.extraction.spark.serialize
2 |
3 | import com.esotericsoftware.kryo.{Kryo, Serializer}
4 | import com.esotericsoftware.kryo.serializers.FieldSerializer
5 | import com.esotericsoftware.kryo.io.{Input, Output}
6 | import org.dbpedia.extraction.wikiparser.{Namespace, WikiTitle}
7 | import org.dbpedia.extraction.util.Language
8 |
9 | /**
10 | * Kryo serializer for org.dbpedia.extraction.wikiparser.WikiTitle
11 | */
12 | class WikiTitleSerializer extends Serializer[WikiTitle]
13 | {
14 | override def write(kryo: Kryo, output: Output, wikiTitle: WikiTitle)
15 | {
16 | output.writeString(wikiTitle.decoded)
17 | kryo.writeObjectOrNull(output, wikiTitle.language, new LanguageSerializer)
18 | kryo.writeObjectOrNull(output, wikiTitle.namespace, new FieldSerializer(kryo, classOf[Namespace]))
19 | output.writeBoolean(wikiTitle.isInterLanguageLink)
20 | output.writeString(wikiTitle.fragment)
21 | }
22 |
23 | override def read(kryo: Kryo, input: Input, wikiTitleClass: Class[WikiTitle]): WikiTitle =
24 | {
25 | val decoded = input.readString()
26 | val language = kryo.readObjectOrNull(input, classOf[Language], new LanguageSerializer)
27 | val namespace = kryo.readObjectOrNull(input, classOf[Namespace], new FieldSerializer(kryo, classOf[Namespace]))
28 | val isInterLanguageLink = input.readBoolean()
29 | val fragment = input.readString()
30 | new WikiTitle(decoded, namespace, language, isInterLanguageLink, fragment)
31 | }
32 | }
33 |
--------------------------------------------------------------------------------
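In the framework these serializers are wired up by KryoExtractionRegistrator (listed under spark/serialize above); the sketch below registers WikiTitleSerializer by hand purely to show the delegation it performs: the language goes through LanguageSerializer and the namespace through a FieldSerializer, exactly as in the write/read methods above. The constructor call mirrors WikiTitleSerializer.read; the object name is made up, and this is an illustration rather than the registrator's actual contents.

```scala
import java.io.{ByteArrayInputStream, ByteArrayOutputStream}

import com.esotericsoftware.kryo.Kryo
import com.esotericsoftware.kryo.io.{Input, Output}
import org.dbpedia.extraction.spark.serialize.WikiTitleSerializer
import org.dbpedia.extraction.util.Language
import org.dbpedia.extraction.wikiparser.{Namespace, WikiTitle}

object WikiTitleRoundTrip {
  def main(args: Array[String]): Unit = {
    val kryo = new Kryo()
    kryo.register(classOf[WikiTitle], new WikiTitleSerializer)

    // Same argument order as WikiTitleSerializer.read: decoded title, namespace,
    // language, inter-language-link flag, fragment (null is handled by writeString).
    val title = new WikiTitle("Berlin", Namespace.Main, Language("en"), false, null)

    val buffer = new ByteArrayOutputStream()
    val output = new Output(buffer)
    kryo.writeObject(output, title)
    output.close()

    val restored = kryo.readObject(new Input(new ByteArrayInputStream(buffer.toByteArray)), classOf[WikiTitle])
    println(restored.decoded) // Berlin
  }
}
```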
/extraction/src/main/scala/org/dbpedia/extraction/util/DistIOUtils.scala:
--------------------------------------------------------------------------------
1 | package org.dbpedia.extraction.util
2 |
3 | import com.esotericsoftware.kryo.Kryo
4 | import org.dbpedia.extraction.spark.serialize.KryoSerializer
5 | import org.apache.spark.SparkContext
6 | import org.apache.spark.rdd.RDD
7 | import org.apache.hadoop.io.{BytesWritable, NullWritable}
8 | import java.io.{ByteArrayInputStream, ByteArrayOutputStream}
9 | import com.esotericsoftware.kryo.io.{Input, Output}
10 | import org.apache.spark.SparkContext._
11 | import scala.reflect.ClassTag
12 | import org.apache.hadoop.fs.Path
13 | import org.apache.hadoop.conf.Configuration
14 | import org.apache.hadoop.mapreduce.Job
15 | import org.apache.hadoop.mapreduce.lib.input.{SequenceFileInputFormat, FileInputFormat}
16 |
17 | /**
18 | * Kryo file operations helper methods
19 | */
20 | object DistIOUtils
21 | {
22 | private val kryo: ThreadLocal[Kryo] = new ThreadLocal[Kryo]
23 | {
24 | override def initialValue = getNewKryo()
25 | }
26 |
27 | /**
28 | * @return returns a thread-local instance of Kryo
29 | */
30 | def getKryoInstance: Kryo = kryo.get()
31 |
32 | /**
33 | * @return new Kryo instance.
34 | */
35 | def getNewKryo(): Kryo = KryoSerializer.ser.newKryo()
36 |
37 | /**
38 | * Loads an RDD saved as a SequenceFile containing objects serialized by Kryo,
39 | * with NullWritable keys and BytesWritable values.
40 | * @param sc SparkContext
41 | * @param path String path to existing file. Can be on local file system or HDFS, S3 etc. See Spark docs.
42 | * @return deserialized RDD
43 | */
44 | def loadRDD[T: ClassTag](sc: SparkContext, rddClass: Class[T], path: String): RDD[T] =
45 | {
46 | val arrayOfRddClass = Class.forName("[L" + rddClass.getName + ";")
47 | val serializedRDD = sc.sequenceFile(path, classOf[NullWritable], classOf[BytesWritable])
48 | serializedRDD.values.flatMap(x => deserialize(x.getBytes, arrayOfRddClass).asInstanceOf[Array[T]])
49 | }
50 |
51 | /**
52 | * Loads an RDD saved as a SequenceFile containing objects serialized by Kryo,
53 | * with NullWritable keys and BytesWritable values.
54 | * @param sc SparkContext
55 |    * @param path Hadoop Path to an existing file. Can be on local file system or HDFS, S3 etc. See Spark docs.
56 | * @return deserialized RDD
57 | */
58 | def loadRDD[T: ClassTag](sc: SparkContext, rddClass: Class[T], path: Path): RDD[T] =
59 | {
60 | val arrayOfRddClass = Class.forName("[L" + rddClass.getName + ";")
61 | val conf = new Configuration()
62 | val job = Job.getInstance(conf)
63 | FileInputFormat.addInputPath(job, path)
64 | val updatedConf = job.getConfiguration
65 | val serializedRDD = sc.newAPIHadoopRDD(updatedConf, classOf[SequenceFileInputFormat[NullWritable, BytesWritable]], classOf[NullWritable], classOf[BytesWritable])
66 | serializedRDD.values.flatMap(x => deserialize(x.getBytes, arrayOfRddClass).asInstanceOf[Array[T]])
67 | }
68 |
69 | /**
70 | * Saves an RDD as a SequenceFile containing objects serialized by Kryo,
71 | * with NullWritable keys and BytesWritable values.
72 | * @param rdd Spark RDD
73 |    * @param path String path to write the SequenceFile to. Can be on local file system or HDFS, S3 etc. See Spark docs.
74 | */
75 | def saveRDD(rdd: RDD[_ <: AnyRef], path: String)
76 | {
77 | rdd.mapPartitions(iter => iter.grouped(50).map(_.toArray))
78 | .map(x => (NullWritable.get(), new BytesWritable(serialize(x)))).saveAsSequenceFile(path)
79 | }
80 |
81 | /**
82 | * Saves an RDD as a SequenceFile containing objects serialized by Kryo,
83 | * with NullWritable keys and BytesWritable values.
84 | * @param rdd Spark RDD
85 |    * @param path Hadoop Path to write the SequenceFile to. Can be on local file system or HDFS, S3 etc. See Spark docs.
86 | */
87 | def saveRDD(rdd: RDD[_ <: AnyRef], path: Path)
88 | {
89 | rdd.mapPartitions(iter => iter.grouped(50).map(_.toArray))
90 | .map(x => (NullWritable.get(), new BytesWritable(serialize(x)))).saveAsSequenceFile(path.toString)
91 | }
92 |
93 | // TODO: Add unit tests with code similar to:
94 | // /**
95 | // * Temporary method to test if serialization-deserialization works properly
96 | // */
97 | // def testSerDe(rdd: RDD[_ <: AnyRef], path: String) {
98 | // val serialized = rdd.map(x => (NullWritable.get(), new BytesWritable(serialize(x))))
99 | // serialized.saveAsSequenceFile(path)
100 | //
101 | // val deserialized : RDD[_ <: AnyRef] = serialized.values.map(x => {
102 | // deserialize(x.getBytes, classOf[WikiPage]).asInstanceOf[WikiPage]
103 | // })
104 | //
105 | // //Assertions below to test if (de)serialization works properly.
106 | // assert(deserialized.first().toString == rdd.first().toString)
107 | // assert(deserialized.count() == rdd.count())
108 | // }
109 | //
110 | // /**
111 | // * Temporary method to test if saveAsKryoFile() and openFromKryoFile() work consistently.
112 | // */
113 | // def testSaveOpen(sc: SparkContext, rdd: RDD[_ <: WikiPage], path: String) {
114 | // saveRDD(rdd, path)
115 | // val deserialized = loadRDD(sc, path)
116 | //
117 | // //Test to ensure we're saving as many WikiPages as we're retrieving after deserialization
118 | // assert(deserialized.count() == rdd.count())
119 | // }
120 |
121 | /**
122 | * @param x Any object
123 | * @return serialized Array of Bytes
124 | */
125 | def serialize(x: Any): Array[Byte] =
126 | {
127 | val stream = new ByteArrayOutputStream()
128 | val output = new Output(stream)
129 | getKryoInstance.writeObject(output, x)
130 | output.close()
131 | stream.toByteArray
132 | }
133 |
134 | /**
135 | * @param x Array of Bytes - serialized version of an object
136 | * @param c Class of the object
137 | * @return the object deserialized by Kryo
138 | */
139 | def deserialize[T](x: Array[Byte], c: Class[T]) =
140 | {
141 | getKryoInstance.readObject(new Input(new ByteArrayInputStream(x)), c)
142 | }
143 | }
--------------------------------------------------------------------------------
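DistIOUtils is how the distributed extraction caches intermediate RDDs as Kryo-encoded SequenceFiles (DistRedirects, for instance, caches template redirects this way). A minimal save/load round trip could look like the sketch below; the output path, the hand-built WikiPage and the object name are made up, the WikiPage/WikiTitle constructor calls mirror the serializers' read methods above, and the Kryo registrations are assumed to come from KryoSerializer/KryoExtractionRegistrator as elsewhere in this module.

```scala
import org.apache.spark.{SparkConf, SparkContext}
import org.dbpedia.extraction.sources.WikiPage
import org.dbpedia.extraction.util.{DistIOUtils, Language}
import org.dbpedia.extraction.wikiparser.{Namespace, WikiTitle}

object DistIOUtilsSketch {
  def main(args: Array[String]): Unit = {
    // Mirror the serializer settings that SparkUtils.getSparkContext applies.
    val conf = new SparkConf()
      .setMaster("local[2]")
      .setAppName("DistIOUtilsSketch")
      .set("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
      .set("spark.kryo.registrator", "org.dbpedia.extraction.spark.serialize.KryoExtractionRegistrator")
    val sc = new SparkContext(conf)

    // A hand-built page, using the same constructor shape as WikiPageSerializer.read.
    val title = new WikiTitle("Berlin", Namespace.Main, Language("en"), false, null)
    val page = new WikiPage(title, null, 1L, 1L, System.currentTimeMillis, 0L, "ExampleUser", "[[Category:City]]", "text/x-wiki")
    val pages = sc.parallelize(Seq(page))

    // saveRDD chunks each partition into groups of 50, Kryo-serializes every chunk into a
    // BytesWritable and writes it as one SequenceFile record with a NullWritable key.
    val cachePath = "/tmp/wikipages-kryo-seqfile" // hypothetical location
    DistIOUtils.saveRDD(pages, cachePath)

    // loadRDD reverses the process: each record becomes an Array[WikiPage] that is flattened back into an RDD.
    val restored = DistIOUtils.loadRDD(sc, classOf[WikiPage], cachePath)
    println(restored.count()) // 1
    sc.stop()
  }
}
```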
/extraction/src/main/scala/org/dbpedia/extraction/util/SparkUtils.scala:
--------------------------------------------------------------------------------
1 | package org.dbpedia.extraction.util
2 |
3 | import org.apache.spark.{Logging, SparkContext, SparkConf}
4 | import org.dbpedia.extraction.dump.extract.DistConfig
5 | import org.apache.log4j.{Logger, Level}
6 | import java.nio.file.{Paths, Files}
7 | import java.io.FileNotFoundException
8 | import scala.reflect.ClassTag
9 | import org.apache.spark.rdd.RDD
10 | import org.dbpedia.extraction.spark.serialize.KryoSerializationWrapper
11 | import org.apache.spark.ui.jobs.DBpediaJobProgressListener
12 |
13 | /**
14 | * Utility functions specific to Spark
15 | */
16 | object SparkUtils
17 | {
18 | /**
19 | * Stores the SparkContext instance.
20 | */
21 | private var sc: SparkContext = null
22 |
23 | /**
24 | * Set all loggers to the given log level. Returns a map of the value of every logger
25 | * @param level
26 | * @param loggers
27 | * @return
28 | */
29 | def setLogLevels(level: org.apache.log4j.Level, loggers: TraversableOnce[String]) =
30 | {
31 | loggers.map
32 | {
33 | loggerName =>
34 | val logger = Logger.getLogger(loggerName)
35 | val prevLevel = logger.getLevel()
36 | logger.setLevel(level)
37 | loggerName -> prevLevel
38 | }.toMap
39 | }
40 |
41 | /**
42 | * Sets log levels for Spark and its peripheral libraries to DistConfig.sparkLogLevel.
43 | */
44 | def setSparkLogLevels(config: DistConfig) =
45 | {
46 | setLogLevels(config.sparkLogLevel, Seq("org.apache", "spark", "org.eclipse.jetty", "akka"))
47 | }
48 |
49 | /**
50 | * Creates and returns a new SparkContext taking configuration info from Config
51 | * @param config
52 | * @return
53 | */
54 | def getSparkContext(config: DistConfig) =
55 | synchronized
56 | {
57 | if (sc == null)
58 | {
59 | val conf = new SparkConf().setMaster(config.sparkMaster).setAppName(config.sparkAppName)
60 | for ((property, value) <- config.sparkProperties)
61 | conf.set(property, value)
62 | conf.setSparkHome(config.sparkHome)
63 | val distJarName = if (Files.exists(Paths.get("target/extraction-4.1-SNAPSHOT.jar")))
64 | {
65 | "target/extraction-4.1-SNAPSHOT.jar"
66 | } else if (Files.exists(Paths.get("extraction/target/extraction-4.1-SNAPSHOT.jar")))
67 | {
68 | "extraction/target/extraction-4.1-SNAPSHOT.jar"
69 | } else
70 | {
71 |             throw new FileNotFoundException("extraction-4.1-SNAPSHOT.jar cannot be found in extraction/target. Please run mvn install -Dmaven.test.skip=true to build the JAR first.")
72 | }
73 |
74 | conf.setJars(List(distJarName))
75 | conf.set("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
76 | conf.set("spark.kryo.registrator", "org.dbpedia.extraction.spark.serialize.KryoExtractionRegistrator")
77 | conf.set("spark.kryoserializer.buffer.mb", "100")
78 | sc = new SparkContext(conf)
79 | // No logging is done upon omitting 'with Logging' - some package problem?
80 | setLogLevels(Level.INFO, Seq("org.apache.spark.ui.jobs.DBpediaJobProgressListener"))
81 | sc.addSparkListener(new DBpediaJobProgressListener(conf))
82 | }
83 | sc
84 | }
85 |
86 | /**
87 |    * Returns an iterator that contains all of the elements of the given RDD.
88 | * The iterator will consume as much memory as the largest partition in the RDD.
89 | *
90 | * @param rdd
91 | * @return iterator for rdd's elements
92 | */
93 | def rddToLocalIterator[T: ClassTag](rdd: RDD[T]): Iterator[T] =
94 | {
95 | def collectPartition(p: Int): Array[T] =
96 | {
97 | sc.runJob(rdd, (iter: Iterator[T]) => iter.toArray, Seq(p), allowLocal = false).head
98 | }
99 | (0 until rdd.partitions.length).iterator.flatMap(i => collectPartition(i))
100 | }
101 |
102 | /**
103 | * Returns the function object wrapped inside a KryoSerializationWrapper.
104 | * This is useful for having Kryo-serialization for Spark closures.
105 | *
106 | * @param function
107 | * @return
108 | */
109 | def kryoWrapFunction[T, U](function: (T => U)): (T => U) =
110 | {
111 | def genMapper(kryoWrapper: KryoSerializationWrapper[(T => U)])(input: T): U =
112 | {
113 | kryoWrapper.value.apply(input)
114 | }
115 |
116 | genMapper(KryoSerializationWrapper(function)) _
117 | }
118 | }
119 |
--------------------------------------------------------------------------------
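Of these helpers, kryoWrapFunction is the one extraction code touches most often: it ships a closure to the executors inside a KryoSerializationWrapper, so the function itself is (de)serialized with Kryo rather than Java serialization. A small usage sketch with a throwaway local context and a toy function (both made up for illustration):

```scala
import org.apache.spark.{SparkConf, SparkContext}
import org.dbpedia.extraction.util.SparkUtils

object KryoWrapSketch {
  def main(args: Array[String]): Unit = {
    val sc = new SparkContext(new SparkConf().setMaster("local[2]").setAppName("KryoWrapSketch"))

    // An ordinary closure that would otherwise travel to the executors via Java serialization.
    val toLength: String => Int = _.length

    // Wrapping it means Spark serializes a KryoSerializationWrapper instead,
    // which re-inflates the function with Kryo on the executor side.
    val lengths = sc.parallelize(Seq("Berlin", "Paris"))
      .map(SparkUtils.kryoWrapFunction(toLength))

    println(lengths.collect().toList) // List(6, 5)
    sc.stop()
  }
}
```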
/extraction/src/test/resources/config.properties:
--------------------------------------------------------------------------------
1 | # download and extraction target dir
2 | # This can be a directory on HDFS or a local directory, depending on the Hadoop configuration files given in dist-config.properties
3 | base-dir=src/test/resources/data
4 | # Source file. If source file name ends with .gz or .bz2, it is unzipped on the fly.
5 | # Must exist in the directory xxwiki/yyyymmdd and have the prefix xxwiki-yyyymmdd-
6 | # where xx is the wiki code and yyyymmdd is the dump date.
7 |
8 | # default (prefer multistream bz2 files):
9 | source=pages-articles-multistream.xml.bz2
10 |
11 | # alternatives:
12 | # source=pages-articles.xml.gz
13 | # source=pages-articles.xml
14 |
15 | ###### Extract from part files ######
16 | #
17 | # Please make sure that the regex actually matches the format used by ALL the wikis you are going to extract from!!!!
18 | # One that should work in all cases is
19 | # source=@pages-articles\\d*\\.xml(-p\\d+p\\d+)?\\.bz2
20 | #
21 | # NOTE: when using the above regex you should make sure you do not have part files AND regular dump files together
22 | # for the same wiki, e.g. frwiki-20131120-pages-articles1.xml.bz2 and frwiki-20131120-pages-articles.xml.bz2, as they
23 | # BOTH will match and that will result in duplicate output data
24 | #
25 | # Example:
26 | # enwiki => enwiki-latest-pages-articles1.xml-p000000010p000010000.bz2 hence @pages-articles\\d+\\.xml-p\\d+p\\d+\\.bz2 matches
27 | # frwiki => frwiki-latest-pages-articles1.xml.bz2 hence @pages-articles\\d+\\.xml\\.bz2 matches (the previous regex does not!)
28 | # commonswiki => it does not have part files! This is true for other wikis as well.
29 | #
30 | # source=@pages-articles\\d+\\.xml-p\\d+p\\d+\\.bz2
31 |
32 | # In case of multistream chunks
33 | # source=@pages-articles-multistream\\.xml\\.\\d+\\.bz2
34 |
35 | # use only directories that contain a 'download-complete' file? Default is false.
36 | require-download-complete=false
37 |
38 | # List of languages or article count ranges, e.g. 'en,de,fr' or '10000-20000' or '10000-', or '@mappings'
39 | languages=en
40 | # extractor class names starting with "." are prefixed by "org.dbpedia.extraction.mappings"
41 |
42 | extractors=.ArticleCategoriesExtractor,.ArticleTemplatesExtractor,.CategoryLabelExtractor,\
43 | .ExternalLinksExtractor,.GeoExtractor,.InterLanguageLinksExtractor,.LabelExtractor,.PageIdExtractor,\
44 | .PageLinksExtractor,.RedirectExtractor,.RevisionIdExtractor,.ProvenanceExtractor,.SkosCategoriesExtractor,\
45 | .ArticlePageExtractor
46 |
47 | #extractors=.InfoboxExtractor
48 |
49 | # if ontology and mapping files are not given or do not exist, download info from mappings.dbpedia.org
50 | ontology=ontology.xml
51 | mappings=../mappings
52 |
53 | # Serialization URI policies and file formats. Quick guide:
54 | # uri-policy keys: uri, generic, xml-safe, reject-long
55 | # uri-policy position modifiers: -subjects, -predicates, -objects, -datatypes, -contexts
56 | # uri-policy values: comma-separated languages or '*' for all languages
57 | # format values: n-triples, n-quads, turtle-triples, turtle-quads, trix-triples, trix-quads
58 | # See http://git.io/DBpedia-serialization-format-properties for details.
59 |
60 | # For backwards compatibility, en uses generic URIs. All others use local IRIs.
61 | uri-policy.uri=uri:en; generic:en; xml-safe-predicates:*; reject-long:*
62 | uri-policy.iri=generic:en; xml-safe-predicates:*; reject-long:*
63 |
64 | # NT is unreadable anyway - might as well use URIs for en
65 | format.nt.gz=n-triples;uri-policy.uri
66 | format.nq.gz=n-quads;uri-policy.uri
67 |
68 | # Turtle is much more readable - use nice IRIs for all languages
69 | format.ttl.gz=turtle-triples;uri-policy.iri
70 | format.tql.gz=turtle-quads;uri-policy.iri
71 |
--------------------------------------------------------------------------------
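The '@' prefix above marks the source value as a regex that is applied to the file name after the xxwiki-yyyymmdd- prefix, and the double backslashes are just properties-file escaping. Before starting a long extraction it can be worth checking a pattern against your actual dump file names; a small stand-alone sketch (file names taken from the examples in the comments, object name made up):

```scala
object SourceRegexCheck {
  def main(args: Array[String]): Unit = {
    // The catch-all pattern suggested above, without the leading '@' and with the
    // properties-file double backslashes reduced to single ones.
    val pattern = """pages-articles\d*\.xml(-p\d+p\d+)?\.bz2"""

    val candidates = Seq(
      "pages-articles1.xml-p000000010p000010000.bz2", // enwiki part file
      "pages-articles1.xml.bz2",                      // frwiki part file
      "pages-articles.xml.bz2",                       // regular dump - also matches, hence the duplicate-output warning
      "pages-articles-multistream.xml.bz2"            // not matched by this pattern
    )

    for (name <- candidates)
      println(s"$name -> ${name.matches(pattern)}")
  }
}
```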
/extraction/src/test/resources/data/enwiki/20160407/enwiki-20160407-pages-articles-multistream.xml.bz2:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dbpedia/distributed-extraction-framework/ad039712889000a085dd3d0ab66a15ddde99573d/extraction/src/test/resources/data/enwiki/20160407/enwiki-20160407-pages-articles-multistream.xml.bz2
--------------------------------------------------------------------------------
/extraction/src/test/resources/dist-config.properties:
--------------------------------------------------------------------------------
1 | # Spark installation directory. The SPARK_HOME environment variable should point to the same location.
2 | spark-home=/home/user/engine/spark-0.9.1-bin-hadoop2/
3 |
4 | # Paths to the Hadoop configuration files, if any. These are needed for HDFS.
5 | # hadoop-coresite-xml-path=/home/user/engine/hadoop-2.2.0/etc/hadoop/core-site.xml
6 | # hadoop-hdfssite-xml-path=/home/user/engine/hadoop-2.2.0/etc/hadoop/hdfs-site.xml
7 | # hadoop-mapredsite-xml-path=/home/user/engine/hadoop-2.2.0/etc/hadoop/mapred-site.xml
8 |
9 | # Refer to README.md for advice
10 | spark.executor.memory=2500m
11 |
12 | # Replace local[8] with something like spark://192.168.0.100 to go into distributed mode.
13 | spark-master=local[8]
14 |
15 | # When running on a distributed cluster, it is essential that you set spark.cores.max to N * M
16 | # where N = total no. of slave machines, M = SPARK_WORKER_INSTANCES (from spark-env.sh)
17 | # This ensures that Spark uses as many cores (over the entire cluster) as there are workers.
18 | spark.cores.max=8
19 |
20 | # You can add more spark.* variables here. All variables starting with spark. will be provided to SparkConf
21 |
22 | # This is used for setting log levels for "org.apache", "spark", "org.eclipse.jetty" and "akka" using
23 | # SparkUtils.setLogLevels(). It is WARN by default to prevent excessive logging from Spark.
24 | # It is a good idea to set it to INFO while debugging/testing out the framework.
25 | # Refer to org.apache.log4j.Level for more details
26 | # logging-level=INFO
27 |
28 | # WARNING: If base-dir is set here, the base-dir in config.properties (the original DBpedia extraction configuration) is ignored.
29 | # base-dir=/data
30 |
31 | # Please refer to the source code for org.dbpedia.extraction.dump.extract.DistConfig for the complete set of configuration variables
32 | # TODO: Add info on all configuration variables here.
--------------------------------------------------------------------------------
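Every spark.* key in this file is handed to SparkConf unchanged (DistConfig collects them and SparkUtils.getSparkContext applies them, as shown earlier), so 4 slaves with SPARK_WORKER_INSTANCES=2 would call for spark.cores.max=8. The sketch below imitates only that pass-through with a plain java.util.Properties load; the real parsing lives in org.dbpedia.extraction.dump.extract.DistConfig, and the object name here is made up.

```scala
import java.io.FileInputStream
import java.util.Properties

import scala.collection.JavaConverters._

import org.apache.spark.SparkConf

object SparkPropsSketch {
  def main(args: Array[String]): Unit = {
    val props = new Properties()
    props.load(new FileInputStream("extraction/src/test/resources/dist-config.properties"))

    // Everything starting with "spark." goes to SparkConf verbatim,
    // e.g. spark.executor.memory=2500m and spark.cores.max=8 from the file above.
    val sparkProperties = props.asScala.filterKeys(_.startsWith("spark."))

    val conf = new SparkConf()
      .setMaster(props.getProperty("spark-master", "local[8]"))
      .setAppName("dbpedia-extraction")
    for ((key, value) <- sparkProperties)
      conf.set(key, value)

    conf.getAll.foreach { case (k, v) => println(s"$k = $v") }
  }
}
```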
/extraction/src/test/scala/org/dbpedia/extraction/mappings/DistRedirectsTest.scala:
--------------------------------------------------------------------------------
1 | package org.dbpedia.extraction.mappings
2 |
3 | import org.junit.Assert._
4 | import org.dbpedia.extraction.sources.{Source, XMLSource, WikiPage}
5 | import org.apache.spark.rdd.RDD
6 | import org.dbpedia.extraction.util._
7 | import java.io.File
8 | import org.dbpedia.extraction.wikiparser.Namespace
9 | import org.dbpedia.extraction.dump.extract.{Config, DistConfig}
10 | import org.dbpedia.extraction.dump.download.Download
11 | import org.dbpedia.extraction.util.RichFile.wrapFile
12 | import org.scalatest.FunSuite
13 | import org.scalatest.junit.JUnitRunner
14 | import org.junit.runner.RunWith
15 | import org.apache.hadoop.conf.Configuration
16 | import org.apache.hadoop.fs.Path
17 | import org.dbpedia.extraction.util.RichHadoopPath.wrapPath
18 |
19 | /**
20 | * Unit Test for the DistRedirects class.
21 | *
22 | * This test expects a DBpedia extraction configuration properties file named "config.properties" and a distributed
23 | * framework configuration file named "dist-config.properties" to be present at the test/resources directory.
24 | *
25 | * It's better to use a small data dump like the liwiki dump to run the test.
26 | *
27 | * TODO: Add some wiki dump content to test upon rather than rely upon an external wiki dump file and config files.
28 | */
29 | @RunWith(classOf[JUnitRunner])
30 | class DistRedirectsTest extends FunSuite
31 | {
32 | val CONFIG_FILE = "config.properties"
33 | val SPARK_CONFIG_FILE = "dist-config.properties"
34 |
35 | // Fixtures shared between all tests in this class
36 | val (distConfig: DistConfig,
37 | articleSource: Source,
38 | rdd: RDD[WikiPage],
39 | language: Language,
40 | date: String,
41 | distFinder: Finder[Path]) = try
42 | {
43 | val configFileResource = getClass.getClassLoader.getResource(CONFIG_FILE)
44 | val sparkConfigFileResource = getClass.getClassLoader.getResource(SPARK_CONFIG_FILE)
45 |
46 | //Check if the wiki-pages file and config.properties file are present
47 |     assertNotNull("Test file %s missing from extraction/src/test/resources".format(CONFIG_FILE), configFileResource)
48 |     assertNotNull("Test file %s missing from extraction/src/test/resources".format(SPARK_CONFIG_FILE), sparkConfigFileResource)
49 |
50 | val configProperties = ConfigUtils.loadConfig(configFileResource.toURI.getPath, "UTF-8")
51 | val distConfigProperties = ConfigUtils.loadConfig(sparkConfigFileResource.toURI.getPath, "UTF-8")
52 | val config = new Config(configProperties)
53 | val distConfig = new DistConfig(distConfigProperties, configProperties, configFileResource.toURI)
54 | implicit val hadoopConfiguration = distConfig.hadoopConf
55 | val lang = config.extractorClasses.iterator.next()._1
56 |
57 | val localFinder = new Finder[File](config.dumpDir, lang, config.wikiName)
58 | val distFinder = new Finder[Path](distConfig.dumpDir.get, lang, config.wikiName)
59 | val date = latestDate(config, localFinder)
60 |
61 | // Get the readers for the test dump files
62 | val articlesReaders = files(config.source, localFinder, date).map(x => () => IOUtils.reader(x))
63 |
64 | // Get the article source for Redirects to load from
65 | val articleSource = XMLSource.fromReaders(articlesReaders, lang,
66 | title => title.namespace == Namespace.Main || title.namespace == Namespace.File ||
67 | title.namespace == Namespace.Category || title.namespace == Namespace.Template)
68 |
69 | SparkUtils.setSparkLogLevels(distConfig)
70 | val sc = SparkUtils.getSparkContext(distConfig)
71 | // Generate RDD from the article source for DistRedirects to load from in parallel
72 |     // Naively materializes the whole article source as an in-memory Seq - only for testing
73 | val rdd = sc.parallelize(articleSource.toSeq, 8)
74 | (distConfig, articleSource, rdd, lang, date, distFinder)
75 |   } catch { case ex: Exception => ex.printStackTrace(); (null, null, null, null, null, null) }
76 |
77 | implicit def hadoopConfiguration: Configuration = distConfig.hadoopConf
78 |
79 | test("Verify DistRedirects.loadFromRDD output")
80 | {
81 | val distRedirects = DistRedirects.loadFromRDD(rdd, language)
82 | val redirects = Redirects.loadFromSource(articleSource, language)
83 | assertEquals("Testing DistRedirects.loadFromRDD failed!", redirects.map, distRedirects.map)
84 | }
85 |
86 | test("Verify DistRedirects.load output")
87 | {
88 | val cache = distFinder.file(date, "template-redirects.obj")
89 | var distRedirects = DistRedirects.load(rdd, cache, language)
90 | var redirects = Redirects.loadFromSource(articleSource, language)
91 |     assertEquals("Testing DistRedirects.load failed!", redirects.map, distRedirects.map)
92 |
93 | // Try again so that cache gets used
94 | distRedirects = DistRedirects.load(rdd, cache, language)
95 | redirects = Redirects.loadFromSource(articleSource, language)
96 |     assertEquals("Testing DistRedirects.load failed!", redirects.map, distRedirects.map)
97 | }
98 |
99 | // Taken from org.dbpedia.extraction.dump.extract.Config
100 | def latestDate(config: Config, finder: Finder[_]): String =
101 | {
102 | val isSourceRegex = config.source.startsWith("@")
103 | val source = if (isSourceRegex) config.source.substring(1) else config.source
104 | val fileName = if (config.requireComplete) Download.Complete else source
105 | finder.dates(fileName, isSuffixRegex = isSourceRegex).last
106 | }
107 |
108 | // Taken from org.dbpedia.extraction.dump.extract.Config
109 | def files(source: String, finder: Finder[File], date: String): List[File] =
110 | {
111 |
112 | val files = if (source.startsWith("@"))
113 | {
114 | // the articles source is a regex - we want to match multiple files
115 | finder.matchFiles(date, source.substring(1))
116 | } else List(finder.file(date, source))
117 |
118 | files
119 | }
120 | }
--------------------------------------------------------------------------------
/extraction/src/test/scala/org/dbpedia/extraction/spark/io/QuadSeqWritableTest.scala:
--------------------------------------------------------------------------------
1 | package org.dbpedia.extraction.spark.io
2 |
3 | import org.junit.runner.RunWith
4 | import org.scalatest.junit.JUnitRunner
5 | import org.scalatest.FunSuiteLike
6 | import org.dbpedia.extraction.destinations.Quad
7 | import scala.util.Random
8 | import org.junit.Assert._
9 |
10 | @RunWith(classOf[JUnitRunner])
11 | class QuadSeqWritableTest extends WritableTest[QuadSeqWritable] with FunSuiteLike
12 | {
13 | test("Verify that serialization-deserialization works properly")
14 | {
15 | // Create random List[Quad]
16 | val sampleQuads = for (i <- (0 until 100).toList) yield new Quad(Random.nextString(10),
17 | Random.nextString(10),
18 | Random.nextString(10),
19 | Random.nextString(10),
20 | Random.nextString(10),
21 | Random.nextString(10),
22 | Random.nextString(10))
23 |
24 | val writable1 = new QuadSeqWritable(sampleQuads)
25 | val writable2 = new QuadSeqWritable()
26 |
27 | performReadWriteRoundTrip(writable1, writable2)
28 | assertEquals(writable1.get, writable2.get)
29 | }
30 | }
31 |
--------------------------------------------------------------------------------
/extraction/src/test/scala/org/dbpedia/extraction/spark/io/WikiPageWritableTest.scala:
--------------------------------------------------------------------------------
1 | package org.dbpedia.extraction.spark.io
2 |
3 | import org.junit.Assert._
4 | import org.junit.runner.RunWith
5 | import org.scalatest.junit.JUnitRunner
6 | import org.scalatest.FunSuiteLike
7 | import org.dbpedia.extraction.sources.XMLSource
8 | import scala.xml.XML
9 | import org.dbpedia.extraction.util.Language
10 |
11 | @RunWith(classOf[JUnitRunner])
12 | class WikiPageWritableTest extends WritableTest[WikiPageWritable] with FunSuiteLike
13 | {
14 | test("Verify that serialization-deserialization works properly")
15 | {
16 |     val samplePage =
17 |       """
18 |         |<page>
19 |         |  <title>Lèmburg</title>
20 |         |  <ns>0</ns>
21 |         |  <id>13</id>
22 |         |  <redirect title="Limburg" />
23 |         |  <revision>
24 |         |    <id>196988</id>
25 |         |    <parentid>5980</parentid>
26 |         |    <timestamp>2010-01-25T20:24:26Z</timestamp>
27 |         |    <contributor>
28 |         |      <username>PahlesBot</username>
29 |         |      <id>458</id>
30 |         |    </contributor>
31 |         |    <minor />
32 |         |    <comment>Bot: automatisch tekst vervangen (-#redirect +#REDIRECT)</comment>
33 |         |    <text>#REDIRECT [[Limburg]]</text>
34 |         |    <sha1>2uewphqvpum37i9d7g5okf5c3m643c7</sha1>
35 |         |    <model>wikitext</model>
36 |         |    <format>text/x-wiki</format>
37 |         |  </revision>
38 |         |</page>
39 |       """.stripMargin
40 |
41 |     val wikiPage = XMLSource.fromXML(XML.loadString("<mediawiki>" + samplePage + "</mediawiki>"), Language("li")).head
42 | val writable1 = new WikiPageWritable(wikiPage)
43 | val writable2 = new WikiPageWritable()
44 |
45 | performReadWriteRoundTrip(writable1, writable2)
46 | assertEquals(writable1.get.toString, writable2.get.toString)
47 | }
48 | }
49 |
--------------------------------------------------------------------------------
/extraction/src/test/scala/org/dbpedia/extraction/spark/io/WritableTest.scala:
--------------------------------------------------------------------------------
1 | package org.dbpedia.extraction.spark.io
2 |
3 | import org.apache.hadoop.io.Writable
4 | import java.io.{ByteArrayInputStream, DataInputStream, DataOutputStream, ByteArrayOutputStream}
5 |
6 | abstract class WritableTest[T <: Writable]
7 | {
8 | /**
9 | * Utility method that takes two Writables as parameters, writes the first Writable to a byte
10 | * array and reads it back into the second Writable.
11 | *
12 | * @param oldWritable Writable to be serialized and deserialized again
13 | * @param newWritable Writable where oldWritable is deserialized into after serialization.
14 | */
15 | def performReadWriteRoundTrip(oldWritable: T, newWritable: T) =
16 | {
17 | val bos = new ByteArrayOutputStream
18 | val dos = new DataOutputStream(bos)
19 | oldWritable.write(dos)
20 | newWritable.readFields(new DataInputStream(new ByteArrayInputStream(bos.toByteArray)))
21 | }
22 | }
23 |
--------------------------------------------------------------------------------
/gce/README.md:
--------------------------------------------------------------------------------
1 | Spark GCE
2 | =========
3 |
4 | Spark GCE is like Spark EC2, but for those who run their cluster on Google Cloud.
5 |
6 | - Make sure you have installed and authenticated gcutils on the machine where you run this script.
7 | - Helps you launch a Spark cluster in the Google Cloud
8 | - Attaches a 100GB empty disk to all nodes in the cluster
9 | - Installs and configures Spark and HDFS automatically
10 | - Starts the Shark server automatically
11 |
12 | Spark GCE is a Python script that helps you launch a Spark cluster in the Google Cloud, much like the spark_ec2 script does for AWS.
13 |
14 | Usage
15 | -----
16 |
17 | > ***spark_gce.py project-name number-of-slaves slave-type master-type identity-file zone cluster-name spark-mem workers-per-node cores-per-worker local-log-dir***
18 | >
19 | >>
20 | >> - **project-name**: Name of the project where you are going to launch your spark cluster.
21 | >>
22 | >> - **number-of-slaves**: Number of slaves that you want to launch.
23 | >>
24 | >> - **slave-type**: Instance type for the slave machines.
25 | >>
26 | >> - **master-type**: Instance type for the master node.
27 | >>
28 | >> - **identity-file**: Identity file used to authenticate with your GCE instances. It usually resides at *~/.ssh/google_compute_engine* once you have authenticated using gcutils.
29 | >>
30 | >> - **zone:** Specify the zone where you are going to launch the cluster.
31 | >>
32 | >> - **cluster-name**: Name the cluster that you are going to launch.
33 | >>
34 | >> - **spark-mem**: Amount of memory per Spark worker (as a JVM memory string, e.g. 2500m or 2g)
35 | >>
36 | >> - **workers-per-node**: Number of workers to run on each slave node
37 | >>
38 | >> - **cores-per-worker**: Number of cores each worker should use (optional, 1 by default)
39 | >>
40 | >> - **local-log-dir**: A local directory to download nmon logs from all the nodes into (optional; defaults to empty, i.e. no logging)
41 | >>
42 | >
43 | > ***spark_gce.py project-name cluster-name [identity-file local-log-dir] destroy***
44 | >
45 | >> - **project-name**: Name of the project where the spark cluster is at.
46 | >> - **cluster-name**: Name of the cluster that you are going to destroy.
47 | >> - **NOTE**: If you had specified a local-log-dir while starting the cluster, provide it here too, along with the identity-file, else skip both.
48 |
49 |
50 | Installation
51 | --------------
52 |
53 | ```sh
54 | git clone git@github.com:dbpedia/distributed-extraction-framework.git
55 | cd distributed-extraction-framework/gce
56 | python spark_gce.py
57 | ```
58 |
--------------------------------------------------------------------------------
/install-run:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | # Shortcut for
4 | # mvn -f ../pom.xml install && mvn scala:run -Dlauncher=... -DaddArgs=...
5 | # Must be called with one of the modules (core/, dump/, ...) as current directory.
6 | # Example:
7 | # extraction_framework/core> ../install-run LAUNCHER ARG1 ARG2 ARG3
8 | # is equivalent to
9 | # extraction_framework/core> mvn -f ../pom.xml install && mvn scala:run "-Dlauncher=LAUNCHER" "-DaddArgs=ARG1|ARG2|ARG3"
10 |
11 | # if we're not on a terminal, use batch mode to avoid ugly log files
12 | [ ! -t 1 ] && BATCH="-B"
13 | mvn $BATCH -f ../pom.xml install && . ../run "$@"
14 |
--------------------------------------------------------------------------------
/pom.xml:
--------------------------------------------------------------------------------
1 | <?xml version="1.0" encoding="UTF-8"?>
2 | <project xmlns="http://maven.apache.org/POM/4.0.0"
3 |          xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
4 |          xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
5 |     <modelVersion>4.0.0</modelVersion>
6 |
7 |     <groupId>org.dbpedia</groupId>
8 |     <artifactId>distributed-extraction</artifactId>
9 |     <packaging>pom</packaging>
10 |     <version>4.1-SNAPSHOT</version>
11 |     <name>Parent POM of the DBpedia Distributed Extraction Framework</name>
12 |
13 |     <properties>
14 |         <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
15 |         <java.version>1.7</java.version>
16 |         <scala.version>2.11.4</scala.version>
17 |         <hadoop.version>2.2.0</hadoop.version>
18 |         <spark.version>1.3.0</spark.version>
19 |         <scalatest.scala.version>2.11</scalatest.scala.version>
20 |         <scalatest.version>2.2.4</scalatest.version>
21 |         <scala.compiler.Xmx>-Xmx1024m</scala.compiler.Xmx>
22 |     </properties>
23 |
24 |     <modules>
25 |         <module>extraction</module>
26 |         <module>download</module>
27 |         <module>common</module>
28 |     </modules>
29 |
30 |     <build>
31 |         <pluginManagement>
32 |             <plugins>
33 |                 <plugin>
34 |                     <groupId>net.alchim31.maven</groupId>
35 |                     <artifactId>scala-maven-plugin</artifactId>
36 |                     <version>3.1.6</version>
37 |                 </plugin>
38 |                 <plugin>
39 |                     <groupId>org.apache.maven.plugins</groupId>
40 |                     <artifactId>maven-compiler-plugin</artifactId>
41 |                     <version>3.1</version>
42 |                 </plugin>
43 |             </plugins>
44 |         </pluginManagement>
45 |
46 |         <plugins>
47 |             <plugin>
48 |                 <groupId>net.alchim31.maven</groupId>
49 |                 <artifactId>scala-maven-plugin</artifactId>
50 |                 <configuration>
51 |                     <args>
52 |                         <arg>-unchecked</arg>
53 |                         <arg>-deprecation</arg>
54 |                         <arg>-feature</arg>
55 |                     </args>
56 |                     <jvmArgs>
57 |                         <jvmArg>${scala.compiler.Xmx}</jvmArg>
58 |                     </jvmArgs>
59 |                 </configuration>
60 |                 <executions>
61 |                     <execution>
62 |                         <id>compile</id>
63 |                         <goals>
64 |                             <goal>compile</goal>
65 |                         </goals>
66 |                         <phase>compile</phase>
67 |                     </execution>
68 |                     <execution>
69 |                         <id>test-compile</id>
70 |                         <goals>
71 |                             <goal>testCompile</goal>
72 |                         </goals>
73 |                         <phase>test-compile</phase>
74 |                     </execution>
75 |                     <execution>
76 |                         <id>process-resources</id>
77 |                         <goals>
78 |                             <goal>compile</goal>
79 |                         </goals>
80 |                         <phase>process-resources</phase>
81 |                     </execution>
82 |                 </executions>
83 |             </plugin>
84 |             <plugin>
85 |                 <groupId>org.apache.maven.plugins</groupId>
86 |                 <artifactId>maven-compiler-plugin</artifactId>
87 |                 <configuration>
88 |                     <source>${java.version}</source>
89 |                     <target>${java.version}</target>
90 |                 </configuration>
91 |             </plugin>
92 |             <plugin>
93 |                 <artifactId>maven-enforcer-plugin</artifactId>
94 |                 <version>1.3.1</version>
95 |                 <executions>
96 |                     <execution>
97 |                         <goals>
98 |                             <goal>enforce</goal>
99 |                         </goals>
100 |                         <configuration>
101 |                             <rules>
102 |                                 <requireJavaVersion>
103 |                                     <version>${java.version}</version>
104 |                                 </requireJavaVersion>
105 |                             </rules>
106 |                         </configuration>
107 |                     </execution>
108 |                 </executions>
109 |             </plugin>
110 |         </plugins>
111 |     </build>
112 |
113 |     <dependencies>
114 |         <dependency>
115 |             <groupId>org.scala-lang</groupId>
116 |             <artifactId>scala-library</artifactId>
117 |             <version>${scala.version}</version>
118 |         </dependency>
119 |         <dependency>
120 |             <groupId>org.scala-lang</groupId>
121 |             <artifactId>scala-actors</artifactId>
122 |             <version>${scala.version}</version>
123 |         </dependency>
124 |         <dependency>
125 |             <groupId>org.scala-lang</groupId>
126 |             <artifactId>scala-reflect</artifactId>
127 |             <version>${scala.version}</version>
128 |         </dependency>
129 |         <dependency>
130 |             <groupId>org.scalatest</groupId>
131 |             <artifactId>scalatest_${scalatest.scala.version}</artifactId>
132 |             <version>${scalatest.version}</version>
133 |             <scope>test</scope>
134 |         </dependency>
135 |     </dependencies>
136 |
137 |     <profiles>
138 |         <profile>
139 |             <id>incremental</id>
140 |             <build>
141 |                 <plugins>
142 |                     <plugin>
143 |                         <groupId>net.alchim31.maven</groupId>
144 |                         <artifactId>scala-maven-plugin</artifactId>
145 |                         <configuration>
146 |                             <recompileMode>incremental</recompileMode>
147 |                             <useZincServer>true</useZincServer>
148 |                         </configuration>
149 |                     </plugin>
150 |                 </plugins>
151 |             </build>
152 |         </profile>
153 |     </profiles>
154 | </project>
--------------------------------------------------------------------------------
/run:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | # Shortcut for mvn scala:run -Dlauncher=... -DaddArgs=...
4 | # Must be called with one of the modules (core/, dump/, ...) as current directory.
5 | # Example:
6 | # extraction_framework/core> ../run LAUNCHER ARG1 ARG2 ARG3
7 | # is equivalent to
8 | # extraction_framework/core> mvn scala:run "-Dlauncher=LAUNCHER" "-DaddArgs=ARG1|ARG2|ARG3"
9 |
10 | LAUNCHER="$1"
11 |
12 | ADD_ARGS="$2"
13 | for ARG in ${@:3}
14 | do
15 | ADD_ARGS="$ADD_ARGS|$ARG"
16 | done
17 |
18 | # export MAVEN_OPTS='-Xmx4096M -XX:MaxPermSize=1024M -XX:+HeapDumpOnOutOfMemoryError -XX:+PrintGC -XX:+PrintGCTimeStamps'
19 | # export MAVEN_DEBUG='-X -e'
20 |
21 | # if we're not on a terminal, use batch mode to avoid ugly log files
22 | [ ! -t 1 ] && BATCH="-B"
23 | mvn $MAVEN_DEBUG $BATCH scala:run "-Dlauncher=$LAUNCHER" "-DaddArgs=$ADD_ARGS"
24 |
--------------------------------------------------------------------------------
/run-extraction-test:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | # Performs both normal sequential extraction and distributed extraction and compares the outputs.
4 |
5 | CONFIG_FILE="$1"
6 | SPARK_CONF_FILE="$2"
7 |
8 | echo "===================================================================="
9 | echo "Running sequential extraction"
10 | echo "===================================================================="
11 | ./run seq-extraction $CONFIG_FILE
12 | mkdir /tmp/dbpedia-test-seq-extraction
13 | mv `grep base-dir $CONFIG_FILE | sed -ne 's/^base-dir=//p'`/*wiki/*/*wiki*.gz /tmp/dbpedia-test-seq-extraction
14 |
15 | echo "===================================================================="
16 | echo "Running distributed extraction"
17 | echo "===================================================================="
18 | ./run extraction $CONFIG_FILE $SPARK_CONF_FILE
19 | mkdir /tmp/dbpedia-test-par-extraction
20 | cp -rf `grep base-dir $CONFIG_FILE | sed -ne 's/^base-dir=//p'`/*wiki/*/*wiki*.gz /tmp/dbpedia-test-par-extraction/
21 |
22 | echo "===================================================================="
23 | echo "Computing diffs:"
24 | echo "===================================================================="
25 | diffs=`diff <(gzip -dc /tmp/dbpedia-test-seq-extraction/*.gz | grep -v "^#" | sort) <(gzip -dc /tmp/dbpedia-test-par-extraction/*wiki*.gz/part*.gz | grep -v "^#" | sort)`
26 | if [ -z "$diffs" ]; then
27 | echo "Outputs match!"
28 | else
29 |     echo "$diffs"
30 | fi
31 |
32 |
33 | rm -rf /tmp/dbpedia-test-???-extraction
34 |
--------------------------------------------------------------------------------