├── .gitignore ├── .travis.yml ├── README.md ├── clean-install-run ├── common ├── pom.xml └── src │ └── main │ └── scala │ └── org │ └── dbpedia │ └── extraction │ └── util │ ├── HadoopConfigurable.scala │ └── RichHadoopPath.scala ├── download ├── pom.xml └── src │ ├── main │ ├── resources │ │ ├── application.conf │ │ └── reference.conf │ └── scala │ │ └── org │ │ └── dbpedia │ │ └── extraction │ │ ├── dump │ │ └── download │ │ │ ├── ActoredCounter.scala │ │ │ ├── DistDownload.scala │ │ │ ├── DistDownloadConfig.scala │ │ │ ├── DumpFileSource.scala │ │ │ └── actors │ │ │ ├── DownloadClient.scala │ │ │ ├── DownloadJobRunner.scala │ │ │ ├── DownloadProgressTracker.scala │ │ │ ├── DownloadResultConsumer.scala │ │ │ ├── Master.scala │ │ │ ├── Worker.scala │ │ │ └── message │ │ │ ├── DownloadJob.scala │ │ │ ├── DownloaderProgressMessage.scala │ │ │ ├── GeneralMessage.scala │ │ │ ├── MasterWorkerMessage.scala │ │ │ └── WorkerProgressMessage.scala │ │ └── util │ │ └── RemoteExecute.scala │ └── test │ └── resources │ ├── dist-download.properties │ └── download.properties ├── extraction ├── pom.xml └── src │ ├── main │ └── scala │ │ └── org │ │ ├── apache │ │ └── spark │ │ │ └── ui │ │ │ └── jobs │ │ │ └── DBpediaJobProgressListener.scala │ │ └── dbpedia │ │ └── extraction │ │ ├── destinations │ │ ├── DistDeduplicatingWriterDestination.scala │ │ ├── DistDestination.scala │ │ ├── DistMarkerDestination.scala │ │ └── DistWrapperDestination.scala │ │ ├── dump │ │ └── extract │ │ │ ├── DistConfig.scala │ │ │ ├── DistConfigLoader.scala │ │ │ ├── DistExtraction.scala │ │ │ ├── DistExtractionJob.scala │ │ │ └── DumpExtractionContextWrapper.scala │ │ ├── mappings │ │ ├── DistDisambiguations.scala │ │ └── DistRedirects.scala │ │ ├── spark │ │ ├── io │ │ │ ├── QuadSeqWritable.scala │ │ │ ├── WikiPageWritable.scala │ │ │ ├── input │ │ │ │ ├── ByteMatcher.scala │ │ │ │ ├── DBpediaWikiPageInputFormat.scala │ │ │ │ └── SeekableInputStream.scala │ │ │ └── output │ │ │ │ ├── DBpediaCompositeOutputFormat.scala │ │ │ │ ├── DBpediaDatasetOutputFormat.scala │ │ │ │ └── MultipleTextOutputFormat.scala │ │ └── serialize │ │ │ ├── KryoExtractionRegistrator.scala │ │ │ ├── KryoSerializationWrapper.scala │ │ │ ├── KryoSerializer.scala │ │ │ ├── LanguageSerializer.scala │ │ │ ├── LocaleSerializer.scala │ │ │ ├── LoggerSerializer.scala │ │ │ ├── ParserUtilsSerializer.scala │ │ │ ├── WikiPageSerializer.scala │ │ │ └── WikiTitleSerializer.scala │ │ └── util │ │ ├── DistIOUtils.scala │ │ └── SparkUtils.scala │ └── test │ ├── resources │ ├── config.properties │ ├── data │ │ └── enwiki │ │ │ └── 20160407 │ │ │ └── enwiki-20160407-pages-articles-multistream.xml.bz2 │ └── dist-config.properties │ └── scala │ └── org │ └── dbpedia │ └── extraction │ ├── mappings │ └── DistRedirectsTest.scala │ └── spark │ └── io │ ├── QuadSeqWritableTest.scala │ ├── WikiPageWritableTest.scala │ └── WritableTest.scala ├── gce ├── README.md └── spark_gce.py ├── install-run ├── ontology.owl ├── ontology.xml ├── pom.xml ├── run └── run-extraction-test /.gitignore: -------------------------------------------------------------------------------- 1 | target/ 2 | .idea/ 3 | *~ 4 | *.iml 5 | .cache 6 | *.log 7 | *.lck 8 | *.tmp 9 | java_pid* 10 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | sudo: false 2 | language: scala 3 | jdk: 4 | - oraclejdk8 5 | - oraclejdk7 6 | - openjdk7 7 | # branches: 8 | # only: 9 | # - master 10 | 
before_install:
11 | - sed -i.bak -e 's|https://nexus.codehaus.org/snapshots/|https://oss.sonatype.org/content/repositories/codehaus-snapshots/|g' ~/.m2/settings.xml
12 | script: "mvn test"
13 | notifications:
14 | email:
15 | recipients:
16 | - riteshoneinamillion@gmail.com
17 | on_success: change
18 | on_failure: change
19 | cache:
20 | directories:
21 | - $HOME/.m2
22 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | DBpedia Distributed Extraction Framework
2 | ==================================
3 |
4 | ## We are looking for maintainer(s) for this project
5 |
6 | If you want to join, express your interest!
7 |
8 | ## Description
9 |
10 | This is the distributed version of the [DBpedia Information Extraction Framework](https://github.com/dbpedia/extraction-framework/). It uses [Apache Spark](http://spark.apache.org) to extract structured data from Wikipedia in a parallel, distributed manner.
11 |
12 | This is currently a work in progress, and the instructions are mostly intended for developers.
13 |
14 | ## Requirements
15 | * Java 7
16 | * Maven 3
17 | * Apache Spark 0.9.1 built with Apache Hadoop 2.2.0
18 |
19 | ## Setup Apache Spark
20 |
21 | ```bash
22 | $ wget http://d3kbcqa49mib13.cloudfront.net/spark-0.9.1-bin-hadoop2.tgz
23 | $ tar xzf spark-0.9.1-bin-hadoop2.tgz
24 | $ cd spark-0.9.1-bin-hadoop2
25 | $ SCALA_HOME=/usr/share/java MAVEN_OPTS="-Xmx2g -XX:MaxPermSize=512M -XX:ReservedCodeCacheSize=512m" mvn -Dhadoop.version=2.2.0 -Dprotobuf.version=2.5.0 -DskipTests clean package
26 | ```
27 |
28 | Replace SCALA_HOME according to your machine settings. Maven needs enough memory to compile Spark successfully.
29 |
30 | Add the hostnames of your slave nodes (after having downloaded Spark to all nodes) to conf/slaves. Running on a cluster requires a number of additional configuration steps, such as ensuring that the firewall allows traffic on the required ports, setting up passwordless SSH access between the master and slave nodes, setting up HDFS and formatting the NameNode, etc. Usually, in a cluster of N nodes you would run the Spark Master and Hadoop's NameNode on one node and the Spark Workers and Hadoop DataNodes on the remaining N-1 nodes.
31 |
32 | Here's a sample `spark-env.sh` for a cluster where the slaves have 4 cores and 15G RAM each:
33 | ```bash
34 | export SCALA_HOME=/usr/share/java
35 | export SPARK_MEM=2500m
36 | export SPARK_WORKER_CORES=1
37 | export SPARK_WORKER_INSTANCES=4
38 | SPARK_JAVA_OPTS+=" -Dspark.local.dir=/mnt/spark"
39 | export SPARK_JAVA_OPTS
40 | export SPARK_MASTER_IP=192.168.0.100
41 | export JAVA_HOME=/usr/lib/jvm/java-1.7.0-openjdk-1.7.0.65.x86_64
42 | ```
43 |
44 | **Important**: Note that we have set the number of cores (threads) per worker to 1 and the number of workers equal to the number of cores on the machine. This is because:
45 | * The implementation that Hadoop uses to decode bzip2 files - `CBZip2InputStream` - is not thread-safe (there's a JIRA for that: https://issues.apache.org/jira/browse/HADOOP-10614). This means that allotting multiple threads to a single worker while using .bz2 input files will cause the jobs to fail.
46 | * Running multiple JVMs rather than a single huge JVM often improves performance.
47 |
48 | While running tests we have found that setting `spark.executor.memory` to 2500m-3000m works well with the above sample configuration.
This property is set in the sample dist-config.properties file discussed in the next section.
49 |
50 | Finally, start the cluster:
51 |
52 | ```
53 | sbin/start-all.sh
54 | ```
55 |
56 | We have added a script for setting up Spark and Hadoop on Google Compute Engine with the optimal settings for this framework. You can find it in the **gce** directory.
57 |
58 | Please refer to the [Spark official docs](http://spark.apache.org/docs/0.9.1/spark-standalone.html) for details on how to deploy Spark in standalone mode.
59 |
60 | ## How to Build
61 |
62 | Clone the latest version of the repo and switch to the stage branch:
63 |
64 | $ git clone https://github.com/dbpedia/distributed-extraction-framework.git
65 | $ cd distributed-extraction-framework
66 | $ mvn clean install -Dmaven.test.skip=true # Compiles the code without running tests
67 |
68 | ## Dump-based Distributed Extraction
69 |
70 | Follow the instructions given below to download data for the extractions you need to perform. An example download.properties file is given at `download/src/test/resources/download.properties`.
71 |
72 | In the root directory, run the following commands:
73 |
74 | $ mvn clean install -Dmaven.test.skip=true # Compiles the code without running tests
75 | $ ./run download config=download.properties # Downloads the Wikipedia dumps
76 |
77 | **Points to keep in mind:**
78 |
79 | 1. Before performing extractions, you will need a config.properties file for the general extraction configuration and a dist-config.properties file for the configuration specific to the distributed framework (Spark, Hadoop, logging etc.). Examples are given at `extraction/src/test/resources/`.
80 |
81 | 2. The example `extraction/src/test/resources/dist-config.properties` file needs to be modified with a proper spark-home and spark-master (`local[N]` means N cores on the local node - you can change it to something like `spark://hostname:7077` to run in distributed mode).
82 |
83 | 3. Prefer pages-articles-multistream.bz2 files to pages-articles.bz2 files because they are more efficient for parallel extraction: the former can be decompressed in parallel using Hadoop's splittable Bzip2Codec. Of course, this does not matter when using the pages-articlesX.xml-pXXXXXXXXXXpXXXXXXXXXX.bz2 files (which will be the files of choice for distributed downloads).
84 |
85 | 4. **Important:** Finally, when running on a distributed cluster, it is essential that you set `spark.cores.max` (in dist-config.properties) to **N** \* **M**, where N = total no. of slaves and M = `SPARK_WORKER_INSTANCES`. This ensures that Spark uses as many cores (across the entire cluster) as there are workers. For example, with 4 slaves and `SPARK_WORKER_INSTANCES=4`, set `spark.cores.max` to 16.
86 |
87 | Now perform parallel extractions on your Spark cluster:
88 |
89 | $ ./run extraction extraction/src/test/resources/config.properties extraction/src/test/resources/dist-config.properties
90 |
91 |
92 | ### Testing
93 | Please see the [wiki page for Testing](https://github.com/dbpedia/distributed-extraction-framework/wiki/Testing) for detailed instructions on how to verify the outputs of the distributed extraction framework by comparing them with those of the original framework.
94 |
95 | ## Distributed Downloads
96 |
97 | This is still a work in progress and there are some issues that need to be solved.
98 |
99 | Have a look at `download/src/test/resources/dist-download.properties` and `download/src/test/resources/download.properties`. You can create your own config files using them. Just make sure that they are present at the same path on all nodes of the cluster.
100 | 101 | After cloning and building the framework on the master node, for each slave node, do this: 102 | ``` 103 | rsync -avhz --progress ~/.m2 $SLAVE:~/ 104 | rsync -avhz --progress /path/to/distributed-extraction-framework $SLAVE:/path/to/ 105 | ../run download distconfig=/path/to/distributed-extraction-framework/download/src/test/resources/dist-download.properties config=/path/to/distributed-extraction-framework/download/src/test/resources/download.properties 106 | ``` 107 | 108 | You can find the worker logs at `/path/to/distributed-extraction-framework/logs` of each node. 109 | 110 | -------------------------------------------------------------------------------- /clean-install-run: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Shortcut for 4 | # mvn -f ../pom.xml clean install && mvn scala:run -Dlauncher=... -DaddArgs=... 5 | # Must be called with one of the modules (core/, dump/, ...) as current directory. 6 | # Example: 7 | # extraction_framework/core> ../clean-install-run LAUNCHER ARG1 ARG2 ARG3 8 | # is equivalent to 9 | # extraction_framework/core> mvn -f ../pom.xml clean install && mvn scala:run "-Dlauncher=LAUNCHER" "-DaddArgs=ARG1|ARG2|ARG3" 10 | 11 | # if we're not on a terminal, use batch mode to avoid ugly log files 12 | [ ! -t 1 ] && BATCH="-B" 13 | mvn $BATCH -f ../pom.xml clean && . ../install-run "$@" 14 | -------------------------------------------------------------------------------- /common/pom.xml: -------------------------------------------------------------------------------- 1 | 2 | 5 | 4.0.0 6 | 7 | 8 | org.dbpedia 9 | distributed-extraction 10 | 4.1-SNAPSHOT 11 | 12 | 13 | org.dbpedia.distributed-extraction 14 | common 15 | 4.1-SNAPSHOT 16 | DBpedia Distributed Extraction Framework Commons 17 | 18 | 19 | 20 | 21 | net.alchim31.maven 22 | scala-maven-plugin 23 | 24 | 25 | 26 | 27 | 28 | 29 | org.dbpedia.extraction 30 | core 31 | 4.1 32 | 33 | 34 | 35 | org.apache.hadoop 36 | hadoop-client 37 | ${hadoop.version} 38 | 39 | 40 | 41 | org.apache.hadoop 42 | hadoop-common 43 | ${hadoop.version} 44 | 45 | 46 | 47 | 48 | -------------------------------------------------------------------------------- /common/src/main/scala/org/dbpedia/extraction/util/HadoopConfigurable.scala: -------------------------------------------------------------------------------- 1 | package org.dbpedia.extraction.util 2 | 3 | import org.apache.hadoop.fs.Path 4 | import org.dbpedia.extraction.util.RichHadoopPath.wrapPath 5 | import org.apache.hadoop.conf.Configuration 6 | 7 | /** 8 | * Trait for classes that need to create a Hadoop Configuration. 9 | */ 10 | trait HadoopConfigurable 11 | { 12 | /** Path to hadoop core-site.xml */ 13 | protected val hadoopCoreConf: String 14 | 15 | /** Path to hadoop hdfs-site.xml */ 16 | protected val hadoopHdfsConf: String 17 | 18 | /** Path to hadoop mapred-site.xml */ 19 | protected val hadoopMapredConf: String 20 | 21 | /** Hadoop Configuration. This is implicit because RichHadoopPath operations need it. */ 22 | implicit lazy val hadoopConf = 23 | { 24 | val hadoopConf = new Configuration() 25 | 26 | if (hadoopCoreConf != null) 27 | hadoopConf.addResource(new Path(hadoopCoreConf)) 28 | if (hadoopHdfsConf != null) 29 | hadoopConf.addResource(new Path(hadoopHdfsConf)) 30 | if (hadoopMapredConf != null) 31 | hadoopConf.addResource(new Path(hadoopMapredConf)) 32 | 33 | hadoopConf 34 | } 35 | 36 | /** 37 | * Checks if a Path exists. 
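*
* Usage sketch (the path below is illustrative only, not taken from this repo):
* {{{
*   // fail fast if the configured base directory is missing on the target file system
*   checkPathExists(Some(new Path("/data/wikidumps")), pathMustExist = true)
* }}}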
38 | * 39 | * @param path Option[Path] if this is None, pathMustExist has no effect. 40 | * @param pathMustExist Boolean to ensure that the Path, if obtained, actually exists. 41 | * @throws RuntimeException if Option[Path] is defined but the path does not exist 42 | * @return the Option[Path] given as input 43 | */ 44 | def checkPathExists(path: Option[Path], pathMustExist: Boolean): Option[Path] = 45 | { 46 | // If pathMustExist is set to true, and path is defined but it does not exist, throw an error. 47 | if (pathMustExist && path.isDefined && !path.get.exists) 48 | { 49 | val hadoopHint = if (hadoopCoreConf == null || hadoopHdfsConf == null || hadoopMapredConf == null) " Make sure you configured Hadoop correctly and the directory exists on the configured file system." else "" 50 | throw sys.error("Dir " + path.get.getSchemeWithFileName + " does not exist." + hadoopHint) 51 | } 52 | path 53 | } 54 | } 55 | -------------------------------------------------------------------------------- /common/src/main/scala/org/dbpedia/extraction/util/RichHadoopPath.scala: -------------------------------------------------------------------------------- 1 | package org.dbpedia.extraction.util 2 | 3 | import java.io.{IOException, FileNotFoundException,OutputStream, InputStream} 4 | import org.apache.hadoop.fs.{FileSystem, Path} 5 | import org.apache.hadoop.conf.Configuration 6 | import java.nio.file.NotDirectoryException 7 | import scala.language.implicitConversions 8 | 9 | object RichHadoopPath { 10 | 11 | implicit def wrapPath(path: Path)(implicit hadoopConf: Configuration) = new RichHadoopPath(path, hadoopConf) 12 | 13 | implicit def toPath(path: String) = new Path(path) 14 | 15 | } 16 | 17 | /** 18 | * This class lets us use org.apache.hadoop.fs.Path seamlessly wherever a FileLike is used. 19 | * Defines additional methods on Path by using an implicit Configuration. 20 | */ 21 | class RichHadoopPath(path: Path, conf: Configuration) extends FileLike[Path] { 22 | 23 | private val fs: FileSystem = path.getFileSystem(conf) 24 | 25 | override def toString: String = path.toString 26 | 27 | override def name: String = path.getName 28 | 29 | /** 30 | * @throws NotDirectoryException if the path is not a directory 31 | * @throws FileNotFoundException if the path does not exist 32 | */ 33 | override def hasFiles: Boolean = { 34 | isDirectory match { 35 | // Not a directory? 36 | case false => throw new NotDirectoryException(path.toString) 37 | // Contains files? 38 | case true => if(fs.listStatus(path).size > 0) true else false 39 | } 40 | } 41 | 42 | override def delete(recursive: Boolean = false): Unit = { 43 | if(!fs.delete(path, recursive)) 44 | throw new IOException("failed to delete path ["+path+"]") 45 | } 46 | 47 | override def resolve(name: String): Path = new Path(path, name) 48 | 49 | override def exists: Boolean = fs.exists(path) 50 | 51 | // TODO: more efficient type than List? 52 | override def names: List[String] = names("*") 53 | 54 | // TODO: more efficient type than List? 55 | def names(glob: String): List[String] = list(glob).map(_.getName) 56 | 57 | // TODO: more efficient type than List? 58 | override def list: List[Path] = list("*") 59 | 60 | // TODO: more efficient type than List? 
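// Lists the paths under this directory that match the given glob pattern (e.g. "*.xml.bz2");
// throws an IOException if no matching files are found.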
61 | def list(glob: String): List[Path] = { 62 | val list = fs.globStatus(new Path(path, glob)).map(_.getPath).toList 63 | if(list.isEmpty) throw new IOException("failed to list files in ["+path+"]") 64 | list 65 | } 66 | 67 | override def size: Long = fs.getContentSummary(path).getLength 68 | 69 | override def isFile: Boolean = fs.isFile(path) 70 | 71 | override def isDirectory: Boolean = fs.getFileStatus(path).isDirectory 72 | 73 | override def inputStream(): InputStream = fs.open(path) 74 | 75 | override def outputStream(append: Boolean = false): OutputStream = if(append) fs.append(path) else fs.create(path) 76 | 77 | def mkdirs(): Boolean = fs.mkdirs(path) 78 | 79 | def getSchemeWithFileName: String = fs.getScheme + "://" + path.toUri.getPath 80 | } 81 | -------------------------------------------------------------------------------- /download/pom.xml: -------------------------------------------------------------------------------- 1 | 2 | 5 | 4.0.0 6 | 7 | 8 | org.dbpedia 9 | distributed-extraction 10 | 4.1-SNAPSHOT 11 | 12 | 13 | org.dbpedia.distributed-extraction 14 | download 15 | 4.1-SNAPSHOT 16 | DBpedia Distributed Dump Downloader 17 | 18 | 19 | 20 | 21 | 22 | org.apache.maven.plugins 23 | maven-shade-plugin 24 | 2.2 25 | 26 | 27 | 28 | *:* 29 | 30 | META-INF/*.SF 31 | META-INF/*.DSA 32 | META-INF/*.RSA 33 | 34 | 35 | 36 | 37 | 38 | 39 | downloads-jar 40 | package 41 | 42 | shade 43 | 44 | 45 | 46 | 47 | 51 | 52 | reference.conf 53 | 54 | 56 | 57 | worker.Main 58 | 59 | 60 | 61 | 62 | 63 | 64 | 65 | 66 | 67 | net.alchim31.maven 68 | scala-maven-plugin 69 | 70 | 71 | 72 | 73 | seq-download 74 | org.dbpedia.extraction.dump.download.Download 75 | 84 | 85 | 86 | 87 | 88 | download 89 | org.dbpedia.extraction.dump.download.DistDownload 90 | 99 | 100 | 101 | 102 | 103 | 104 | 105 | 106 | 107 | 108 | 109 | org.dbpedia.extraction 110 | dump 111 | 4.1 112 | 113 | 114 | 115 | org.dbpedia.distributed-extraction 116 | common 117 | 4.1-SNAPSHOT 118 | 119 | 120 | 121 | org.apache.hadoop 122 | hadoop-client 123 | ${hadoop.version} 124 | 125 | 126 | 127 | com.typesafe.akka 128 | akka-contrib_2.10 129 | 2.3.0 130 | 131 | 132 | 133 | com.typesafe.akka 134 | akka-testkit_2.10 135 | 2.3.0 136 | 137 | 138 | 139 | com.jcraft 140 | jsch 141 | 0.1.51 142 | 143 | 144 | 145 | -------------------------------------------------------------------------------- /download/src/main/resources/application.conf: -------------------------------------------------------------------------------- 1 | akka { 2 | actor.provider = "akka.cluster.ClusterActorRefProvider" 3 | 4 | remote.netty.tcp { 5 | hostname="127.0.0.1" 6 | port=0 7 | } 8 | 9 | extensions = ["akka.contrib.pattern.ClusterReceptionistExtension"] 10 | 11 | akka.cluster.auto-down = on 12 | 13 | auto-down-unreachable-after = 10s 14 | 15 | log-dead-letters = 0 16 | 17 | log-dead-letters-during-shutdown = off 18 | } 19 | -------------------------------------------------------------------------------- /download/src/main/resources/reference.conf: -------------------------------------------------------------------------------- 1 | akka { 2 | actor.provider = "akka.cluster.ClusterActorRefProvider" 3 | 4 | remote.netty.tcp { 5 | hostname="127.0.0.1" 6 | port=0 7 | } 8 | 9 | extensions = ["akka.contrib.pattern.ClusterReceptionistExtension"] 10 | 11 | akka.cluster.auto-down = on 12 | 13 | auto-down-unreachable-after = 10s 14 | 15 | log-dead-letters = 0 16 | 17 | log-dead-letters-during-shutdown = 0 18 | } 19 | 
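# Note: the master and frontend override akka.remote.netty.tcp.hostname (and the master also sets
# akka.cluster.roles) at runtime via ConfigFactory.parseString(...).withFallback(ConfigFactory.load());
# see ClusterStartup in DistDownload.scala. The loopback hostname above is only a fallback.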
-------------------------------------------------------------------------------- /download/src/main/scala/org/dbpedia/extraction/dump/download/ActoredCounter.scala: -------------------------------------------------------------------------------- 1 | package org.dbpedia.extraction.dump.download 2 | 3 | import akka.actor.ActorRef 4 | import java.io.InputStream 5 | import java.net.URLConnection 6 | import org.dbpedia.extraction.util.CountingInputStream 7 | import org.dbpedia.extraction.dump.download.actors.message.DownloaderProgressMessage 8 | import DownloaderProgressMessage.{Start, Read} 9 | import Counter.getContentLength 10 | 11 | /** 12 | * A Downloader mixin to be used with DownloadProgressTracker. Sends Start/Read messages to 13 | * the DownloadProgressTracker actor reference. 14 | * 15 | * @see org.dbpedia.extraction.dump.download.actors.DownloadProgressTracker 16 | */ 17 | trait ActoredCounter extends Downloader 18 | { 19 | /** 20 | * Reference to a DownloadProgressTracker actor. 21 | */ 22 | val progressActor: ActorRef 23 | 24 | protected abstract override def inputStream(conn: URLConnection): InputStream = { 25 | def logger(bytesRead: Long, close: Boolean): Unit = progressActor ! Read(bytesRead) 26 | progressActor ! Start(getContentLength(conn)) // Signal start of download with the total file size in bytes 27 | new CountingInputStream(super.inputStream(conn), logger) 28 | } 29 | 30 | } 31 | -------------------------------------------------------------------------------- /download/src/main/scala/org/dbpedia/extraction/dump/download/DistDownload.scala: -------------------------------------------------------------------------------- 1 | package org.dbpedia.extraction.dump.download 2 | 3 | import akka.actor._ 4 | import akka.cluster.Cluster 5 | import akka.contrib.pattern.{ClusterSingletonManager, ClusterClient} 6 | import com.typesafe.config.ConfigFactory 7 | import scala.concurrent.duration._ 8 | import scala.language.postfixOps 9 | import org.dbpedia.extraction.dump.download.actors._ 10 | import akka.actor.RootActorPath 11 | import scala.Some 12 | import java.util.logging.Logger 13 | import org.dbpedia.extraction.util.RemoteExecute 14 | import org.dbpedia.extraction.dump.download.actors.DownloadClient.Finished 15 | 16 | /** 17 | * Distributed Wikipedia dump downloader. 18 | * 19 | * While running this on a cluster, make sure that all configuration variables (including the paths to configuration files) 20 | * are valid in all nodes of the cluster, ie. the configuration files need to be present on the worker nodes too. 
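*
* Typical invocation from the repository root (per the README; the paths point to the sample
* configs shipped with this module):
* {{{
*   ./run download distconfig=download/src/test/resources/dist-download.properties \
*       config=download/src/test/resources/download.properties
* }}}
* The driver starts the Master singleton and a DownloadClient frontend, then launches worker
* JVMs on each slave over SSH (see ClusterStartup below); the workers join the Akka cluster
* and pull download jobs from the master.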
21 | */ 22 | object DistDownload extends RemoteExecute 23 | { 24 | val logger = Logger.getLogger(classOf[DistDownload].getName) 25 | 26 | def main(args: Array[String]): Unit = 27 | { 28 | val config = new DistDownloadConfig(args) 29 | if (config.isMaster) 30 | { 31 | val cluster = new ClusterStartup(config) 32 | 33 | // Start master on the driver node 34 | val joinAddress = cluster.startMaster(None, "driver") 35 | Thread.sleep(5000) // wait a few sec for master to start up 36 | 37 | (config.privateKey, config.sshPassphrase) match 38 | { 39 | case (Some(identity), Some(passphrase)) => // both private key and passphrase are provided 40 | addIdentity(identity, passphrase) 41 | case (Some(identity), None) => // passphrase is empty 42 | addIdentity(identity) 43 | case _ => // no private key provided 44 | } 45 | 46 | for (host <- config.slaves) 47 | { 48 | val session = createSession(config.userName, host) 49 | for (worker <- 1 to config.workersPerSlave) 50 | { 51 | val command = """cd %s/download;mkdir -p ../logs;nohup ../run download join=%s %s > ../logs/%s-%d.out &""". 52 | format(config.homeDir, joinAddress, args.mkString(" "), host, worker) 53 | println(command) 54 | println(execute(session, command)) 55 | } 56 | session.disconnect() 57 | } 58 | 59 | // Start download client and result/progress consumer 60 | val client = cluster.startFrontend(joinAddress) 61 | val dumpFiles = new DumpFileSource(config.languages, 62 | config.baseUrl, 63 | config.baseDir, 64 | config.wikiName, 65 | config.ranges, 66 | config.dateRange, 67 | config.dumpCount) 68 | for(dumpFile <- dumpFiles) 69 | client ! dumpFile 70 | 71 | client ! Finished 72 | } 73 | else 74 | { 75 | val cluster = new ClusterStartup(config) 76 | cluster.startWorker(config.joinAddress.get) 77 | } 78 | } 79 | } 80 | 81 | class DistDownload 82 | 83 | class ClusterStartup(config: DistDownloadConfig) 84 | { 85 | def systemName = "Workers" 86 | 87 | private def progressReportTimeout = config.progressReportInterval + 2.seconds 88 | 89 | def startMaster(joinAddressOption: Option[Address], role: String): Address = 90 | { 91 | val conf = ConfigFactory.parseString( s"""akka.cluster.roles=[$role]\nakka.remote.netty.tcp.hostname="${config.master}""""). 92 | withFallback(ConfigFactory.load()) 93 | val system = ActorSystem(systemName, conf) 94 | val joinAddress = joinAddressOption.getOrElse(Cluster(system).selfAddress) 95 | Cluster(system).join(joinAddress) 96 | system.actorOf( 97 | ClusterSingletonManager.props(Master.props( 98 | progressReportTimeout, 99 | config.mirrors, 100 | config.threadsPerMirror 101 | ), 102 | "active", PoisonPill, Some(role) 103 | ), 104 | "master") 105 | joinAddress 106 | } 107 | 108 | def startFrontend(joinAddress: akka.actor.Address): ActorRef = 109 | { 110 | val conf = ConfigFactory.parseString( s"""akka.remote.netty.tcp.hostname="${config.master}""""). 
111 | withFallback(ConfigFactory.load()) 112 | val system = ActorSystem(systemName, conf) 113 | Cluster(system).join(joinAddress) 114 | 115 | val client = system.actorOf(Props[DownloadClient], "client") 116 | system.actorOf(Props[DownloadResultConsumer], "consumer") 117 | client 118 | } 119 | 120 | def startWorker(contactAddress: akka.actor.Address) = 121 | { 122 | val conf = ConfigFactory.load() 123 | val system = ActorSystem(systemName, conf) 124 | val initialContacts = Set(system.actorSelection(RootActorPath(contactAddress) / "user" / "receptionist")) 125 | val clusterClient = system.actorOf(ClusterClient.props(initialContacts), "clusterClient") 126 | system.actorOf( 127 | Worker.props(clusterClient, 128 | DownloadJobRunner.props(config.progressReportInterval, 129 | config.hadoopConf, 130 | config.localTempDir, 131 | config.unzip 132 | ), 133 | config.maxDuplicateProgress 134 | ), 135 | "worker" 136 | ) 137 | } 138 | } -------------------------------------------------------------------------------- /download/src/main/scala/org/dbpedia/extraction/dump/download/DumpFileSource.scala: -------------------------------------------------------------------------------- 1 | package org.dbpedia.extraction.dump.download 2 | 3 | import org.dbpedia.extraction.util.{Language, WikiInfo} 4 | import scala.io.{Source, Codec} 5 | import java.net.URL 6 | import scala.collection.mutable 7 | import scala.collection.immutable.SortedSet 8 | import scala.collection.mutable.{ListBuffer, Set} 9 | import org.apache.hadoop.fs.Path 10 | import org.dbpedia.extraction.dump.download.actors.message.DumpFile 11 | 12 | /** 13 | * Generate DumpFile objects each representing a specific wiki file to download. 14 | * Most of the code was taken from LanguageDownloader (extraction-framework). 15 | * 16 | * TODO: Integrate this to LanguageDownloader and reuse it here (reduce code duplication)? 17 | */ 18 | class DumpFileSource(languages: mutable.HashMap[Language, mutable.Set[(String, Boolean)]], 19 | baseUrl: URL, 20 | baseDir: Path, 21 | wikiSuffix: String, 22 | ranges: mutable.HashMap[(Int, Int), mutable.Set[(String, Boolean)]], 23 | dateRange: (String, String), 24 | dumpCount: Int) 25 | extends Traversable[DumpFile] with Iterable[DumpFile] 26 | { 27 | private val DateLink = """""".r 28 | private val list = new ListBuffer[DumpFile]() 29 | 30 | override def iterator: Iterator[DumpFile] = list.iterator 31 | 32 | override def foreach[U](func: DumpFile => U) 33 | { 34 | if(list.isEmpty) 35 | { 36 | // resolve page count ranges to languages 37 | if (ranges.nonEmpty) 38 | { 39 | val wikis = WikiInfo.fromURL(WikiInfo.URL, Codec.UTF8) 40 | 41 | // for all wikis in one of the desired ranges... 
42 | for (((from, to), files) <- ranges; wiki <- wikis; if from <= wiki.pages && wiki.pages <= to) 43 | { 44 | // ...add files for this range to files for this language 45 | languages.getOrElseUpdate(wiki.language, new mutable.HashSet[(String, Boolean)]) ++= files 46 | } 47 | } 48 | 49 | // sort them to have reproducible behavior 50 | val languageKeys = SortedSet.empty[Language] ++ languages.keys 51 | languageKeys.foreach 52 | { 53 | lang => 54 | val done = languageKeys.until(lang) 55 | val todo = languageKeys.from(lang) 56 | println("done: " + done.size + " - " + done.map(_.wikiCode).mkString(",")) 57 | println("todo: " + todo.size + " - " + languageKeys.from(lang).map(_.wikiCode).mkString(",")) 58 | for(dumpFile <- LanguageDumpFileSource(lang)) 59 | list += dumpFile 60 | } 61 | } 62 | list foreach func 63 | } 64 | 65 | private class LanguageDumpFileSource(language: Language) extends Traversable[DumpFile] 66 | { 67 | val wiki = language.filePrefix + wikiSuffix 68 | val mainPage = new URL(baseUrl, wiki + "/") 69 | val fileNames = languages(language) 70 | 71 | override def foreach[U](func: DumpFile => U) 72 | { 73 | forDates(dateRange, dumpCount, func) 74 | } 75 | 76 | def forDates[U](dateRange: (String, String), dumpCount: Int, func: DumpFile => U) 77 | { 78 | val (firstDate, lastDate) = dateRange 79 | 80 | var dates = SortedSet.empty(Ordering[String].reverse) 81 | for (line <- Source.fromURL(mainPage).getLines()) 82 | DateLink.findAllIn(line).matchData.foreach(dates += _.group(1)) 83 | 84 | if (dates.size == 0) throw new Exception("found no date - " + mainPage + " is probably broken or unreachable. check your network / proxy settings.") 85 | 86 | var count = 0 87 | 88 | // find date pages that have all files we want 89 | for (date <- dates) 90 | { 91 | if (count < dumpCount && date >= firstDate && date <= lastDate && forDate(date, func)) count += 1 92 | } 93 | 94 | if (count == 0) throw new Exception("found no date on " + mainPage + " in range " + firstDate + "-" + lastDate + " with files " + fileNames.mkString(",")) 95 | } 96 | 97 | def forDate[U](date: String, func: DumpFile => U): Boolean = 98 | { 99 | val datePage = new URL(mainPage, date + "/") // here we could use index.html 100 | val datePageLines = Source.fromURL(datePage).getLines().toTraversable 101 | 102 | // Collect regexes 103 | val regexes = fileNames.filter(_._2).map(_._1) 104 | val fileNamesFromRegexes = expandFilenameRegex(date, datePageLines, regexes) 105 | val staticFileNames = fileNames.filter(!_._2).map(_._1) 106 | 107 | val allFileNames = fileNamesFromRegexes ++ staticFileNames 108 | // val urls = allFileNames.map(fileName => new URL(baseURL, wiki + "/" + date + "/" + wiki + "-" + date + "-" + fileName)) 109 | val dumpFiles = allFileNames.map(fileName => DumpFile(baseDir.toUri.getPath, wikiSuffix, language.wikiCode, date, fileName)) 110 | 111 | 112 | // all the links we need - only for non regexes (we have already checked regex ones) 113 | val links = new mutable.HashMap[String, String]() 114 | for (fileName <- staticFileNames) links(fileName) = "" 115 | // Here we should set "" 116 | // but "\"/"+wiki+"/"+date+"/" does not exists in incremental updates, keeping the trailing "\">" should do the trick 117 | // for (fileName <- fileNames) links(fileName) = wiki+"-"+date+"-"+fileName+"\">" 118 | 119 | for (line <- datePageLines) 120 | links.foreach 121 | { 122 | case (fileName, link) => if (line contains link) links -= fileName 123 | } 124 | 125 | // did we find them all? 
126 | // Fail if: 127 | // - the user specified static file names and not all of them have been found 128 | // OR 129 | // - the user specified regular expressions and no file has been found that satisfied them 130 | if ((staticFileNames.nonEmpty && links.nonEmpty) || (regexes.nonEmpty && fileNamesFromRegexes.isEmpty)) 131 | { 132 | // TODO: Fix message 133 | val staticFilesMessage = if (links.nonEmpty) " has no links to [" + links.keys.mkString(",") + "]" else "" 134 | val dynamicFilesMessage = if (fileNamesFromRegexes.isEmpty && regexes.nonEmpty) " has no links that satisfies [" + regexes.mkString(",") + "]" else "" 135 | println("date page '" + datePage + staticFilesMessage + dynamicFilesMessage) 136 | false 137 | } 138 | else 139 | { 140 | println("date page '" + datePage + "' has all files [" + allFileNames.mkString(",") + "]") 141 | // run closure over all DumpFiles 142 | for (dumpFile <- dumpFiles) func(dumpFile) 143 | true 144 | } 145 | } 146 | 147 | private def expandFilenameRegex(date: String, index: Traversable[String], filenameRegexes: mutable.Set[String]): mutable.Set[String] = 148 | { 149 | // Prepare regexes 150 | val regexes = filenameRegexes.map(regex => ("").r) 151 | 152 | // Result 153 | val filenames = Set[String]() 154 | 155 | for (line <- index) 156 | regexes.foreach(regex => regex.findAllIn(line).matchData.foreach(filenames += _.group(1))) 157 | 158 | filenames 159 | } 160 | } 161 | 162 | private object LanguageDumpFileSource 163 | { 164 | def apply(language: Language) = new LanguageDumpFileSource(language) 165 | } 166 | 167 | } 168 | -------------------------------------------------------------------------------- /download/src/main/scala/org/dbpedia/extraction/dump/download/actors/DownloadClient.scala: -------------------------------------------------------------------------------- 1 | package org.dbpedia.extraction.dump.download.actors 2 | 3 | import akka.actor.{ActorLogging, Actor} 4 | import java.util.UUID 5 | import scala.concurrent.duration._ 6 | import akka.pattern._ 7 | import akka.contrib.pattern.{DistributedPubSubMediator, DistributedPubSubExtension} 8 | import akka.contrib.pattern.DistributedPubSubMediator.Send 9 | import org.dbpedia.extraction.dump.download.actors.message.GeneralMessage.{MasterQueueEmpty, ShutdownCluster} 10 | import akka.util.Timeout 11 | import org.dbpedia.extraction.dump.download.actors.message.{DumpFile, DownloadJob} 12 | 13 | /** 14 | * A client actor used to submit download jobs to the master. To submit a job, a DumpFile object is sent as message. 15 | */ 16 | class DownloadClient extends Actor with ActorLogging 17 | { 18 | 19 | import DownloadClient._ 20 | import context.dispatcher 21 | 22 | def scheduler = context.system.scheduler 23 | 24 | def nextDownloadId(): String = UUID.randomUUID().toString 25 | 26 | val mediator = DistributedPubSubExtension(context.system).mediator 27 | mediator ! DistributedPubSubMediator.Subscribe(Master.General, self) 28 | 29 | implicit val timeout = Timeout(10.seconds) 30 | 31 | var canShutDownCluster = false 32 | 33 | def receive = 34 | { 35 | case Finished => 36 | // send this when no more DumpFiles are to be added - ready for shutdown 37 | canShutDownCluster = true 38 | 39 | case MasterQueueEmpty => 40 | if (canShutDownCluster) self ! ShutdownCluster 41 | 42 | case ShutdownCluster => 43 | mediator ! Send("/user/master/active", ShutdownCluster, localAffinity = false) 44 | context.stop(self) 45 | context.system.shutdown() 46 | context.become(shuttingDown) 47 | 48 | case file: DumpFile => 49 | self ! 
DownloadJob(nextDownloadId(), file) 50 | 51 | case job: DownloadJob => 52 | (mediator ? Send("/user/master/active", job, localAffinity = false)) map 53 | { 54 | case Master.Ack(_) => 55 | log.info("Job accepted by master: {}", job) 56 | } recover 57 | { 58 | case _ => 59 | log.info("Job not accepted, retry after a while") 60 | scheduler.scheduleOnce(3.seconds, self, job) 61 | } 62 | } 63 | 64 | def shuttingDown: Receive = 65 | { 66 | case _ => // ignore all messages, shutting down cluster. 67 | } 68 | } 69 | 70 | object DownloadClient 71 | { 72 | case object Finished 73 | } 74 | -------------------------------------------------------------------------------- /download/src/main/scala/org/dbpedia/extraction/dump/download/actors/DownloadJobRunner.scala: -------------------------------------------------------------------------------- 1 | package org.dbpedia.extraction.dump.download.actors 2 | 3 | import akka.actor.{ActorLogging, Props, Actor} 4 | import akka.pattern.ask 5 | import akka.util.Timeout 6 | import org.dbpedia.extraction.dump.download.{Unzip, ActoredCounter, FileDownloader} 7 | import org.dbpedia.extraction.util.{Language, Finder} 8 | import java.net.URL 9 | import org.dbpedia.extraction.util.RichHadoopPath.wrapPath 10 | import org.apache.hadoop.fs.Path 11 | import org.apache.hadoop.conf.Configuration 12 | import java.io.File 13 | import scala.concurrent.Future 14 | import scala.concurrent.duration._ 15 | import scala.language.postfixOps 16 | import org.dbpedia.extraction.dump.download.actors.Worker.DownloadComplete 17 | import org.dbpedia.extraction.dump.download.actors.message.{DumpFile, DownloadJob, MirroredDownloadJob, DownloaderProgressMessage} 18 | import DownloaderProgressMessage.{ProgressEnd, Stop} 19 | import scala.util.{Failure, Success} 20 | 21 | /** 22 | * This actor is used by Worker to run a download job. 
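* The file is first downloaded into a local temporary directory and then moved onto the
* configured Hadoop file system; progress is relayed to the parent Worker through a child
* DownloadProgressTracker.
*
* Instantiated by the worker roughly as follows (cf. ClusterStartup.startWorker):
* {{{
*   DownloadJobRunner.props(progressInterval, hadoopConfiguration, tempDir, unzip)
* }}}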
23 | * 24 | * @param progressInterval Download progress report interval 25 | * @param hadoopConfiguration Hadoop Configuration 26 | * @param tempDir temporary directory on local file system to download to (before being moved to HDFS) 27 | * @param unzip true if file should be unzipped while downloading, false otherwise 28 | */ 29 | class DownloadJobRunner(progressInterval: FiniteDuration, hadoopConfiguration: Configuration, tempDir: File, unzip: Boolean) extends Actor with ActorLogging 30 | { 31 | implicit private val _hadoopConfiguration = hadoopConfiguration 32 | implicit private val progressStopTimeout = Timeout(5 seconds) 33 | 34 | val progress = context.watch(context.actorOf(Props(classOf[DownloadProgressTracker], context.parent, progressInterval), "progress")) 35 | 36 | class Downloader extends FileDownloader with ActoredCounter 37 | { 38 | override val progressActor = progress 39 | } 40 | 41 | val downloader = 42 | if (unzip) new Downloader with Unzip 43 | else new Downloader 44 | 45 | def receive = 46 | { 47 | case job@MirroredDownloadJob(mirror, DownloadJob(_, DumpFile(base, wikiName, lang, date, fileName))) => 48 | log.debug("Received download job from Worker: {}", job) 49 | val s = sender() 50 | import context.dispatcher 51 | 52 | val baseDir = new Path(base) 53 | val finder = new Finder[Path](baseDir, Language(lang), wikiName) 54 | val wiki = finder.wikiName 55 | val dateDir = baseDir.resolve(wiki).resolve(date) 56 | if (!dateDir.exists && !dateDir.mkdirs) throw new Exception("Target directory [" + dateDir.getSchemeWithFileName + "] does not exist and cannot be created") 57 | if (!tempDir.exists && !tempDir.mkdirs) throw new Exception("Local temporary directory [" + tempDir + "] does not exist and cannot be created") 58 | 59 | val url = new URL(mirror, s"$wiki/$date/$wiki-$date-$fileName") 60 | val targetFile = new File(tempDir, downloader.targetName(url)) 61 | if(targetFile.exists) targetFile.delete() // delete file in temp dir if it already exists 62 | 63 | Future(downloader.downloadTo(url, tempDir)). 64 | onComplete 65 | { 66 | case Success(file) => 67 | // file was downloaded to tempDir; copy it to Hadoop FS. 68 | val fs = dateDir.getFileSystem(hadoopConfiguration) 69 | val outputPath = dateDir.resolve(file.getName) 70 | fs.moveFromLocalFile(new Path(file.toURI), outputPath) 71 | progress ? Stop onSuccess 72 | { 73 | case ProgressEnd(totalBytes) => 74 | s ! DownloadComplete(outputPath.getSchemeWithFileName, totalBytes) // Tell worker that download is finished 75 | } 76 | case Failure(t) => 77 | log.info(t.getMessage) 78 | progress ! 
Stop 79 | } 80 | } 81 | } 82 | 83 | object DownloadJobRunner 84 | { 85 | def props(progressInterval: FiniteDuration, hadoopConfiguration: Configuration, tempDir: File, unzip: Boolean = false): Props = 86 | Props(classOf[DownloadJobRunner], progressInterval, hadoopConfiguration, tempDir, unzip) 87 | } 88 | -------------------------------------------------------------------------------- /download/src/main/scala/org/dbpedia/extraction/dump/download/actors/DownloadProgressTracker.scala: -------------------------------------------------------------------------------- 1 | package org.dbpedia.extraction.dump.download.actors 2 | 3 | import akka.actor._ 4 | import java.util.concurrent.atomic.AtomicLong 5 | import scala.concurrent.duration.FiniteDuration 6 | import org.dbpedia.extraction.dump.download.actors.message.{DownloaderProgressMessage, WorkerProgressMessage} 7 | import scala.Some 8 | 9 | /** 10 | * An actor that receives Start and Read messages, and relays ProgressStart and Progress messages to the client. 11 | * This is used to keep track of download progress - the number of bytes being read in real time. 12 | * 13 | * @param client The actor to send progress messages to 14 | * @param notifyInterval The time interval at which progress reports will be sent to client 15 | */ 16 | class DownloadProgressTracker(client: ActorRef, notifyInterval: FiniteDuration) extends Actor with ActorLogging 17 | { 18 | import WorkerProgressMessage._ 19 | import DownloaderProgressMessage._ 20 | import DownloadProgressTracker._ 21 | import context.dispatcher 22 | 23 | def scheduler = context.system.scheduler 24 | 25 | private val bytesRead = new AtomicLong() 26 | 27 | /** This task is used to send Progress messages to client at each interval */ 28 | private var progressTaskOption: Option[Cancellable] = None 29 | 30 | override def postStop() = progressTaskOption.foreach(_.cancel()) 31 | 32 | def receive = 33 | { 34 | case Start(total) => // Sent by ActoredCounter to signal start of download 35 | if (0 != bytesRead.get() || progressTaskOption.isDefined) 36 | { 37 | log.info("ProgressTracker is already started!") 38 | } 39 | else 40 | { 41 | progressTaskOption = Some(scheduler.schedule(notifyInterval, notifyInterval, self, Tick)) 42 | client ! ProgressStart(total) 43 | } 44 | 45 | case Read(bytes) => // Sent by ActoredCounter to signal bytes read 46 | bytesRead.set(bytes) 47 | 48 | case Stop => 49 | (progressTaskOption, bytesRead.get) match 50 | { 51 | case (Some(progressTask), b) if b != 0 => 52 | sender ! ProgressEnd(bytesRead.get()) 53 | bytesRead.set(0) 54 | 55 | progressTask.cancel() 56 | progressTaskOption = None 57 | 58 | case _ => 59 | log.info("ProgressTracker is already stopped!") 60 | } 61 | 62 | case Tick => 63 | client ! 
Progress(bytesRead.get()) 64 | } 65 | } 66 | 67 | object DownloadProgressTracker 68 | { 69 | case object Tick 70 | } -------------------------------------------------------------------------------- /download/src/main/scala/org/dbpedia/extraction/dump/download/actors/DownloadResultConsumer.scala: -------------------------------------------------------------------------------- 1 | package org.dbpedia.extraction.dump.download.actors 2 | 3 | import akka.actor.{ActorLogging, Actor} 4 | import org.dbpedia.extraction.dump.download.actors.message.GeneralMessage.ShutdownCluster 5 | import akka.contrib.pattern.{DistributedPubSubExtension, DistributedPubSubMediator} 6 | import org.dbpedia.extraction.dump.download.actors.message.WorkerProgressMessage.{Progress, ProgressStart} 7 | import org.dbpedia.extraction.dump.download.actors.message.{DownloadProgress, DownloadResult, DownloadJob, MirroredDownloadJob} 8 | 9 | /** 10 | * This actor is used to print download progress logging messages on the driver/master node. 11 | * Hooks into Master.ResultsTopic and consumes DownloadResult messages. 12 | * 13 | * TODO: Refactor the code to pretty-print better progress results like ByteLogger. Maintain list of jobs 14 | * and log percentage of work done etc. 15 | */ 16 | class DownloadResultConsumer extends Actor with ActorLogging 17 | { 18 | var jobs = Map[String, MirroredDownloadJob]() 19 | val mediator = DistributedPubSubExtension(context.system).mediator 20 | mediator ! DistributedPubSubMediator.Subscribe(Master.General, self) 21 | mediator ! DistributedPubSubMediator.Subscribe(Master.ProgressTopic, self) 22 | mediator ! DistributedPubSubMediator.Subscribe(Master.ResultsTopic, self) 23 | 24 | def receive = 25 | { 26 | case _: DistributedPubSubMediator.SubscribeAck => 27 | 28 | case job @ MirroredDownloadJob(_, DownloadJob(downloadId, _)) => 29 | log.info("Starting job: {}", job) 30 | jobs += (downloadId -> job) 31 | 32 | case DownloadResult(downloadId, outputPath, bytes) => 33 | log.info("{}: {} bytes downloaded to {}", downloadId, bytes, outputPath) 34 | 35 | case DownloadProgress(downloadId, p @ ProgressStart(bytes)) => 36 | log.info("{}: {}", jobs(downloadId), p) 37 | 38 | case DownloadProgress(downloadId, p @ Progress(bytes)) => 39 | log.info("{}: {}", jobs(downloadId), p) 40 | 41 | case ShutdownCluster => 42 | context.stop(self) 43 | context.system.shutdown() 44 | } 45 | } 46 | -------------------------------------------------------------------------------- /download/src/main/scala/org/dbpedia/extraction/dump/download/actors/Master.scala: -------------------------------------------------------------------------------- 1 | package org.dbpedia.extraction.dump.download.actors 2 | 3 | import scala.concurrent.duration.{Deadline, FiniteDuration} 4 | import akka.actor._ 5 | import akka.contrib.pattern.{DistributedPubSubMediator, DistributedPubSubExtension} 6 | import scala.collection.immutable.Queue 7 | import org.dbpedia.extraction.dump.download.actors.message.GeneralMessage.{MasterQueueEmpty, ShutdownCluster} 8 | import org.dbpedia.extraction.dump.download.actors.message._ 9 | import java.net.URL 10 | import scala.Some 11 | import akka.contrib.pattern.DistributedPubSubMediator.Put 12 | 13 | /** 14 | * Master/driver node actor. This is responsible for accepting download jobs from a client and dividing jobs 15 | * among the different Workers, keeping track of download jobs, handling failed jobs, shutting down the cluster etc. 
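*
* Protocol sketch (see MasterWorkerMessage): workers register with RegisterWorker, request work
* with WorkerRequestsDownload, send ProgressReport messages while downloading, and finish with
* DownloadIsDone (acknowledged by Ack) or DownloadFailed. Progress and results are republished
* on the "progress" and "results" topics, which DownloadResultConsumer subscribes to.
* {{{
*   // created on the driver node wrapped in a ClusterSingletonManager, cf. ClusterStartup.startMaster
*   ClusterSingletonManager.props(Master.props(workTimeout, mirrors, threadsPerMirror),
*     "active", PoisonPill, Some("driver"))
* }}}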
16 | * 17 | * @param workTimeout Workers need to send download progress reports within this timeout 18 | * @param mirrors List of wikipedia mirror URLs 19 | * @param threadsPerMirror Number of simultaneous downloads per mirror 20 | */ 21 | class Master(workTimeout: FiniteDuration, mirrors: Seq[URL], threadsPerMirror: Int) extends Actor with ActorLogging 22 | { 23 | 24 | import Master._ 25 | import MasterWorkerMessage._ 26 | import context.dispatcher 27 | 28 | def scheduler = context.system.scheduler 29 | 30 | // The DownloadClient and DownloadResultConsumer communicate with the Master through the DistributedPubSubMediator 31 | val mediator = DistributedPubSubExtension(context.system).mediator 32 | 33 | mediator ! Put(self) 34 | 35 | private var workers = Map[String, WorkerState]() 36 | private var pendingDownloads = Queue[DownloadJob]() 37 | private var downloadIds = Set[String]() 38 | 39 | // Keep track of the number of simultaneous downloads per mirror. 40 | private var mirrorsInUse = (mirrors zip Seq.fill(mirrors.size)(0)).toMap // Mapping mirror URL to number of simultaneous downloads 41 | 42 | val cleanupTask = scheduler.schedule(workTimeout / 2, workTimeout / 2, 43 | self, CleanupTick) 44 | 45 | override def postStop(): Unit = cleanupTask.cancel() 46 | 47 | def receive = 48 | { 49 | case ShutdownCluster => 50 | if (pendingDownloads.isEmpty) // all downloads have finished? 51 | { 52 | if (workers.isEmpty) // all workers have been unregistered? 53 | { 54 | log.info("Stopping master!") 55 | mediator ! DistributedPubSubMediator.Publish(General, ShutdownCluster) 56 | self ! PoisonPill 57 | context.stop(self) 58 | context.system.shutdown() 59 | } 60 | else 61 | { 62 | workers.foreach // still have registered workers? 63 | { 64 | case (workerId, WorkerState(ref, Idle)) => // send shutdown signal to idle workers and remove them. 65 | ref ! ShutdownCluster 66 | workers -= workerId 67 | case _ => // come back to the busy worker after a period of workTimeout 68 | } 69 | log.debug("Some workers still busy! Cannot stop master yet!") 70 | context.system.scheduler.scheduleOnce(workTimeout, self, ShutdownCluster) 71 | } 72 | } 73 | else 74 | { 75 | log.debug("Some work pending! Cannot stop master yet!") 76 | context.system.scheduler.scheduleOnce(workTimeout, self, ShutdownCluster) 77 | } 78 | 79 | case RemoveWorker(workerId) => 80 | workers -= workerId 81 | 82 | case p @ ProgressReport(workerId, downloadId, progress) => // Workers send download progress reports at specific intervals 83 | log.debug("Heard from worker {}: {} ", workerId, progress) 84 | mediator ! DistributedPubSubMediator.Publish(ProgressTopic, DownloadProgress(downloadId, progress)) 85 | workers.get(workerId) match 86 | { 87 | case Some(s@WorkerState(_, Busy(downloadJob, deadline))) => 88 | workers += (workerId -> WorkerState(sender, status = Busy(downloadJob, Deadline.now + workTimeout))) // Renew current job deadline 89 | case _ => 90 | } 91 | 92 | case RegisterWorker(workerId) => // Workers register themselves to the master at specific intervals 93 | if (workers.contains(workerId)) 94 | { 95 | workers += (workerId -> workers(workerId).copy(ref = sender)) 96 | } 97 | else 98 | { 99 | log.info("Worker registered: {}", workerId) 100 | workers += (workerId -> WorkerState(sender, status = Idle)) 101 | if (pendingDownloads.nonEmpty) 102 | sender ! 
DownloadIsReady 103 | } 104 | 105 | case WorkerRequestsDownload(workerId) => 106 | if (pendingDownloads.nonEmpty) 107 | { 108 | workers.get(workerId) match 109 | { 110 | case Some(s@WorkerState(_, Idle)) => // is the requesting Worker Idle? 111 | getFreeMirror foreach 112 | { 113 | case url => // We have a free mirror! 114 | val (downloadJob, rest) = pendingDownloads.dequeue 115 | pendingDownloads = rest 116 | val downloadWithMirror = MirroredDownloadJob(url, downloadJob) 117 | 118 | // Publish new download job so that DownloadResultConsumer can keep track of it 119 | mediator ! DistributedPubSubMediator.Publish(ProgressTopic, downloadWithMirror) 120 | 121 | sender ! downloadWithMirror // send new download job back to the Worker that sent the job request 122 | log.info("Giving worker {} a download job {}", workerId, downloadWithMirror) 123 | 124 | mirrorsInUse += (url -> (mirrorsInUse(url) + 1)) // decrement no. of threads to mirror 125 | workers += (workerId -> s.copy(status = Busy(downloadWithMirror, Deadline.now + workTimeout))) // set worker status to Busy 126 | } 127 | case _ => 128 | } 129 | } 130 | 131 | case DownloadIsDone(workerId, downloadId, outputPath, totalBytes) => 132 | workers.get(workerId) match 133 | { 134 | case Some(s@WorkerState(_, Busy(downloadJob, _))) if downloadJob.job.downloadId == downloadId => 135 | log.debug("Download is done: {} => {} bytes written to {} by worker {}", downloadJob, totalBytes, outputPath, workerId) 136 | 137 | val mirror = downloadJob.baseUrl 138 | mirrorsInUse += (mirror -> (mirrorsInUse(mirror) - 1)) // decrement no. of threads to mirror 139 | workers += (workerId -> s.copy(status = Idle)) // set worker status to Idle 140 | 141 | // publish download result for DownloadResultConsumer to read 142 | mediator ! DistributedPubSubMediator.Publish(ResultsTopic, DownloadResult(downloadJob, outputPath, totalBytes)) 143 | 144 | sender ! MasterWorkerMessage.Ack(downloadId) // Ack to worker 145 | case _ => 146 | if (downloadIds.contains(downloadId)) 147 | { 148 | // previous Ack was lost, confirm again that this is done 149 | sender ! MasterWorkerMessage.Ack(downloadId) 150 | } 151 | } 152 | 153 | case DownloadFailed(workerId, downloadId) => 154 | workers.get(workerId) match 155 | { 156 | case Some(s@WorkerState(_, Busy(downloadJob, _))) if downloadJob.job.downloadId == downloadId => 157 | log.info("Download failed: {}", downloadJob) 158 | 159 | val mirror = downloadJob.baseUrl 160 | mirrorsInUse += (mirror -> (mirrorsInUse(mirror) - 1)) 161 | workers += (workerId -> s.copy(status = Idle)) 162 | 163 | pendingDownloads = pendingDownloads enqueue downloadJob.job // put the download back into queue 164 | notifyWorkers() 165 | case _ => 166 | } 167 | 168 | case job: DownloadJob => // client sent a new DownloadJob 169 | // idempotent 170 | if (downloadIds.contains(job.downloadId)) 171 | { 172 | sender ! Master.Ack(job.downloadId) 173 | } 174 | else 175 | { 176 | log.info("Accepted download: {}", job) 177 | pendingDownloads = pendingDownloads enqueue job 178 | downloadIds += job.downloadId 179 | sender ! 
Master.Ack(job.downloadId) 180 | notifyWorkers() 181 | } 182 | 183 | case CleanupTick => // runs at fixed intervals, removes timed out jobs 184 | var hasBusy = false 185 | for ((workerId, s@WorkerState(_, Busy(downloadJob, timeout))) <- workers) 186 | { 187 | hasBusy = true 188 | if (timeout.isOverdue) 189 | { 190 | log.info("Download timed out: {}", downloadJob) 191 | workers -= workerId 192 | pendingDownloads = pendingDownloads enqueue downloadJob.job 193 | notifyWorkers() 194 | } 195 | } 196 | // publish MasterQueueEmpty if there are no pending downloads AND no workers are busy 197 | if(!hasBusy && pendingDownloads.isEmpty) mediator ! DistributedPubSubMediator.Publish(General, MasterQueueEmpty) 198 | } 199 | 200 | def getFreeMirror: Option[URL] = 201 | mirrorsInUse.find(_._2 < threadsPerMirror) match 202 | { 203 | case Some((url, _)) => Some(url) 204 | case _ => None 205 | } 206 | 207 | /** Tell idle workers that download is ready */ 208 | def notifyWorkers(): Unit = 209 | if (pendingDownloads.nonEmpty) 210 | { 211 | // TODO: Pick workers more intelligently, according to number of bytes downloaded by each worker 212 | // to encourage better spreading out of downloads over the cluster - better for distributed processing too. 213 | workers.foreach 214 | { 215 | case (_, WorkerState(ref, Idle)) => ref ! DownloadIsReady 216 | case _ => // busy 217 | } 218 | } 219 | 220 | // TODO cleanup old workers 221 | // TODO cleanup old downloadIds 222 | } 223 | 224 | object Master 225 | { 226 | val ResultsTopic = "results" 227 | val ProgressTopic = "progress" 228 | val General = "general" 229 | 230 | def props(workTimeout: FiniteDuration, mirrors: Seq[URL], threadsPerMirror: Int): Props = 231 | Props(classOf[Master], workTimeout, mirrors, threadsPerMirror) 232 | 233 | case class Ack(downloadId: String) 234 | 235 | private sealed trait WorkerStatus 236 | private case object Idle extends WorkerStatus 237 | private case class Busy(job: MirroredDownloadJob, deadline: Deadline) extends WorkerStatus 238 | private case class WorkerState(ref: ActorRef, status: WorkerStatus) 239 | 240 | private case object CleanupTick 241 | 242 | } -------------------------------------------------------------------------------- /download/src/main/scala/org/dbpedia/extraction/dump/download/actors/Worker.scala: -------------------------------------------------------------------------------- 1 | package org.dbpedia.extraction.dump.download.actors 2 | 3 | import akka.actor._ 4 | import scala.concurrent.duration._ 5 | import java.util.UUID 6 | import akka.actor.SupervisorStrategy.{Stop, Restart} 7 | import org.dbpedia.extraction.dump.download.actors.message._ 8 | import GeneralMessage.ShutdownCluster 9 | import scala.language.postfixOps 10 | import org.dbpedia.extraction.dump.download.actors.Worker.DownloadComplete 11 | import scala.Some 12 | import akka.actor.OneForOneStrategy 13 | import akka.contrib.pattern.ClusterClient.SendToAll 14 | import org.dbpedia.extraction.dump.download.actors.message.DownloadJob 15 | import akka.actor.Terminated 16 | import akka.actor.DeathPactException 17 | 18 | /** 19 | * Worker actor that runs on each worker node. This dispatches a download job to a child DownloadJobRunner actor 20 | * which manages download and a DownloadProgressTracker to send progress reports back to the Worker. 21 | * 22 | * @param clusterClient Akka ClusterClient that acts as a proxy to the master 23 | * @param downloadRunnerProps Props for the downloadRunner actor. 
See Worker.props() 24 | * @param maxDuplicateProgress Maximum number of consecutive duplicate progress read bytes to tolerate 25 | * @param registerInterval The worker registers itself with the master every registerInterval 26 | */ 27 | class Worker(clusterClient: ActorRef, downloadRunnerProps: Props, maxDuplicateProgress: Int, registerInterval: FiniteDuration) 28 | extends Actor with ActorLogging 29 | { 30 | 31 | import MasterWorkerMessage._ 32 | import WorkerProgressMessage._ 33 | import context.dispatcher 34 | 35 | def scheduler = context.system.scheduler 36 | 37 | val workerId = UUID.randomUUID().toString 38 | 39 | // Register to the master at specific intervals. 40 | val registerTask = context.system.scheduler.schedule(0.seconds, registerInterval, clusterClient, 41 | SendToAll("/user/master/active", RegisterWorker(workerId))) 42 | 43 | val downloadRunner = context.watch(context.actorOf(downloadRunnerProps, "runner")) 44 | 45 | var currentDownloadId: Option[String] = None 46 | 47 | private var totalBytes = 0l 48 | private var currentBytes = 0l 49 | private var progressDelays = 0 50 | 51 | def downloadId: String = currentDownloadId match 52 | { 53 | case Some(workId) => workId 54 | case None => throw new IllegalStateException("Not working") 55 | } 56 | 57 | override def supervisorStrategy = 58 | OneForOneStrategy() 59 | { 60 | case _: ActorInitializationException => Stop 61 | case _: DeathPactException => Stop 62 | case _: Exception => 63 | currentDownloadId foreach (workId => sendToMaster(DownloadFailed(workerId, workId))) 64 | context.become(idle) 65 | Restart 66 | } 67 | 68 | override def postStop(): Unit = registerTask.cancel() 69 | 70 | def receive = idle 71 | 72 | def idle: Receive = 73 | { 74 | case ShutdownCluster => // Master sends ShutdownCluster 75 | sendToMaster(RemoveWorker(workerId)) 76 | scheduler.scheduleOnce(5 seconds) 77 | { 78 | registerTask.cancel() 79 | context.stop(downloadRunner) 80 | context.stop(self) 81 | context.system.shutdown() 82 | } 83 | 84 | case DownloadIsReady => // begin 3-way handshake to get download job from master 85 | sendToMaster(WorkerRequestsDownload(workerId)) 86 | 87 | case job @ MirroredDownloadJob(_, DownloadJob(downloadId, _)) => // receive new download job 88 | log.info("Got download job: {}", job) 89 | currentDownloadId = Some(downloadId) 90 | 91 | // reset state variables for new download job 92 | currentBytes = 0 93 | totalBytes = 0 94 | progressDelays = 0 95 | 96 | downloadRunner ! job 97 | context.become(working) 98 | } 99 | 100 | def working: Receive = 101 | { 102 | case p @ ProgressStart(total) => 103 | sendToMaster(ProgressReport(workerId, downloadId, p)) 104 | if(totalBytes == 0) totalBytes = total 105 | 106 | case p @ Progress(bytes) => 107 | sendToMaster(ProgressReport(workerId, downloadId, p)) 108 | 109 | // check if number of bytes downloaded has increased. 110 | if(bytes > currentBytes) 111 | { 112 | currentBytes = bytes 113 | progressDelays = 0 114 | } 115 | else 116 | { 117 | progressDelays += 1 118 | } 119 | 120 | if(progressDelays > maxDuplicateProgress && totalBytes != bytes) // too many progress delays? 121 | { 122 | val delay = progressDelays * downloadRunnerProps.args(0).asInstanceOf[FiniteDuration].toSeconds 123 | log.info(s"Download progress of $currentDownloadId has stagnated. 
No update occurred in $delay seconds!") 124 | sendToMaster(DownloadFailed(workerId, currentDownloadId.get)) 125 | } 126 | 127 | case DownloadComplete(output, bytes) => // DownloadJobRunner sends this upon completion 128 | log.info("Download is complete. Output file: {}. Total bytes: {}", output, bytes) 129 | sendToMaster(DownloadIsDone(workerId, downloadId, output, bytes)) 130 | context.setReceiveTimeout(10.seconds) 131 | context.become(waitForDownloadIsDoneAck(output, bytes)) // Send news of finished download to Master and wait for ACK. 132 | 133 | case ShutdownCluster => 134 | log.info("Yikes. Master told me to shutdown, while I'm downloading.") 135 | 136 | case _: MirroredDownloadJob => 137 | log.info("Yikes. Master gave me a download job, while I'm downloading.") 138 | } 139 | 140 | def waitForDownloadIsDoneAck(outputFilePath: String, bytes: Long): Receive = 141 | { 142 | case Ack(id) if id == downloadId => 143 | sendToMaster(WorkerRequestsDownload(workerId)) 144 | context.setReceiveTimeout(Duration.Undefined) 145 | context.become(idle) 146 | case ReceiveTimeout => 147 | log.info("No ACK from master, retrying") 148 | sendToMaster(DownloadIsDone(workerId, downloadId, outputFilePath, bytes)) 149 | } 150 | 151 | override def unhandled(message: Any): Unit = message match 152 | { 153 | case Terminated(`downloadRunner`) => context.stop(self) 154 | case DownloadIsReady => 155 | case _ => super.unhandled(message) 156 | } 157 | 158 | def sendToMaster(msg: Any): Unit = 159 | { 160 | clusterClient ! SendToAll("/user/master/active", msg) 161 | } 162 | } 163 | 164 | object Worker 165 | { 166 | def props(clusterClient: ActorRef, downloadRunnerProps: Props, maxDuplicateProgress: Int, registerInterval: FiniteDuration = 10.seconds): Props = 167 | Props(classOf[Worker], clusterClient, downloadRunnerProps, maxDuplicateProgress, registerInterval) 168 | 169 | case class DownloadComplete(outputFilePath: String, bytes: Long) 170 | 171 | } -------------------------------------------------------------------------------- /download/src/main/scala/org/dbpedia/extraction/dump/download/actors/message/DownloadJob.scala: -------------------------------------------------------------------------------- 1 | package org.dbpedia.extraction.dump.download.actors.message 2 | 3 | import java.net.URL 4 | import org.dbpedia.extraction.dump.download.actors.message.WorkerProgressMessage.ProgressMessage 5 | 6 | /** 7 | * Download job used by the actor framework. 8 | * 9 | * @param downloadId Unique job ID 10 | * @param file URL information 11 | */ 12 | case class DownloadJob(downloadId: String, file: DumpFile) 13 | 14 | /** 15 | * Download job wrapped along with the mirror to use for downloading. 16 | * This contains all the information needed by DownloadJobRunner to perform the job. 17 | * 18 | * @param baseUrl URL of the mirror to download from 19 | * @param job download job 20 | */ 21 | case class MirroredDownloadJob(baseUrl: URL, job: DownloadJob) 22 | 23 | /** 24 | * Download information for single wiki dump file. 25 | * 26 | * @param baseDir Base directory on Hadoop file system (HDFS for distributed downloads) 27 | * @param wikiSuffix Wiki name suffix (eg. wiki) 28 | * @param language Language wikiCode 29 | * @param date YYYYMMDD date string 30 | * @param fileName URL file name 31 | */ 32 | case class DumpFile(baseDir: String, wikiSuffix: String, language: String, date: String, fileName: String) 33 | 34 | /** 35 | * Download job used by the actor framework. 
36 | * 37 | * @param job MirroredDownloadJob 38 | * @param outputPath Output path name in scheme://path/fileName format 39 | * @param bytes Total bytes downloaded 40 | */ 41 | case class DownloadResult(job: MirroredDownloadJob, outputPath: String, bytes: Long) 42 | 43 | /** 44 | * Progress reports published by Master. 45 | * 46 | * @param downloadId Unique job ID 47 | * @param progress Progress message 48 | */ 49 | case class DownloadProgress(downloadId: String, progress: ProgressMessage) -------------------------------------------------------------------------------- /download/src/main/scala/org/dbpedia/extraction/dump/download/actors/message/DownloaderProgressMessage.scala: -------------------------------------------------------------------------------- 1 | package org.dbpedia.extraction.dump.download.actors.message 2 | 3 | object DownloaderProgressMessage 4 | { 5 | // From Downloader or DownloadJobRunner to DownloadProgressTracker 6 | case class Read(bytesRead: Long) 7 | case class Start(totalBytes: Long) // totalBytes = total content length 8 | case object Stop 9 | 10 | // From DownloadProgressTracker to DownloadJobRunner 11 | case class ProgressEnd(bytes: Long) 12 | } 13 | -------------------------------------------------------------------------------- /download/src/main/scala/org/dbpedia/extraction/dump/download/actors/message/GeneralMessage.scala: -------------------------------------------------------------------------------- 1 | package org.dbpedia.extraction.dump.download.actors.message 2 | 3 | object GeneralMessage 4 | { 5 | // This message is used by different actors to propagate a cluster shutdown. 6 | case object ShutdownCluster 7 | 8 | // This message is published by the master when the pending download queue is empty. 9 | case object MasterQueueEmpty 10 | } 11 | -------------------------------------------------------------------------------- /download/src/main/scala/org/dbpedia/extraction/dump/download/actors/message/MasterWorkerMessage.scala: -------------------------------------------------------------------------------- 1 | package org.dbpedia.extraction.dump.download.actors.message 2 | 3 | import org.dbpedia.extraction.dump.download.actors.message.WorkerProgressMessage.ProgressMessage 4 | 5 | object MasterWorkerMessage 6 | { 7 | // Messages from Workers 8 | case class RegisterWorker(workerId: String) 9 | case class WorkerRequestsDownload(workerId: String) 10 | case class DownloadIsDone(workerId: String, downloadId: String, outputPath: String, bytes: Long) 11 | case class DownloadFailed(workerId: String, downloadId: String) 12 | case class ProgressReport(workerId: String, downloadId: String, progress: ProgressMessage) // progress = number of bytes read till now 13 | case class RemoveWorker(workerId: String) 14 | 15 | // Messages to Workers 16 | case object DownloadIsReady 17 | case class Ack(id: String) 18 | } 19 | -------------------------------------------------------------------------------- /download/src/main/scala/org/dbpedia/extraction/dump/download/actors/message/WorkerProgressMessage.scala: -------------------------------------------------------------------------------- 1 | package org.dbpedia.extraction.dump.download.actors.message 2 | 3 | object WorkerProgressMessage 4 | { 5 | // DownloadProgressTracker to Worker 6 | trait ProgressMessage 7 | case class Progress(bytes: Long) extends ProgressMessage 8 | case class ProgressStart(bytes: Long) extends ProgressMessage 9 | } 10 | -------------------------------------------------------------------------------- 
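
The message classes above define the full master–worker protocol. As a rough, hypothetical illustration (not part of the framework), the sketch below constructs one message of each kind in the order they normally flow between an idle worker and the master; the worker ID, download ID, mirror URL, dump file and byte count are made-up example values, and the message classes are assumed to be on the project classpath.

```scala
// Hypothetical walk-through of the master/worker handshake using the message
// types defined above. All concrete values are invented for illustration.
import java.net.URL
import java.util.UUID

import org.dbpedia.extraction.dump.download.actors.message.{DownloadJob, DumpFile, MirroredDownloadJob}
import org.dbpedia.extraction.dump.download.actors.message.MasterWorkerMessage._

object ProtocolSketch extends App
{
  val workerId = UUID.randomUUID().toString

  // 1. The worker registers itself with the master at regular intervals.
  val register = RegisterWorker(workerId)

  // 2. The master tells idle workers DownloadIsReady; an idle worker replies
  //    by requesting a concrete job (the 3-way handshake in Worker.idle).
  val request = WorkerRequestsDownload(workerId)

  // 3. The master dequeues a pending DownloadJob, pairs it with a free mirror
  //    and sends the wrapped job to the worker.
  val job = DownloadJob(UUID.randomUUID().toString,
    DumpFile("/tmp/basedir", "wiki", "en", "20160407", "pages-articles.xml.bz2"))
  val mirrored = MirroredDownloadJob(new URL("http://dumps.wikimedia.org/"), job)

  // 4. While downloading, the worker keeps sending ProgressReport messages;
  //    on completion it sends DownloadIsDone and waits for the master's Ack.
  val done = DownloadIsDone(workerId, job.downloadId,
    "hdfs://namenode/tmp/basedir/enwiki/20160407/pages-articles.xml.bz2", 1234567L)
  val ack = Ack(job.downloadId)

  Seq(register, request, mirrored, done, ack).foreach(println)
}
```

In the real system these messages are not built in one place: they are exchanged over Akka's ClusterClient proxy (see `sendToMaster` in Worker above), and the master publishes results and progress on the pub-sub topics defined in the Master companion object.
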
/download/src/main/scala/org/dbpedia/extraction/util/RemoteExecute.scala: -------------------------------------------------------------------------------- 1 | package org.dbpedia.extraction.util 2 | 3 | import com.jcraft.jsch.{JSch, JSchException, ChannelExec, Session} 4 | import java.io.IOException 5 | 6 | /** 7 | * Utility trait for creating an SSH session and executing remote commands. 8 | */ 9 | trait RemoteExecute 10 | { 11 | val jsch = new JSch() 12 | 13 | def addIdentity(privateKeyPath: String, passphrase: String) = jsch.addIdentity(privateKeyPath, passphrase) 14 | 15 | def addIdentity(privateKeyPath: String) = jsch.addIdentity(privateKeyPath) 16 | 17 | def createSession(userName: String, host: String): Session = 18 | { 19 | val session = jsch.getSession(userName, host) 20 | session.setConfig("UserKnownHostsFile", "/dev/null") 21 | session.setConfig("CheckHostIP", "no") 22 | session.setConfig("StrictHostKeyChecking", "no") 23 | session.connect() 24 | session 25 | } 26 | 27 | def execute(session: Session, command: String): String = 28 | { 29 | val outputBuffer = new StringBuilder() 30 | 31 | val channel = session.openChannel("exec").asInstanceOf[ChannelExec] 32 | channel.setCommand(command) 33 | channel.connect() 34 | channel.setErrStream(System.err) 35 | 36 | val commandOutput = channel.getInputStream 37 | var readByte = commandOutput.read() 38 | 39 | while (readByte != 0xffffffff) 40 | { 41 | outputBuffer.append(readByte) 42 | readByte = commandOutput.read() 43 | } 44 | 45 | channel.disconnect() 46 | outputBuffer.toString() 47 | } 48 | } 49 | -------------------------------------------------------------------------------- /download/src/test/resources/dist-download.properties: -------------------------------------------------------------------------------- 1 | # NOTE: format is not java.util.Properties, but org.dbpedia.extraction.dump.download.DownloadConfig 2 | 3 | #distconfig=/example/path/file.cfg 4 | # Path to existing distributed download configuration text file (UTF-8) whose lines contain arguments 5 | # in the format given here. Absolute or relative path. File paths in that config file will be interpreted 6 | # relative to the config file. 7 | 8 | #extraction-framework-home=/path/to/distributed-extraction-framework 9 | # This must be set to the absolute path to the distributed extraction framework (containing this module) 10 | # in all nodes. No default value is set. 11 | 12 | mirrors=http://dumps.wikimedia.org/,http://wikipedia.c3sl.ufpr.br/,http://ftp.fi.muni.cz/pub/wikimedia/,http://dumps.wikimedia.your.org/ 13 | # List of mirrors to download from in the form of comma-separated URLs. Choose from the list of mirrors at: 14 | # http://meta.wikimedia.org/wiki/Mirroring_Wikimedia_project_XML_dumps#Current_Mirrors 15 | # Example: mirrors=http://dumps.wikimedia.org/,http://wikipedia.c3sl.ufpr.br,http://ftp.fi.muni.cz/pub/wikimedia/,http://dumps.wikimedia.your.org/ 16 | 17 | threads-per-mirror=2 18 | # Number of simultaneous downloads from each mirror per slave node. Set to 2 by default. 19 | 20 | workers-per-slave=2 21 | # Number of workers to run per slave. This is set to 2 by default. 22 | # Setting it to (no. of mirrors) * threads-per-mirror is recommended for exploiting maximum parallelism. On the other hand, 23 | # if your whole cluster has only one public facing IP it is better to set this to a low number like 1. 
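#
# For illustration only (hypothetical numbers): with the four mirrors listed
# above and threads-per-mirror=2, setting workers-per-slave to
# 4 mirrors * 2 threads-per-mirror = 8 would exploit every download slot
# available to a single slave node.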
24 | 25 | progress-interval=2 26 | # Progress report time interval in secs - the driver node receives real-time progress reports for running downloads from the workers. 27 | # If a worker fails to send a progress report of the current download under the given timeout (the timeout is set to something 28 | # like progressReportInterval + 2 to be safe) the download job will be marked as failed and inserted back into the pending 29 | # download queue. This is 2 seconds by default. 30 | 31 | max-duplicate-progress-reports=30 32 | # Maximum number of consecutive duplicate progress read bytes to tolerate. The workers keep track of download progress; 33 | # if a download gets stuck consecutive progress reports will contain the same number of bytes downloaded. If this is set 34 | # to 30 (not recommended to go below that), the worker will declare a job as failed only after getting the same progress 35 | # report for 30 times. By default set to 30. 36 | 37 | local-temp-dir=/tmp 38 | # Local temporary directory on worker nodes. Each dump file/chunk is downloaded to this directory before being moved to 39 | # the configured Hadoop file system. This is /tmp by default. 40 | 41 | #private-key=/path/to/id_rsa 42 | # Optional identity file to connect to cluster nodes via SSH. 43 | 44 | #ssh-passphrase=passphrase 45 | # Optional passphrase for SSH private key. 46 | 47 | sequential-languages=false 48 | # If each language consists of multiple dump files (eg. enwiki-latest-pages-articles1.xml-p000000010p000010000.bz2) 49 | # they are downloaded in parallel. Multiple languages are downloaded in parallel too, giving us 2 levels of 50 | # parallelism. If sequentialLanguages is set to true, one language is downloaded at a time, otherwise, 51 | # all languages are downloaded in parallel. 52 | 53 | #hadoop-coresite-xml-path=/path/to/core-site.xml 54 | # Path to hadoop core-site.xml configuration file. 55 | 56 | #hadoop-hdfssite-xml-path=/path/to/hdfs-site.xml 57 | # Path to hadoop hdfs-site.xml configuration file. 58 | 59 | #hadoop-mapredsite-xml-path=/path/to/mapred-site.xml 60 | # Path to hadoop mapred-site.xml configuration file. 61 | 62 | master=127.0.0.1 63 | # Master node host. 64 | 65 | slaves=127.0.0.1 66 | # List of comma-separated slave hosts. Example: slaves=node1,node2,node3 67 | 68 | base-dir=/tmp/basedir 69 | # Replace by your target folder. If this is omitted here, it is read from the general configuration file if there is any. 70 | 71 | #join=akka.tcp://Workers@hostname:port 72 | # This variable needs to be specified when starting up a worker manually. Do not use this variable unless you know what you're 73 | # doing. The driver node automatically starts up workers on the slaves and takes care of this variable. Never set this variable 74 | # when starting up the master/driver. -------------------------------------------------------------------------------- /download/src/test/resources/download.properties: -------------------------------------------------------------------------------- 1 | # NOTE: format is not java.util.Properties, but org.dbpedia.extraction.dump.download.DownloadConfig 2 | 3 | # Default download server. It lists mirrors which may be faster. 4 | base-url=http://dumps.wikimedia.org/ 5 | 6 | # Replace by your target folder. 7 | base-dir=/home/gonephishing/dbpedia-extraction/distributed-extraction-framework/dumps/files 8 | 9 | # This setting is recommended for large languages that have part files (eg. en, fr). See below. Replace xx/yy by your language. 
10 | #download=xx,yy:@pages-articles\d+\.xml.*\.bz2 11 | download=en:pages-articles1.xml-p000000010p000010000.bz2 12 | 13 | # This setting should be provided for small languages that have no part files (eg. li) 14 | #download=xx,yy:pages-articles.xml.bz2 15 | 16 | # You may provide multiple "download=" lines for different types of languages, just like above. 17 | 18 | ###### Download part files ###### 19 | # 20 | # Please make sure that the regex actually matches the format used for xx dumps 21 | # by checking http://dumps.wikimedia.org/xxwiki/yyyymmdd 22 | # 23 | # Example: 24 | # enwiki => enwiki-20131120-pages-articles1.xml-p000000010p000010000.bz2 hence @pages-articles\d+\.xml-p\d+p\d+\.bz2 matches 25 | # frwiki => frwiki-20131120-pages-articles1.xml.bz2 hence @pages-articles\d+\.xml\.bz2 matches (the previous regex does not!) 26 | # 27 | # NOTE: @pages-articles\d+\.xml.*\.bz2 is especially recommended when using the distributed downloader because it captures both 28 | # the above types and exploits maximum parallelism by allowing multiple part files to be downloaded and processed simultaneously. 29 | # 30 | # Remember that certain languages have small dumps and therefore no part files at all. They need to be handled with only 31 | # pages-articles.xml.bz2. Example with both small and large languages (setting download multiple times works like appending; so 32 | # adding both download's below is perfectly valid): 33 | # 34 | # download=en,fr:@pages-articles\d+\.xml.*\.bz2 35 | # download=li,bn,ilo:pages-articles.xml.bz2 36 | # 37 | # commonswiki => it does not have part files! This is true for other wikis as well. In this case xx:pages-articles.xml.bz2 38 | # shoud be used (e.g. commons:pages-articles.xml.bz2 or cowiki:pages-articles.xml.bz2) 39 | # 40 | # download=xx:@pages-articles\d+\.xml-p\d+p\d+\.bz2 41 | # download=xx:@pages-articles\d+\.xml.*\.bz2 42 | 43 | # Only needed for the ImageExtractor 44 | # download=commons:pages-articles.xml.bz2 45 | 46 | # Unzip files while downloading? Not necessary, extraction will unzip on the fly. Let's save space. 47 | unzip=false 48 | 49 | # Sometimes connecting to the server fails, so we try five times with pauses of 10 seconds. 
50 | retry-max=5 51 | retry-millis=1000 52 | -------------------------------------------------------------------------------- /extraction/pom.xml: -------------------------------------------------------------------------------- 1 | 2 | 5 | 6 | distributed-extraction 7 | org.dbpedia 8 | 4.1-SNAPSHOT 9 | 10 | 4.0.0 11 | 12 | org.dbpedia.distributed-extraction 13 | extraction 14 | 4.1-SNAPSHOT 15 | DBpedia Distributed Dump Extractor 16 | 17 | 18 | 19 | 20 | 21 | org.apache.maven.plugins 22 | maven-shade-plugin 23 | 1.7 24 | 25 | 26 | package 27 | 28 | shade 29 | 30 | 31 | 32 | 33 | 34 | 35 | net.alchim31.maven 36 | scala-maven-plugin 37 | 38 | 39 | 40 | 41 | seq-extraction 42 | org.dbpedia.extraction.dump.extract.Extraction 43 | 44 | 45 | -server 46 | 58 | 59 | 60 | 61 | 62 | extraction 63 | org.dbpedia.extraction.dump.extract.DistExtraction 64 | 65 | 66 | -server 67 | 79 | 80 | 81 | 82 | 83 | 84 | 85 | 86 | 87 | 88 | 89 | org.dbpedia.distributed-extraction 90 | common 91 | 4.1-SNAPSHOT 92 | 93 | 94 | 95 | org.dbpedia.extraction 96 | core 97 | 4.1 98 | 99 | 100 | 101 | org.dbpedia.extraction 102 | dump 103 | 4.1 104 | 105 | 106 | 107 | org.dbpedia.extraction 108 | scripts 109 | 4.1 110 | 111 | 112 | 113 | org.apache.spark 114 | spark-core_2.11 115 | ${spark.version} 116 | provided 117 | 118 | 119 | 120 | org.apache.hadoop 121 | hadoop-client 122 | ${hadoop.version} 123 | 124 | 125 | 126 | org.apache.hadoop 127 | hadoop-common 128 | ${hadoop.version} 129 | 130 | 131 | 132 | org.scalatest 133 | scalatest_2.11 134 | test 135 | 136 | 137 | 138 | junit 139 | junit 140 | 4.8.2 141 | test 142 | 143 | 144 | 145 | 146 | -------------------------------------------------------------------------------- /extraction/src/main/scala/org/apache/spark/ui/jobs/DBpediaJobProgressListener.scala: -------------------------------------------------------------------------------- 1 | package org.apache.spark.ui.jobs 2 | 3 | import org.apache.spark.{Logging, SparkConf} 4 | import org.apache.spark.scheduler._ 5 | import org.apache.spark.scheduler.SparkListenerTaskEnd 6 | import org.apache.spark.scheduler.SparkListenerJobEnd 7 | import org.apache.spark.scheduler.SparkListenerStageSubmitted 8 | import org.apache.spark.scheduler.SparkListenerStageCompleted 9 | import org.apache.spark.scheduler.SparkListenerTaskStart 10 | import org.apache.spark.scheduler.SparkListenerJobStart 11 | import org.dbpedia.extraction.util.StringUtils 12 | import scala.collection.mutable 13 | 14 | /** 15 | * SparkListener implementation that provides real-time logging for jobs, tasks and stages in a 16 | * friendly way omitting most of the details that can be had using Spark's default logging 17 | * system. 18 | * 19 | * This is in the org.apache.spark.ui.jobs package because it needs to extend 20 | * org.apache.spark.ui.jobs.JobProgressListener which is private[spark]. 21 | */ 22 | class DBpediaJobProgressListener(sc: SparkConf) extends JobProgressListener(sc) with Logging 23 | { 24 | /** 25 | * The time when this class was created (usually along with the SparkContext). 26 | * Milliseconds since midnight, January 1, 1970 UTC. 
27 | */ 28 | val startTime = System.currentTimeMillis() 29 | 30 | val stageNumTasks = mutable.Map[Int, Int]() // Maps stageId to number of tasks 31 | 32 | override def onStageSubmitted(stageSubmitted: SparkListenerStageSubmitted): Unit = 33 | { 34 | super.onStageSubmitted(stageSubmitted) 35 | val stage = stageSubmitted.stageInfo 36 | val numTasks = stage.numTasks 37 | stageNumTasks.synchronized(stageNumTasks(stage.stageId) = numTasks) 38 | val time = prettyTime(stage.submissionTime.getOrElse(startTime)) 39 | logInfo("Stage #%d: Starting stage %s with %d tasks at %s".format(stage.stageId, stage.name, numTasks, time)) 40 | } 41 | 42 | override def onStageCompleted(stageCompleted: SparkListenerStageCompleted): Unit = 43 | { 44 | super.onStageCompleted(stageCompleted) 45 | val stage = stageCompleted.stageInfo 46 | val time = prettyTime(stage.completionTime.getOrElse(startTime)) 47 | logInfo("Stage #%d: Finished stage %s at %s".format(stage.stageId, stage.name, time)) 48 | } 49 | 50 | override def onTaskStart(taskStart: SparkListenerTaskStart): Unit = 51 | { 52 | super.onTaskStart(taskStart) 53 | val executor = taskStart.taskInfo.executorId 54 | val host = taskStart.taskInfo.host 55 | val time = prettyTime(taskStart.taskInfo.launchTime) 56 | val taskId = taskStart.taskInfo.taskId 57 | val stageId = taskStart.taskInfo.taskId 58 | // Get TaskInfos for this stage to compute number of tasks 59 | val numTasks = this.stageIdToInfo.size 60 | //val numTasks = this.stageIdToTaskInfos(stageId).size 61 | logInfo("Stage #%d: Started task #%d on host %s, executor %s at %s. Total tasks submitted: %d".format(stageId, taskId, host, executor, time, numTasks)) 62 | } 63 | 64 | override def onTaskEnd(taskEnd: SparkListenerTaskEnd): Unit = 65 | { 66 | super.onTaskEnd(taskEnd) 67 | val time = prettyTime(taskEnd.taskInfo.finishTime) 68 | val taskId = taskEnd.taskInfo.taskId 69 | val stageId = taskEnd.stageId 70 | val totalNumTasks = stageNumTasks(taskEnd.stageId) 71 | // Get TaskInfos for this stage to compute number of tasks 72 | val numTasks = this.stageIdToInfo.size 73 | //val numTasks = this.stageIdToTaskInfos(stageId).size 74 | // Wrap in try/catch to return 0 if no completed/failed tasks for stageId are found in the maps. 75 | val finished = try { this.numCompletedStages } catch { case ex: NoSuchElementException =>0 } 76 | val failed = try { this.numFailedStages } catch { case ex: NoSuchElementException =>0 } 77 | //val finished = try { this.stageIdToTasksComplete(stageId) } catch { case ex: NoSuchElementException => 0 } 78 | //val failed = try { this.stageIdToTasksFailed(stageId) } catch { case ex: NoSuchElementException => 0 } 79 | logInfo("Stage #%d: Finished task #%d at %s. 
Completed: %d/%d Failed: %d/%d Total Progress: %d/%d".format(stageId, taskId, time, finished, numTasks, failed, numTasks, finished, totalNumTasks)) 80 | } 81 | 82 | override def onJobStart(jobStart: SparkListenerJobStart): Unit = 83 | { 84 | super.onJobStart(jobStart) 85 | logInfo("Started job #" + jobStart.jobId) 86 | } 87 | 88 | override def onJobEnd(jobEnd: SparkListenerJobEnd): Unit = 89 | { 90 | super.onJobEnd(jobEnd) 91 | logInfo("Finished job #" + jobEnd.jobId) 92 | } 93 | 94 | override def onTaskGettingResult(taskGettingResult: SparkListenerTaskGettingResult): Unit = 95 | { 96 | super.onTaskGettingResult(taskGettingResult) 97 | } 98 | 99 | private def prettyTime(time: Long) = StringUtils.prettyMillis(time - startTime) 100 | } -------------------------------------------------------------------------------- /extraction/src/main/scala/org/dbpedia/extraction/destinations/DistDeduplicatingWriterDestination.scala: -------------------------------------------------------------------------------- 1 | package org.dbpedia.extraction.destinations 2 | 3 | import org.apache.hadoop.fs.Path 4 | import org.apache.hadoop.conf.Configuration 5 | import org.apache.spark.rdd.RDD 6 | import org.apache.hadoop.io.Text 7 | import org.dbpedia.extraction.spark.io.QuadSeqWritable 8 | import org.dbpedia.extraction.spark.io.output.DBpediaCompositeOutputFormat 9 | import org.apache.spark.SparkContext._ 10 | 11 | /** 12 | * Destination where RDF graphs are deduplicated and written to a Hadoop Path. 13 | * 14 | * @param path Path used by DBpediaCompositeOutputFormat to write outputs 15 | * @param hadoopConfiguration Hadoop Configuration object 16 | */ 17 | class DistDeduplicatingWriterDestination(path: Path, hadoopConfiguration: Configuration) extends DistDestination 18 | { 19 | override def open() = () 20 | 21 | /** 22 | * Writes RDD of quads (after extracting unique quads) to path using DBpediaCompositeOutputFormat. 23 | * 24 | * @param rdd RDD[ Seq[Quad] ] 25 | */ 26 | override def write(rdd: RDD[Seq[Quad]]) 27 | { 28 | rdd.flatMap 29 | { 30 | quads => 31 | quads.distinct.groupBy(quad => new Text(quad.dataset)).toSeq.map 32 | { 33 | case (key: Text, quads: Seq[Quad]) => (key, new QuadSeqWritable(quads)) 34 | } 35 | }.saveAsNewAPIHadoopFile(path.toString, 36 | classOf[Text], 37 | classOf[QuadSeqWritable], 38 | classOf[DBpediaCompositeOutputFormat], 39 | hadoopConfiguration) 40 | } 41 | 42 | override def close() = () 43 | } 44 | -------------------------------------------------------------------------------- /extraction/src/main/scala/org/dbpedia/extraction/destinations/DistDestination.scala: -------------------------------------------------------------------------------- 1 | package org.dbpedia.extraction.destinations 2 | 3 | import org.apache.spark.rdd.RDD 4 | 5 | /** 6 | * A distributed destination for RDF quads. 7 | */ 8 | trait DistDestination 9 | { 10 | /** 11 | * Opens this destination. This method should only be called once during the lifetime 12 | * of a destination, and it should not be called concurrently with other methods of this class. 13 | */ 14 | def open(): Unit 15 | 16 | /** 17 | * Writes RDD of quads to this destination. 18 | * 19 | * @param rdd RDD[ Seq[Quad] ] 20 | */ 21 | def write(rdd: RDD[Seq[Quad]]): Unit 22 | 23 | /** 24 | * Closes this destination. This method should only be called once during the lifetime 25 | * of a destination, and it should not be called concurrently with other methods of this class. 
26 | */ 27 | def close(): Unit 28 | } 29 | -------------------------------------------------------------------------------- /extraction/src/main/scala/org/dbpedia/extraction/destinations/DistMarkerDestination.scala: -------------------------------------------------------------------------------- 1 | package org.dbpedia.extraction.destinations 2 | 3 | import org.dbpedia.extraction.util.FileLike 4 | import java.io.IOException 5 | 6 | /** 7 | * MakerDestination that wraps a DistDestination. The code has been taken from MakerDestination. 8 | * 9 | * Handles a marker file that signals that the extraction is either running ('start mode') 10 | * or finished ('end mode'). 11 | * 12 | * In 'start mode', the file is created before the extraction starts (it must not already exist) 13 | * and deleted after the extraction ends. 14 | * 15 | * In 'end mode', the file is deleted before the extraction starts (if it already exists) 16 | * and re-created after the extraction ends. 17 | * 18 | * @param file marker file 19 | * @param start 'start mode' if true, 'end mode' if false. 20 | */ 21 | class DistMarkerDestination(destination: DistDestination, file: FileLike[_], start: Boolean) 22 | extends DistWrapperDestination(destination) 23 | { 24 | override def open(): Unit = 25 | { 26 | if (start) create() else delete() 27 | super.open() 28 | } 29 | 30 | override def close(): Unit = 31 | { 32 | super.close() 33 | if (!start) create() else delete() 34 | } 35 | 36 | private def create(): Unit = 37 | { 38 | if (file.exists) throw new IOException("file '" + file + "' already exists") 39 | file.outputStream().close() 40 | } 41 | 42 | private def delete(): Unit = 43 | { 44 | if (file.exists) file.delete() 45 | } 46 | } -------------------------------------------------------------------------------- /extraction/src/main/scala/org/dbpedia/extraction/destinations/DistWrapperDestination.scala: -------------------------------------------------------------------------------- 1 | package org.dbpedia.extraction.destinations 2 | 3 | import org.apache.spark.rdd.RDD 4 | 5 | /** 6 | * Base class for DistDestination objects that forward most calls to another destination. 7 | */ 8 | abstract class DistWrapperDestination(destination: DistDestination) extends DistDestination 9 | { 10 | override def open() = destination.open() 11 | 12 | def write(rdd: RDD[Seq[Quad]]) = destination.write(rdd) 13 | 14 | override def close() = destination.close() 15 | } -------------------------------------------------------------------------------- /extraction/src/main/scala/org/dbpedia/extraction/dump/extract/DistConfig.scala: -------------------------------------------------------------------------------- 1 | package org.dbpedia.extraction.dump.extract 2 | 3 | import java.util.Properties 4 | import scala.collection.JavaConversions.asScalaSet 5 | import org.dbpedia.extraction.util.ConfigUtils.getValue 6 | import java.io.File 7 | import org.apache.spark.storage.StorageLevel 8 | import java.net.URI 9 | import org.apache.log4j.Level 10 | import org.dbpedia.extraction.util.HadoopConfigurable 11 | import org.apache.hadoop.fs.Path 12 | 13 | /** 14 | * Class for distributed configuration. Delegates general stuff except directory/file properties to Config. 15 | * 16 | * Note that dumpDir, ontologyFile and mappingsDir are first checked in distConfigProps; 17 | * if not found they're checked in extractionConfigProps. 
18 | * 19 | * @param distConfigProps Distributed extraction configuration properties 20 | * @param extractionConfigProps General extraction framework configuration properties 21 | * @see Config 22 | */ 23 | class DistConfig(distConfigProps: Properties, extractionConfigProps: Properties, val extractionConfigFile: URI) extends HadoopConfigurable 24 | { 25 | private val extractionConfig = new ExtractionConfig() 26 | 27 | /** It is recommended that spark-home and spark-master are explicitly provided. */ 28 | val sparkHome = distConfigProps.getProperty("spark-home", sys.env.get("SPARK_HOME").getOrElse("")) 29 | 30 | /** By default assume master is runnning locally; use 4 cores */ 31 | val sparkMaster = distConfigProps.getProperty("spark-master", "local[4]") 32 | 33 | /** Shows up on Spark Web UI */ 34 | val sparkAppName = distConfigProps.getProperty("spark-appname", "dbpedia-distributed-extraction-framework") 35 | 36 | /** 37 | * The StorageLevel to be used when calling RDD.persist() unless otherwise specified. Choose any of these: 38 | * MEMORY_ONLY 39 | * MEMORY_AND_DISK 40 | * MEMORY_ONLY_SER 41 | * MEMORY_AND_DISK_SER 42 | * DISK_ONLY 43 | * MEMORY_ONLY_2, MEMORY_AND_DISK_2 etc. 44 | * 45 | * By default it is set to MEMORY_AND_DISK_SER 46 | * 47 | * @see org.apache.spark.storage.StorageLevel 48 | */ 49 | val sparkStorageLevel = Option( 50 | getValue(distConfigProps, "spark-storage-level", required = false) 51 | { 52 | level => StorageLevel.getClass.getDeclaredMethod(level).invoke(StorageLevel).asInstanceOf[StorageLevel] 53 | } 54 | ).getOrElse(StorageLevel.MEMORY_AND_DISK_SER) 55 | 56 | /** Map of optional spark configuration properties. See http://spark.apache.org/docs/latest/configuration.html */ 57 | val sparkProperties = distConfigProps.stringPropertyNames().filter(_.startsWith("spark.")).map(x => (x, distConfigProps.getProperty(x))).toMap 58 | 59 | /** Path to hadoop core-site.xml */ 60 | override protected val hadoopCoreConf = distConfigProps.getProperty("hadoop-coresite-xml-path") 61 | 62 | /** Path to hadoop hdfs-site.xml */ 63 | override protected val hadoopHdfsConf = distConfigProps.getProperty("hadoop-hdfssite-xml-path") 64 | 65 | /** Path to hadoop mapred-site.xml */ 66 | override protected val hadoopMapredConf = distConfigProps.getProperty("hadoop-mapredsite-xml-path") 67 | 68 | /** This is used for setting log levels for "org.apache", "spark", "org.eclipse.jetty" and "akka" using 69 | * SparkUtils.setLogLevels(). It is WARN by default. 70 | */ 71 | val sparkLogLevel = Level.toLevel(distConfigProps.getProperty("logging-level"), Level.WARN) 72 | 73 | /** 74 | * Number of threads to use in the ExecutionContext while calling DistExtractionJob.run() on multiple 75 | * extraction jobs in parallel. 76 | * 77 | * Note that these threads on the driver node do not perform any heavy work except for executing 78 | * DistExtractionJob.run() which submits the respective Spark job to the Spark master and waits 79 | * for the job to finish. 80 | * 81 | * By default it is set to Integer.MAX_VALUE so that all extraction jobs are submitted to Spark master 82 | * simultaneously, which uses the configured scheduling mechanism to execute the jobs on the cluster. 83 | */ 84 | val extractionJobThreads = distConfigProps.getProperty("extraction-job-threads", Integer.MAX_VALUE.toString).toInt 85 | 86 | /** Whether output files should be overwritten or not (true/false). This is true by default. 
*/ 87 | val overwriteOutput = distConfigProps.getProperty("overwrite-output", "true").toBoolean 88 | 89 | /** 90 | * Whether the intermediate RDD[WikiPage] should be cached to Hadoop's filesystem (true/false). 91 | * This is false by default. 92 | * 93 | * Performance implications: 94 | * 1. Caching will make further extractions over the same dump much faster. 95 | * 2. Caching will force early evaluation of the RDD and will cause some delay before extraction. 96 | * 97 | * If you are not planning on repeated extractions over the same dump it is best to leave this as it is. 98 | */ 99 | val cacheWikiPageRDD = distConfigProps.getProperty("cache-wikipages", "false").toBoolean 100 | 101 | /** Dump directory */ 102 | val dumpDir = getPath("base-dir", pathMustExist = true) 103 | 104 | /** Local ontology file, downloaded for speed and reproducibility */ 105 | val ontologyFile = getPath("ontology", pathMustExist = false) 106 | 107 | /** Local mappings files, downloaded for speed and reproducibility */ 108 | val mappingsDir = getPath("mappings", pathMustExist = false) 109 | 110 | val requireComplete = extractionConfig.requireComplete 111 | 112 | val source = extractionConfig.source 113 | 114 | val disambiguations = extractionConfig.disambiguations 115 | 116 | val wikiName = extractionConfig.wikiName 117 | 118 | val parser = extractionConfig.parser 119 | 120 | val formats = extractionConfig.formats 121 | 122 | val extractorClasses = extractionConfig.extractorClasses 123 | 124 | val namespaces = extractionConfig.namespaces 125 | 126 | /** 127 | * Creates a Path from the given property (null if the property is absent) and wraps it in an Option. 128 | * This method first checks the distributed config properties, then the general extraction config properties. 129 | * 130 | * @param property String property key 131 | * @param pathMustExist Boolean to ensure that the Path, if obtained, actually exists. 132 | * @throws RuntimeException if the property is defined but the path does not exist 133 | * @return Option wrapping the obtained Path 134 | */ 135 | def getPath(property: String, pathMustExist: Boolean): Option[Path] = 136 | { 137 | val somePath = Option({ 138 | val distProp = getValue(distConfigProps, property, required = false)(new Path(_)) 139 | if(distProp != null) 140 | { 141 | // If property exists in distributed config file return it. 142 | distProp 143 | } 144 | else 145 | { 146 | // Or else, try the extraction config file - returns either null or a Path. 147 | getValue(extractionConfigProps, property, required = false)(new Path(_)) 148 | } 149 | }) 150 | 151 | checkPathExists(somePath, pathMustExist) 152 | } 153 | 154 | /** 155 | * Custom Config subclass that makes the File-based variables null. 156 | * 157 | * The distributed extraction framework should only work with Paths. Initialization operations on non-existent 158 | * Files may cause errors, and are not required anyway. 
159 | */ 160 | private class ExtractionConfig extends Config(extractionConfigProps) 161 | { 162 | override lazy val dumpDir: File = null 163 | override lazy val ontologyFile: File = null 164 | override lazy val mappingsDir: File = null 165 | } 166 | 167 | } 168 | -------------------------------------------------------------------------------- /extraction/src/main/scala/org/dbpedia/extraction/dump/extract/DistConfigLoader.scala: -------------------------------------------------------------------------------- 1 | package org.dbpedia.extraction.dump.extract 2 | 3 | import org.dbpedia.extraction.destinations._ 4 | import org.dbpedia.extraction.mappings._ 5 | import org.dbpedia.extraction.ontology.io.OntologyReader 6 | import org.dbpedia.extraction.sources.{Source, WikiPage, XMLSource, WikiSource} 7 | import org.dbpedia.extraction.util._ 8 | import org.dbpedia.extraction.util.RichHadoopPath.wrapPath 9 | import org.dbpedia.extraction.wikiparser.Namespace 10 | import java.io._ 11 | import java.net.URL 12 | import java.util.logging.{Level, Logger} 13 | import org.apache.spark.rdd.RDD 14 | import org.dbpedia.extraction.dump.download.Download 15 | import org.apache.hadoop.conf.Configuration 16 | import org.apache.hadoop.io.LongWritable 17 | import org.dbpedia.extraction.spark.io.WikiPageWritable 18 | import org.apache.hadoop.mapreduce.Job 19 | import org.apache.hadoop.mapreduce.lib.input.FileInputFormat 20 | import org.apache.hadoop.fs.Path 21 | import org.apache.spark.SparkContext 22 | import org.dbpedia.extraction.spark.io.input.DBpediaWikiPageInputFormat 23 | 24 | /** 25 | * Loads the dump extraction configuration. 26 | * 27 | * This class configures Spark and sets up the extractors to run using Spark 28 | * 29 | * TODO: get rid of all config file parsers, use Spring 30 | * TODO: Inherit ConfigLoader methods and get rid of redundant code 31 | * 32 | * @param config DistConfig 33 | */ 34 | class DistConfigLoader(config: DistConfig, sparkContext: SparkContext) 35 | { 36 | private val logger = Logger.getLogger(classOf[DistConfigLoader].getName) 37 | private val CONFIG_PROPERTIES = "config.properties" 38 | 39 | /** 40 | * Loads the configuration and creates extraction jobs for all configured languages. 41 | * 42 | * @return Non-strict Traversable over all configured extraction jobs i.e. an extractions job will not be created until it is explicitly requested. 43 | */ 44 | def getExtractionJobs(): Traversable[DistExtractionJob] = 45 | { 46 | // Create a non-strict view of the extraction jobs 47 | // non-strict because we want to create the extraction job when it is needed, not earlier 48 | config.extractorClasses.view.map(e => createExtractionJob(e._1, e._2)) 49 | } 50 | 51 | /** 52 | * Creates an extraction job for a specific language. 53 | */ 54 | private def createExtractionJob(lang: Language, extractorClasses: Seq[Class[_ <: Extractor[_]]]): DistExtractionJob = 55 | { 56 | val dumpDir = config.dumpDir.get 57 | 58 | // Finder[Path] works with Hadoop's FileSystem class - operates on HDFS, or the local file system depending 59 | // upon whether we are running in local mode or distributed/cluster mode. 
60 | val finder = new Finder[Path](dumpDir, lang, config.wikiName) 61 | val date = latestDate(finder) 62 | 63 | // Add input sources 64 | val job = Job.getInstance(hadoopConfiguration) 65 | for (file <- files(config.source, finder, date)) 66 | FileInputFormat.addInputPath(job, file) 67 | hadoopConfiguration = job.getConfiguration // update Configuration 68 | 69 | // Add the extraction configuration file to distributed cache. 70 | // It will be needed in DBpediaCompositeOutputFormat for getting the Formatters. 71 | val configPropertiesDCPath = finder.wikiDir.resolve(CONFIG_PROPERTIES) // Path where to the copy config properties file 72 | val fs = configPropertiesDCPath.getFileSystem(hadoopConfiguration) 73 | fs.copyFromLocalFile(false, true, new Path(config.extractionConfigFile), configPropertiesDCPath) // Copy local file to Hadoop file system 74 | job.addCacheFile(configPropertiesDCPath.toUri) // Add to distributed cache 75 | 76 | // Setup config variables needed by DBpediaWikiPageInputFormat and DBpediaCompositeOutputFormat. 77 | hadoopConfiguration.set("dbpedia.config.properties", configPropertiesDCPath.toString) 78 | hadoopConfiguration.set("dbpedia.wiki.name", config.wikiName) 79 | hadoopConfiguration.set("dbpedia.wiki.language.wikicode", lang.wikiCode) 80 | hadoopConfiguration.set("dbpedia.wiki.date", date) 81 | hadoopConfiguration.setBoolean("dbpedia.output.overwrite", config.overwriteOutput) 82 | 83 | // Getting the WikiPages from local on-disk cache saves processing time. 84 | val cache = finder.file(date, "articles-rdd") 85 | lazy val articlesRDD: RDD[WikiPage] = try 86 | { 87 | if (!cache.exists) 88 | throw new IOException("Cache file " + cache.getSchemeWithFileName + " does not exist.") 89 | logger.info("Loading articles from cache file " + cache.getSchemeWithFileName) 90 | val loaded = DistIOUtils.loadRDD(sparkContext, classOf[WikiPage], cache) 91 | logger.info("WikiPages loaded from cache file " + cache.getSchemeWithFileName) 92 | loaded 93 | } 94 | catch 95 | { 96 | case ex: Exception => 97 | { 98 | logger.log(Level.INFO, "Will read from wiki dump file for " + lang.wikiCode + " wiki, could not load cache file '" + cache.getSchemeWithFileName + "': " + ex) 99 | 100 | // Create RDD with WikiPageWritable elements. 101 | val rawArticlesRDD: RDD[(LongWritable, WikiPageWritable)] = 102 | sparkContext.newAPIHadoopRDD(hadoopConfiguration, classOf[DBpediaWikiPageInputFormat], classOf[LongWritable], classOf[WikiPageWritable]) 103 | 104 | // Unwrap WikiPages and filter unnecessary pages 105 | val newRdd = rawArticlesRDD.map(_._2.get).filter 106 | { 107 | page => 108 | page.title.namespace == Namespace.Main || 109 | page.title.namespace == Namespace.File || 110 | page.title.namespace == Namespace.Category || 111 | page.title.namespace == Namespace.Template 112 | }.persist(config.sparkStorageLevel) 113 | 114 | if (config.cacheWikiPageRDD) 115 | { 116 | DistIOUtils.saveRDD(newRdd, cache) 117 | logger.info("Parsed WikiPages written to cache file " + cache.getSchemeWithFileName) 118 | } 119 | 120 | newRdd 121 | } 122 | } 123 | 124 | val _ontology = 125 | { 126 | val ontologySource = config.ontologyFile match 127 | { 128 | case Some(ontologyFile) if ontologyFile.isFile => 129 | // Is ontologyFile defined and it is indeed a file? 
130 | XMLSource.fromReader(reader(ontologyFile), Language.Mappings) 131 | case _ => 132 | val namespaces = Set(Namespace.OntologyClass, Namespace.OntologyProperty) 133 | val url = new URL(Language.Mappings.apiUri) 134 | val language = Language.Mappings 135 | WikiSource.fromNamespaces(namespaces, url, language) 136 | } 137 | 138 | new OntologyReader().read(ontologySource) 139 | } 140 | 141 | val _commonsSource = 142 | { 143 | try 144 | { 145 | val finder = new Finder[Path](config.dumpDir.get, Language("commons"), config.wikiName) 146 | val date = latestDate(finder) 147 | XMLSource.fromReaders(readers(config.source, finder, date), Language.Commons, _.namespace == Namespace.File) 148 | } 149 | catch 150 | { 151 | case ex: Exception => 152 | logger.info("Could not load commons source - error: " + ex.getMessage) 153 | null 154 | } 155 | } 156 | 157 | val _disambiguations = 158 | { 159 | val cache = finder.file(date, "disambiguations-ids.obj") 160 | try 161 | { 162 | DistDisambiguations.load(reader(finder.file(date, config.disambiguations)), cache, lang) 163 | } catch 164 | { 165 | case ex: Exception => 166 | logger.info("Could not load disambiguations - error: " + ex.getMessage) 167 | Disambiguations.empty() 168 | } 169 | } 170 | 171 | val redirectsCache = finder.file(date, "template-redirects.obj") 172 | lazy val _redirects = DistRedirects.load(articlesRDD, redirectsCache, lang) // lazy because it will be evaluated in DistExtractionJob.run() 173 | 174 | lazy val context = new DumpExtractionContext 175 | { 176 | def ontology = _ontology 177 | 178 | def commonsSource = _commonsSource 179 | 180 | def language = lang 181 | 182 | private lazy val _mappingPageSource = 183 | { 184 | val namespace = Namespace.mappings(language) 185 | 186 | config.mappingsDir match 187 | { 188 | case Some(mappingsDir) if mappingsDir.isDirectory => 189 | // Is mappingsDir defined and it is indeed a directory? 190 | val path = new Path(mappingsDir, namespace.name(Language.Mappings).replace(' ', '_') + ".xml") 191 | XMLSource.fromReader(reader(path), Language.Mappings) 192 | case _ => 193 | val namespaces = Set(namespace) 194 | val url = new URL(Language.Mappings.apiUri) 195 | WikiSource.fromNamespaces(namespaces, url, Language.Mappings) 196 | } 197 | } 198 | 199 | def mappingPageSource: Traversable[WikiPage] = _mappingPageSource 200 | 201 | private lazy val _mappings = 202 | { 203 | MappingsLoader.load(this) 204 | } 205 | 206 | def mappings: Mappings = _mappings 207 | 208 | def articlesSource: Source = null // Not needing raw article source 209 | 210 | def redirects: Redirects = _redirects 211 | 212 | def disambiguations: Disambiguations = if (_disambiguations != null) _disambiguations else new Disambiguations(Set[Long]()) 213 | } 214 | 215 | // Extractors - this is lazily evaluated in DistExtractionJob.run() so that the distributed redirect extraction happens inside run() 216 | // NOTE: All subsequent references to this val need to be lazy! 217 | lazy val extractor = 218 | { 219 | val _redirects = context.redirects // Trigger evaluation of lazy redirects and load the updated context into extractors. 220 | val updatedContext = new DumpExtractionContextWrapper(context) 221 | { 222 | override def redirects: Redirects = _redirects 223 | } 224 | CompositeParseExtractor.load(extractorClasses, updatedContext) 225 | } 226 | 227 | lazy val destination = 228 | { 229 | // Create empty directories for all datasets. 
This is not strictly necessary because Hadoop would create the directories 230 | // it needs to by itself, though in that case the directories for unused datasets will obviously be absent. 231 | val datasets = extractor.datasets 232 | val outputPath = finder.directory(date) 233 | 234 | for ((suffix, format) <- config.formats; dataset <- datasets) 235 | { 236 | new Path(outputPath, s"${finder.wikiName}-$date-${dataset.name.replace('_', '-')}.$suffix").mkdirs() 237 | } 238 | new DistMarkerDestination(new DistDeduplicatingWriterDestination(outputPath, hadoopConfiguration), finder.file(date, Extraction.Complete), false) 239 | } 240 | 241 | lazy val description = 242 | { 243 | val datasets = extractor.datasets 244 | lang.wikiCode + ": " + extractorClasses.size + " extractors (" + extractorClasses.map(_.getSimpleName).mkString(",") + "), " + datasets.size + " datasets (" + datasets.mkString(",") + ")" 245 | } 246 | 247 | new DistExtractionJob(new RootExtractor(extractor), articlesRDD, config.namespaces, destination, lang.wikiCode, description) 248 | } 249 | 250 | implicit var hadoopConfiguration: Configuration = config.hadoopConf 251 | 252 | private def writer[T <% FileLike[_]](file: T): () => Writer = 253 | { 254 | () => IOUtils.writer(file) 255 | } 256 | 257 | private def reader[T <% FileLike[_]](file: T): () => Reader = 258 | { 259 | () => IOUtils.reader(file) 260 | } 261 | 262 | private def readers[T <% FileLike[_]](source: String, finder: Finder[T], date: String): List[() => Reader] = 263 | { 264 | files(source, finder, date).map(reader(_)) 265 | } 266 | 267 | private def files[T <% FileLike[_]](source: String, finder: Finder[T], date: String): List[T] = 268 | { 269 | 270 | val files = if (source.startsWith("@")) 271 | { 272 | // the articles source is a regex - we want to match multiple files 273 | finder.matchFiles(date, source.substring(1)) 274 | } else List(finder.file(date, source)) 275 | 276 | logger.info(s"Source is ${source} - ${files.size} file(s) matched") 277 | 278 | files 279 | } 280 | 281 | private def latestDate(finder: Finder[_]): String = 282 | { 283 | val isSourceRegex = config.source.startsWith("@") 284 | val source = if (isSourceRegex) config.source.substring(1) else config.source 285 | val fileName = if (config.requireComplete) Download.Complete else source 286 | finder.dates(fileName, isSuffixRegex = isSourceRegex).last 287 | } 288 | } -------------------------------------------------------------------------------- /extraction/src/main/scala/org/dbpedia/extraction/dump/extract/DistExtraction.scala: -------------------------------------------------------------------------------- 1 | package org.dbpedia.extraction.dump.extract 2 | 3 | import org.dbpedia.extraction.util.{SparkUtils, ProxyAuthenticator, ConfigUtils} 4 | import java.net.Authenticator 5 | import scala.concurrent.{ExecutionContext, Await, Future, future} 6 | import scala.concurrent.duration.Duration 7 | import java.io.File 8 | import java.util.concurrent.Executors 9 | 10 | /** 11 | * Dump extraction script. 
12 | */ 13 | object DistExtraction 14 | { 15 | 16 | val Started = "extraction-started" 17 | 18 | val Complete = "extraction-complete" 19 | 20 | def main(args: Array[String]): Unit = 21 | { 22 | require(args != null && args.length >= 2 && args(0).nonEmpty && args(1).nonEmpty, "missing required arguments: ") 23 | Authenticator.setDefault(new ProxyAuthenticator()) 24 | 25 | // Load properties 26 | val extractionConfigProps = ConfigUtils.loadConfig(args(0), "UTF-8") 27 | val distConfigProps = ConfigUtils.loadConfig(args(1), "UTF-8") 28 | val distConfig = new DistConfig(distConfigProps, extractionConfigProps, new File(args(0)).toURI) 29 | 30 | // overwrite properties with CLI args 31 | // TODO arguments could be of the format a=b and then property a can be overwritten with "b" 32 | 33 | // Create SparkContext 34 | SparkUtils.setSparkLogLevels(distConfig) 35 | val sparkContext = SparkUtils.getSparkContext(distConfig) 36 | 37 | // Load extraction jobs from configuration 38 | val jobs = new DistConfigLoader(distConfig, sparkContext).getExtractionJobs() 39 | 40 | val executor = Executors.newFixedThreadPool(distConfig.extractionJobThreads) 41 | implicit val ec = ExecutionContext.fromExecutor(executor) 42 | val futures = for (job <- jobs) yield future 43 | { 44 | job.run() 45 | } 46 | Await.result(Future.sequence(futures), Duration.Inf) 47 | 48 | sparkContext.stop() 49 | executor.shutdown() 50 | } 51 | } 52 | -------------------------------------------------------------------------------- /extraction/src/main/scala/org/dbpedia/extraction/dump/extract/DistExtractionJob.scala: -------------------------------------------------------------------------------- 1 | package org.dbpedia.extraction.dump.extract 2 | 3 | import java.util.logging.{Level, Logger} 4 | import org.dbpedia.extraction.destinations.{Quad, DistDestination} 5 | import org.dbpedia.extraction.mappings.RootExtractor 6 | import org.dbpedia.extraction.sources.WikiPage 7 | import org.dbpedia.extraction.spark.serialize.KryoSerializationWrapper 8 | import org.dbpedia.extraction.wikiparser.Namespace 9 | import org.apache.spark.rdd.RDD 10 | import org.dbpedia.extraction.util.StringUtils 11 | import org.apache.spark.SparkContext._ 12 | import org.dbpedia.util.Exceptions 13 | 14 | /** 15 | * Executes an extraction using Spark. 16 | * 17 | * @param extractor The Extractor 18 | * @param rdd The RDD of WikiPages 19 | * @param namespaces Only extract pages in these namespaces 20 | * @param destination The extraction destination. Will be closed after the extraction has been finished. 21 | * @param label user readable label of this extraction job. 22 | */ 23 | class DistExtractionJob(extractor: => RootExtractor, rdd: => RDD[WikiPage], namespaces: Set[Namespace], destination: => DistDestination, label: String, description: => String) 24 | { 25 | private val logger = Logger.getLogger(getClass.getName) 26 | 27 | def run(): Unit = 28 | { 29 | val sc = rdd.sparkContext 30 | val allPages = sc.accumulator(0) 31 | val failedPages = sc.accumulator(0) 32 | 33 | val loggerBC = sc.broadcast(logger) 34 | val extractorBC = sc.broadcast(KryoSerializationWrapper(extractor)) 35 | val namespacesBC = sc.broadcast(namespaces) 36 | 37 | val startTime = System.currentTimeMillis 38 | 39 | val results: RDD[Seq[Quad]] = 40 | rdd.map 41 | { 42 | page => 43 | // Take a WikiPage, perform the extraction with a set of extractors and return the results as a Seq[Quad]. 
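// Extraction failures are caught per page below, so one bad page does not
// fail the whole Spark task; the broadcast extractor is applied only to pages
// in the configured namespaces, and the two accumulators count successful and
// failed pages across the cluster.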
44 | val (success, graph) = try 45 | { 46 | (true, if (namespacesBC.value.contains(page.title.namespace)) Some(extractorBC.value.value.apply(page)) else None) 47 | } 48 | catch 49 | { 50 | case ex: Exception => 51 | loggerBC.value.log(Level.WARNING, "error processing page '" + page.title + "': " + Exceptions.toString(ex, 200)) 52 | (false, None) 53 | } 54 | 55 | if (success) allPages += 1 else failedPages += 1 56 | 57 | graph.getOrElse(Nil) 58 | } 59 | 60 | logger.info(description+" started") 61 | 62 | destination.open() 63 | 64 | logger.info("Writing outputs to destination...") 65 | 66 | destination.write(results) 67 | 68 | destination.close() 69 | 70 | val time = System.currentTimeMillis - startTime 71 | println("%s: extracted %d pages in %s (per page: %f ms; failed pages: %d).".format(label, 72 | allPages.value, 73 | StringUtils.prettyMillis(time), 74 | time.toDouble / allPages.value, 75 | failedPages.value)) 76 | 77 | logger.info(description+" finished") 78 | } 79 | } 80 | -------------------------------------------------------------------------------- /extraction/src/main/scala/org/dbpedia/extraction/dump/extract/DumpExtractionContextWrapper.scala: -------------------------------------------------------------------------------- 1 | package org.dbpedia.extraction.dump.extract 2 | 3 | import org.dbpedia.extraction.ontology.Ontology 4 | import org.dbpedia.extraction.sources.{WikiPage, Source} 5 | import org.dbpedia.extraction.util.Language 6 | import org.dbpedia.extraction.mappings.{Disambiguations, Redirects, Mappings} 7 | 8 | /** 9 | * A simple wrapper for a DumpExtractionContext object 10 | * 11 | * @param context 12 | */ 13 | class DumpExtractionContextWrapper(context: DumpExtractionContext) extends DumpExtractionContext 14 | { 15 | override def ontology: Ontology = context.ontology 16 | 17 | override def commonsSource: Source = context.commonsSource 18 | 19 | override def language: Language = context.language 20 | 21 | override def mappingPageSource: Traversable[WikiPage] = context.mappingPageSource 22 | 23 | override def mappings: Mappings = context.mappings 24 | 25 | override def articlesSource: Source = context.articlesSource 26 | 27 | override def redirects: Redirects = context.redirects 28 | 29 | override def disambiguations: Disambiguations = context.disambiguations 30 | } 31 | -------------------------------------------------------------------------------- /extraction/src/main/scala/org/dbpedia/extraction/mappings/DistDisambiguations.scala: -------------------------------------------------------------------------------- 1 | package org.dbpedia.extraction.mappings 2 | 3 | import java.util.logging.{Level, Logger} 4 | import java.io._ 5 | import org.apache.hadoop.fs.Path 6 | import org.dbpedia.extraction.util.RichHadoopPath.wrapPath 7 | import org.dbpedia.extraction.util.{DistIOUtils, Language} 8 | import org.apache.hadoop.conf.Configuration 9 | import com.esotericsoftware.kryo.io.{Input, Output} 10 | 11 | /** 12 | * A version of Disambiguations that works with org.apache.hadoop.fs.Path. 13 | * 14 | * @see Disambiguations 15 | */ 16 | class DistDisambiguations(override val set : Set[Long]) extends Disambiguations(set) 17 | 18 | object DistDisambiguations 19 | { 20 | private val logger = Logger.getLogger(classOf[DistDisambiguations].getName) 21 | 22 | /** 23 | * Loads disambiguations from cache/source reader. 
24 | * 25 | * @param reader Reader to load disambiguations from 26 | * @param cache Path to cache file 27 | * @param lang Language 28 | * @param hadoopConf Configuration 29 | * @return Disambiguations object 30 | */ 31 | def load(reader : () => Reader, cache : Path, lang : Language)(implicit hadoopConf: Configuration) : Disambiguations = 32 | { 33 | try 34 | { 35 | return loadFromCache(cache) 36 | } 37 | catch 38 | { 39 | case ex : Exception => logger.log(Level.INFO, "Will extract disambiguations from source for "+lang.wikiCode+" wiki, could not load cache file '"+cache.getSchemeWithFileName+"': "+ex) 40 | } 41 | 42 | val disambiguations = Disambiguations.loadFromFile(reader, lang) 43 | 44 | val dir = cache.getParent 45 | if (!dir.exists && !dir.mkdirs()) throw new IOException("cache dir [" + dir.getSchemeWithFileName + "] does not exist and cannot be created") 46 | val output = new Output(new BufferedOutputStream(cache.outputStream())) 47 | 48 | try 49 | { 50 | DistIOUtils.getKryoInstance.writeClassAndObject(output, disambiguations.set) 51 | logger.info(disambiguations.set.size + " disambiguations written to cache file " + cache.getSchemeWithFileName) 52 | disambiguations 53 | } 54 | finally 55 | { 56 | output.close() 57 | } 58 | } 59 | 60 | /** 61 | * Loads the disambiguations from a cache file. 62 | */ 63 | private def loadFromCache(cache : Path)(implicit hadoopConf: Configuration) : Disambiguations = 64 | { 65 | logger.info("Loading disambiguations from cache file " + cache.getSchemeWithFileName) 66 | val input = new Input(new BufferedInputStream(cache.inputStream())) 67 | try 68 | { 69 | val disambiguations = new Disambiguations(DistIOUtils.getKryoInstance.readClassAndObject(input).asInstanceOf[Set[Long]]) 70 | logger.info(disambiguations.set.size + " disambiguations loaded from cache file " + cache.getSchemeWithFileName) 71 | disambiguations 72 | } 73 | finally 74 | { 75 | input.close() 76 | } 77 | } 78 | } 79 | -------------------------------------------------------------------------------- /extraction/src/main/scala/org/dbpedia/extraction/mappings/DistRedirects.scala: -------------------------------------------------------------------------------- 1 | package org.dbpedia.extraction.mappings 2 | 3 | import java.util.logging.{Level, Logger} 4 | import org.dbpedia.extraction.sources.WikiPage 5 | import java.io._ 6 | import org.dbpedia.extraction.wikiparser._ 7 | import org.dbpedia.extraction.util.{DistIOUtils, Language} 8 | import org.dbpedia.extraction.wikiparser.impl.wikipedia.Redirect 9 | import org.apache.spark.rdd.RDD 10 | import com.esotericsoftware.kryo.io.{Input, Output} 11 | import org.dbpedia.extraction.util.RichHadoopPath.wrapPath 12 | import org.apache.hadoop.fs.Path 13 | import org.apache.hadoop.conf.Configuration 14 | import org.apache.spark.SparkContext._ 15 | 16 | /** 17 | * Distributed version of Redirects; uses Spark to compute redirects. 18 | * 19 | * Holds the redirects between wiki pages 20 | * At the moment, only redirects between Templates are considered 21 | * 22 | * @param map Redirect map. Contains decoded template titles. 23 | * 24 | * @see Redirects 25 | */ 26 | class DistRedirects(override val map: Map[String, String]) extends Redirects(map) 27 | 28 | /** 29 | * Loads redirects from a cache file or source of Wiki pages. 
30 | * At the moment, only redirects between Templates are considered 31 | */ 32 | object DistRedirects 33 | { 34 | private val logger = Logger.getLogger(classOf[DistRedirects].getName) 35 | 36 | /** 37 | * Tries to load the redirects from a cache file. 38 | * If not successful, loads the redirects from an RDD. 39 | * Updates the cache after loading the redirects from the source. 40 | * 41 | * @param rdd RDD of WikiPages 42 | * @param cache Path to cache file 43 | * @param lang Language 44 | * @param hadoopConf Configuration 45 | * @return Redirects object 46 | */ 47 | def load(rdd: RDD[WikiPage], cache: Path, lang: Language)(implicit hadoopConf: Configuration): Redirects = 48 | { 49 | //Try to load redirects from the cache 50 | try 51 | { 52 | return loadFromCache(cache) 53 | } 54 | catch 55 | { 56 | case ex: Exception => logger.log(Level.INFO, "Will extract redirects from source for " + lang.wikiCode + " wiki, could not load cache file '" + cache.getSchemeWithFileName + "': " + ex) 57 | } 58 | 59 | //Load redirects from RDD 60 | val redirects = loadFromRDD(rdd, lang) 61 | 62 | val dir = cache.getParent 63 | if (!dir.exists && !dir.mkdirs()) throw new IOException("cache dir [" + dir.getSchemeWithFileName + "] does not exist and cannot be created") 64 | val output = new Output(new BufferedOutputStream(cache.outputStream())) 65 | try 66 | { 67 | DistIOUtils.getKryoInstance.writeClassAndObject(output, redirects.map) 68 | logger.info(redirects.map.size + " redirects written to cache file " + cache.getSchemeWithFileName) 69 | redirects 70 | } 71 | finally 72 | { 73 | output.close() 74 | } 75 | } 76 | 77 | /** 78 | * Loads the redirects from a cache file. 79 | */ 80 | private def loadFromCache(cache: Path)(implicit hadoopConf: Configuration): Redirects = 81 | { 82 | logger.info("Loading redirects from cache file " + cache.getSchemeWithFileName) 83 | val input = new Input(new BufferedInputStream(cache.inputStream())) 84 | try 85 | { 86 | val redirects = new Redirects(DistIOUtils.getKryoInstance.readClassAndObject(input).asInstanceOf[Map[String, String]]) 87 | logger.info(redirects.map.size + " redirects loaded from cache file " + cache.getSchemeWithFileName) 88 | redirects 89 | } 90 | finally 91 | { 92 | input.close() 93 | } 94 | } 95 | 96 | /** 97 | * Loads the redirects from a source. 98 | * 99 | * @param rdd RDD of WikiPages 100 | * @param lang Language 101 | * @return Redirects object 102 | */ 103 | def loadFromRDD(rdd: RDD[WikiPage], lang: Language): Redirects = 104 | { 105 | logger.info("Loading redirects from source (" + lang.wikiCode + ")") 106 | 107 | val regexBC = rdd.sparkContext.broadcast(buildRegex(lang)) 108 | 109 | // Wrap the map function inside a KryoSerializationWrapper 110 | // val mapper = SparkUtils.kryoWrapFunction(new RedirectFinder(langBC)) 111 | // val redirects = new Redirects(rdd.flatMap(mapper).collectAsMap().toMap) 112 | 113 | val redirectsRDD = rdd.flatMap 114 | { 115 | case page: WikiPage => 116 | val regex = regexBC.value 117 | 118 | val destinationTitle = page.source match 119 | { 120 | case regex(destination) => 121 | try 122 | { 123 | WikiTitle.parse(destination, page.title.language) 124 | } 125 | catch 126 | { 127 | case ex: WikiParserException => 128 | Logger.getLogger(Redirects.getClass.getName).log(Level.WARNING, "Couldn't parse redirect destination", ex) 129 | null 130 | } 131 | case _ => null 132 | } 133 | 134 | if (destinationTitle != page.redirect) 135 | { 136 | Logger.getLogger(Redirects.getClass.getName).log(Level.WARNING, "wrong redirect. 
page: [" + page.title + "].\nfound by dbpedia: [" + destinationTitle + "].\nfound by wikipedia: [" + page.redirect + "]") 137 | } 138 | 139 | if (destinationTitle != null && page.title.namespace == Namespace.Template && destinationTitle.namespace == Namespace.Template) 140 | { 141 | List((page.title.decoded, destinationTitle.decoded)) 142 | } 143 | else 144 | { 145 | Nil 146 | } 147 | } 148 | 149 | val redirects = new Redirects(redirectsRDD.collectAsMap().toMap) 150 | 151 | logger.info("Redirects loaded from source (" + lang.wikiCode + ")") 152 | redirects 153 | } 154 | 155 | private def buildRegex(lang: Language) = 156 | { 157 | val redirects = Redirect(lang).mkString("|") 158 | // (?ius) enables CASE_INSENSITIVE UNICODE_CASE DOTALL 159 | // case insensitive and unicode are important - that's what mediawiki does. 160 | // Note: Although we do not specify a Locale, UNICODE_CASE does mostly the right thing. 161 | // DOTALL means that '.' also matches line terminators. 162 | // Reminder: (?:...) are non-capturing groups, '*?' is a reluctant qualifier. 163 | // (?:#[^\n]*?)? is an optional (the last '?') non-capturing group meaning: there may 164 | // be a '#' after which everything but line breaks is allowed ('[]{}|<>' are not allowed 165 | // before the '#'). The match is reluctant ('*?'), which means that we recognize ']]' 166 | // as early as possible. 167 | // (?:\|[^\n]*?)? is another optional non-capturing group that reluctantly consumes 168 | // a '|' character and everything but line breaks after it. 169 | ("""(?ius)\s*(?:""" + redirects + """)\s*:?\s*\[\[([^\[\]{}|<>\n]+(?:#[^\n]*?)?)(?:\|[^\n]*?)?\]\].*""").r 170 | } 171 | } 172 | 173 | -------------------------------------------------------------------------------- /extraction/src/main/scala/org/dbpedia/extraction/spark/io/QuadSeqWritable.scala: -------------------------------------------------------------------------------- 1 | package org.dbpedia.extraction.spark.io 2 | 3 | import org.dbpedia.extraction.destinations.Quad 4 | import org.apache.hadoop.io.Writable 5 | import org.dbpedia.extraction.util.DistIOUtils 6 | import java.io.{DataOutput, ByteArrayOutputStream, DataInput} 7 | import com.esotericsoftware.kryo.io.{Input, Output} 8 | 9 | /** 10 | * Writable wrapping Seq[Quad] - used by custom OutputFormat 11 | */ 12 | class QuadSeqWritable(quads: Seq[Quad]) extends Writable 13 | { 14 | var _quads = quads 15 | 16 | def this() = this(null) 17 | 18 | def set(quads: Seq[Quad]) 19 | { 20 | _quads = quads 21 | } 22 | 23 | def get = _quads 24 | 25 | override def write(output: DataOutput) 26 | { 27 | val out = new ByteArrayOutputStream() 28 | val o = new Output(out) 29 | DistIOUtils.getKryoInstance.writeClassAndObject(o, get) 30 | o.close() 31 | val bytes = out.toByteArray 32 | output.writeInt(bytes.size) 33 | output.write(bytes) 34 | } 35 | 36 | override def readFields(input: DataInput) 37 | { 38 | val size = input.readInt() 39 | val bytes = new Array[Byte](size) 40 | input.readFully(bytes) 41 | val i = new Input() 42 | i.setBuffer(bytes) 43 | set(DistIOUtils.getKryoInstance.readClassAndObject(i).asInstanceOf[Seq[Quad]]) 44 | i.close() 45 | } 46 | } 47 | -------------------------------------------------------------------------------- /extraction/src/main/scala/org/dbpedia/extraction/spark/io/WikiPageWritable.scala: -------------------------------------------------------------------------------- 1 | package org.dbpedia.extraction.spark.io 2 | 3 | import org.apache.hadoop.io.Writable 4 | import java.io.{ByteArrayOutputStream, 
DataOutput, DataInput} 5 | import org.dbpedia.extraction.sources.WikiPage 6 | import com.esotericsoftware.kryo.io.{Input, Output} 7 | import org.dbpedia.extraction.spark.serialize.WikiPageSerializer 8 | import org.dbpedia.extraction.util.DistIOUtils 9 | 10 | /** 11 | * DBpediaWikiPageInputFormat emits values of type WikiPageWritable. This class holds a single WikiPage instance. 12 | * @see DBpediaWikiPageInputFormat 13 | */ 14 | class WikiPageWritable(wikiPage: WikiPage) extends Writable 15 | { 16 | var _wikiPage = wikiPage 17 | 18 | def this() = this(null) 19 | 20 | def set(wikiPage: WikiPage) 21 | { 22 | _wikiPage = wikiPage 23 | } 24 | 25 | def get = _wikiPage 26 | 27 | val wps = new WikiPageSerializer 28 | 29 | override def write(output: DataOutput) 30 | { 31 | val out = new ByteArrayOutputStream() 32 | val o = new Output(out) 33 | wps.write(DistIOUtils.getKryoInstance, o, get) 34 | o.close() 35 | val bytes = out.toByteArray 36 | output.writeInt(bytes.size) 37 | output.write(bytes) 38 | } 39 | 40 | override def readFields(input: DataInput) 41 | { 42 | val size = input.readInt() 43 | val bytes = new Array[Byte](size) 44 | input.readFully(bytes) 45 | val i = new Input() 46 | i.setBuffer(bytes) 47 | set(wps.read(DistIOUtils.getKryoInstance, i, classOf[WikiPage])) 48 | i.close() 49 | } 50 | } -------------------------------------------------------------------------------- /extraction/src/main/scala/org/dbpedia/extraction/spark/io/input/ByteMatcher.scala: -------------------------------------------------------------------------------- 1 | package org.dbpedia.extraction.spark.io.input 2 | 3 | import java.io.InputStream 4 | import org.apache.hadoop.fs.Seekable 5 | import org.apache.hadoop.io.DataOutputBuffer 6 | import scala.annotation.tailrec 7 | 8 | /** 9 | * A class that operates mainly on SeekableInputStreams, iteratively reading chunks of data from an InputStream 10 | * depending upon a match pattern, through the method readUntilMatch(). 11 | * 12 | * @param in InputStream to read binary data from 13 | * @param seeker Seekable for the InputStream "in" - used for keeping track of position in the InputStream 14 | */ 15 | class ByteMatcher(in: InputStream, seeker: Seekable) 16 | { 17 | private var bytesRead: Long = 0 18 | private var lastMatchedPos: Long = -1 19 | private var currentPos: Long = -1 20 | 21 | def this(is: SeekableInputStream) = this(is, is) 22 | 23 | /** 24 | * @return number of bytes read 25 | */ 26 | def getReadBytes: Long = bytesRead 27 | 28 | /** 29 | * @return current position in seeker 30 | */ 31 | def getPos: Long = seeker.getPos 32 | 33 | /** 34 | * @return last position when a match was found 35 | */ 36 | def getLastMatchedPos: Long = lastMatchedPos 37 | 38 | /** 39 | * @param len number of bytes to skip 40 | */ 41 | def skip(len: Long) 42 | { 43 | in.skip(len) 44 | bytesRead += len 45 | } 46 | 47 | /** 48 | * Reads the InputStream until a match is found or "end" number of bytes is reached. 49 | * 50 | * @param textPattern String to match against 51 | * @param end number of bytes to read till - checked against seeker 52 | * @return Boolean true if a match was found, false if EOF was found or stopping point "end" was crossed. 53 | */ 54 | def readUntilMatch(textPattern: String, end: Long): Boolean = 55 | { 56 | readUntilMatch(textPattern.getBytes("UTF-8"), 0, end) 57 | } 58 | 59 | /** 60 | * Reads the InputStream while writing to a buffer, until a match is found or "end" number of bytes is reached. 
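// ----------------------------------------------------------------------------
// [Editor's note: illustrative sketch, not part of the original sources.]
// WikiPageWritable and QuadSeqWritable above follow the standard Hadoop
// Writable contract: write(DataOutput) emits a length-prefixed Kryo payload and
// readFields(DataInput) restores it. A generic in-memory round trip (the helper
// name is hypothetical) looks like this:

import java.io.{ByteArrayInputStream, ByteArrayOutputStream, DataInputStream, DataOutputStream}
import org.apache.hadoop.io.Writable

def roundTrip[W <: Writable](original: W, empty: W): W =
{
  val buffer = new ByteArrayOutputStream()
  original.write(new DataOutputStream(buffer))
  empty.readFields(new DataInputStream(new ByteArrayInputStream(buffer.toByteArray)))
  empty // now carries the same payload as `original`
}
// ----------------------------------------------------------------------------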
61 | * 62 | * @param textPattern String to match against 63 | * @param end number of bytes to read till - checked against seeker 64 | * @param outputBuffer DataOutputBuffer where the data being read is written to 65 | * @return Boolean true if a match was found, false if EOF was found or stopping point "end" was crossed. 66 | */ 67 | def readUntilMatch(textPattern: String, end: Long, outputBuffer: Option[DataOutputBuffer]): Boolean = 68 | { 69 | readUntilMatch(textPattern.getBytes("UTF-8"), 0, end, outputBuffer) 70 | } 71 | 72 | /** 73 | * Reads the InputStream until a match is found or "end" number of bytes is reached. 74 | * 75 | * @param bytePattern Byte array to match against 76 | * @param end number of bytes to read till - checked against seeker 77 | * @return Boolean true if a match was found, false if EOF was found or stopping point "end" was crossed. 78 | */ 79 | def readUntilMatch(bytePattern: Array[Byte], end: Long): Boolean = 80 | { 81 | readUntilMatch(bytePattern, 0, end) 82 | } 83 | 84 | /** 85 | * Reads the InputStream while writing to a buffer, until a match is found or "end" number of bytes is reached. 86 | * 87 | * @param bytePattern Byte array to match against 88 | * @param end number of bytes to read till - checked against seeker 89 | * @param outputBuffer DataOutputBuffer where the data being read is written to 90 | * @return Boolean true if a match was found, false if EOF was found or stopping point "end" was crossed. 91 | */ 92 | def readUntilMatch(bytePattern: Array[Byte], end: Long, outputBuffer: Option[DataOutputBuffer]): Boolean = 93 | { 94 | readUntilMatch(bytePattern, 0, end, outputBuffer) 95 | } 96 | 97 | @tailrec private def readUntilMatch(matchBytes: Array[Byte], matchIter: Int, end: Long, outputBuffer: Option[DataOutputBuffer] = None): Boolean = 98 | { 99 | var i = matchIter 100 | val b: Int = this.in.read 101 | // EOF at the beginning 102 | if (b == -1) return false 103 | 104 | this.bytesRead += 1 105 | 106 | // Save to the buffer, if any provided 107 | outputBuffer.foreach(_.write(b)) 108 | 109 | // Check if we're matching 110 | if (b == matchBytes(i)) 111 | { 112 | i += 1 113 | // Whole of matchBytes matched successfully? 114 | if (i >= matchBytes.length) return true 115 | } 116 | else 117 | { 118 | // If not matched, start afresh and increment position. 
119 | i = 0 120 | if (this.currentPos != this.getPos) 121 | { 122 | this.lastMatchedPos = this.currentPos 123 | this.currentPos = this.getPos 124 | } 125 | } 126 | 127 | // See if we've passed the stop point 128 | if (i == 0 && this.seeker.getPos >= end) return false 129 | 130 | // Keep reading 131 | readUntilMatch(matchBytes, i, end, outputBuffer) 132 | } 133 | } -------------------------------------------------------------------------------- /extraction/src/main/scala/org/dbpedia/extraction/spark/io/input/DBpediaWikiPageInputFormat.scala: -------------------------------------------------------------------------------- 1 | package org.dbpedia.extraction.spark.io.input 2 | 3 | import org.apache.hadoop.io.{DataOutputBuffer, LongWritable} 4 | import org.apache.hadoop.io.compress.{CompressionCodecFactory, SplittableCompressionCodec} 5 | import org.apache.hadoop.fs.Path 6 | import scala.xml.XML 7 | import org.dbpedia.extraction.sources.XMLSource 8 | import org.apache.hadoop.mapreduce.lib.input.{FileSplit, FileInputFormat} 9 | import org.apache.hadoop.mapreduce.{JobContext, RecordReader, InputSplit, TaskAttemptContext} 10 | import org.apache.commons.logging.LogFactory 11 | import org.dbpedia.extraction.util.Language 12 | import org.dbpedia.extraction.spark.io.WikiPageWritable 13 | 14 | /** 15 | * Hadoop InputFormat that splits a Wikipedia dump file into WikiPageWritable (representing a single 16 | * org.dbpedia.extraction.sources.WikiPage) chunks. 17 | * 18 | * The WikiPageRecordReader class inside outputs a WikiPageWritable as value and the starting position (byte) as key. 19 | * 20 | * Note that dbpedia.wiki.language.wikicode needs to be set in Hadoop's Configuration. 21 | */ 22 | class DBpediaWikiPageInputFormat extends FileInputFormat[LongWritable, WikiPageWritable] 23 | { 24 | private val LOG = LogFactory.getLog(classOf[DBpediaWikiPageInputFormat]) 25 | private val LANGUAGE = "dbpedia.wiki.language.wikicode" 26 | 27 | protected override def isSplitable(context: JobContext, file: Path): Boolean = 28 | { 29 | val codec = new CompressionCodecFactory(context.getConfiguration).getCodec(file) 30 | if (null == codec) true else codec.isInstanceOf[SplittableCompressionCodec] 31 | } 32 | 33 | override def createRecordReader(genericSplit: InputSplit, context: TaskAttemptContext): RecordReader[LongWritable, WikiPageWritable] = 34 | { 35 | val split = genericSplit.asInstanceOf[FileSplit] 36 | LOG.info("getRecordReader start.....split=" + split) 37 | context.setStatus(split.toString) 38 | new WikiPageRecordReader(split, context) 39 | } 40 | 41 | private class WikiPageRecordReader(split: FileSplit, context: TaskAttemptContext) extends RecordReader[LongWritable, WikiPageWritable] 42 | { 43 | private var key: LongWritable = null 44 | private var value: WikiPageWritable = null 45 | 46 | private val conf = context.getConfiguration 47 | 48 | // Language code for this data dump 49 | private val language = Language(conf.get(LANGUAGE)) 50 | private val page = new DataOutputBuffer() 51 | private val inputStream = SeekableInputStream(split, 52 | split.getPath.getFileSystem(conf), 53 | new CompressionCodecFactory(conf)) 54 | private val matcher = new ByteMatcher(inputStream) 55 | 56 | private val (start, end) = 57 | { 58 | inputStream match 59 | { 60 | case SeekableSplitCompressedInputStream(sin) => 61 | (sin.getAdjustedStart, sin.getAdjustedEnd + 1) 62 | case _ => 63 | (split.getStart, split.getStart + split.getLength) 64 | } 65 | } 66 | 67 | private val pageBeginPattern = "<page>".getBytes("UTF-8") 68 | private val
pageEndPattern = "</page>".getBytes("UTF-8") 69 | 70 | override def close() = inputStream.close() 71 | 72 | override def getProgress: Float = 73 | { 74 | if (end == start) 1.0f else (getPos - start).asInstanceOf[Float] / (end - start).asInstanceOf[Float] 75 | } 76 | 77 | def getPos: Long = matcher.getPos 78 | 79 | override def initialize(genericInputSplit: InputSplit, context: TaskAttemptContext) = () 80 | 81 | override def nextKeyValue(): Boolean = 82 | { 83 | // Initialize key and value 84 | if (key == null) key = new LongWritable() 85 | if (value == null) value = new WikiPageWritable() 86 | 87 | if (matcher.getPos < end && matcher.readUntilMatch(pageBeginPattern, end)) 88 | { 89 | try 90 | { 91 | page.write(pageBeginPattern) 92 | if (matcher.readUntilMatch(pageEndPattern, end, Some(page))) 93 | { 94 | // Key is set to the position (bytes) where the page is found 95 | key.set(matcher.getPos) 96 | 97 | // Set value to the WikiPage created from the parsed <page>...</page> 98 | val elem = XML.loadString("<mediawiki>" + new String(page.getData.take(page.getLength), "UTF-8") + "</mediawiki>") 99 | value.set(XMLSource.fromXML(elem, language).head) 100 | 101 | return true 102 | } 103 | } 104 | finally 105 | { 106 | page.reset() 107 | } 108 | } 109 | false 110 | } 111 | 112 | override def getCurrentKey: LongWritable = key 113 | 114 | override def getCurrentValue: WikiPageWritable = value 115 | } 116 | 117 | } 118 | -------------------------------------------------------------------------------- /extraction/src/main/scala/org/dbpedia/extraction/spark/io/input/SeekableInputStream.scala: -------------------------------------------------------------------------------- 1 | package org.dbpedia.extraction.spark.io.input 2 | 3 | import org.apache.hadoop.io.compress._ 4 | import org.apache.hadoop.fs.{FileSystem, Seekable, FSDataInputStream} 5 | import java.io.{InputStream, FilterInputStream} 6 | import org.apache.hadoop.mapreduce.lib.input.FileSplit 7 | 8 | object SeekableInputStream 9 | { 10 | /** 11 | * Examines a FileSplit and returns the appropriate SeekableInputStream generated from it. 12 | * 13 | * @param split FileSplit to generate the SeekableInputStream from 14 | * @param fs FileSystem 15 | * @param compressionCodecs CompressionCodecFactory 16 | * @return SeekableInputStream to read from split 17 | */ 18 | def apply(split: FileSplit, fs: FileSystem, compressionCodecs: CompressionCodecFactory): SeekableInputStream = 19 | { 20 | val path = split.getPath 21 | val start = split.getStart 22 | val end = start + split.getLength 23 | 24 | val codec = compressionCodecs.getCodec(path) 25 | val dataInputStream = fs.open(path) 26 | 27 | codec match 28 | { 29 | case splitableCodec: SplittableCompressionCodec => 30 | // Is it a splittable compression input stream? 31 | val compressionInputStream = splitableCodec.createInputStream(dataInputStream, 32 | CodecPool.getDecompressor(codec), 33 | start, 34 | end, 35 | SplittableCompressionCodec.READ_MODE.BYBLOCK) 36 | SeekableSplitCompressedInputStream(compressionInputStream) 37 | case null => 38 | // Input stream not compressed? 39 | dataInputStream.seek(start) 40 | SeekableUncompressedInputStream(dataInputStream) 41 | case _ => 42 | // Non-splittable compression input stream?
No seeking or offsetting is needed 43 | assert(start == 0) 44 | val compressionInputStream = codec.createInputStream(dataInputStream, CodecPool.getDecompressor(codec)) 45 | SeekableCompressedInputStream(compressionInputStream, dataInputStream) 46 | } 47 | } 48 | } 49 | 50 | /** 51 | * A SeekableInputStream internally using a SplitCompressionInputStream, ie. compressed by a splittable compression method. 52 | */ 53 | case class SeekableSplitCompressedInputStream(sin: SplitCompressionInputStream) extends SeekableInputStream(sin, sin) 54 | 55 | /** 56 | * A compressed SeekableInputStream using a non-splittable compression input stream 57 | */ 58 | case class SeekableCompressedInputStream(cin: CompressionInputStream, fsin: FSDataInputStream) extends SeekableInputStream(cin, fsin) 59 | 60 | /** 61 | * SeekableInputStream without compression. 62 | */ 63 | case class SeekableUncompressedInputStream(fsin: FSDataInputStream) extends SeekableInputStream(fsin, fsin) 64 | 65 | /** 66 | * Wraps an InputStream and a corresponding Seekable to track its position. 67 | * 68 | * @param in InputStream to read binary data from 69 | * @param seeker Seekable for the InputStream "in" - used for keeping track of position in the InputStream 70 | */ 71 | sealed class SeekableInputStream(in: InputStream, seeker: Seekable) extends FilterInputStream(in) with Seekable 72 | { 73 | override def getPos: Long = seeker.getPos 74 | 75 | override def seek(pos: Long) = seeker.seek(pos) 76 | 77 | override def seekToNewSource(targetPos: Long): Boolean = seeker.seekToNewSource(targetPos) 78 | 79 | override def toString: String = in.toString 80 | } -------------------------------------------------------------------------------- /extraction/src/main/scala/org/dbpedia/extraction/spark/io/output/DBpediaCompositeOutputFormat.scala: -------------------------------------------------------------------------------- 1 | package org.dbpedia.extraction.spark.io.output 2 | 3 | import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat 4 | import org.dbpedia.extraction.spark.io.QuadSeqWritable 5 | import org.apache.hadoop.io.Text 6 | import org.apache.hadoop.mapreduce.{JobContext, RecordWriter, TaskAttemptContext} 7 | import scala.collection.mutable 8 | import org.dbpedia.extraction.destinations.formatters.UriPolicy 9 | import org.dbpedia.extraction.util.ConfigUtils 10 | import org.apache.commons.io.FilenameUtils 11 | import java.io.File 12 | import org.apache.hadoop.fs.{Path, FileSystem} 13 | 14 | /** 15 | * OutputFormat implementation that uses the configured Formatters to write Quads to respective datasets 16 | * through the DBpediaDatasetOutputFormat class. This class uses as many DBpediaDatasetOutputFormat objects 17 | * as there are configured formats. Formats are read in from the provided extraction config properties file. 18 | * This class handles configuration and Formatters, while DBpediaDatasetOutputFormat handles dividing the Quads 19 | * into datasets. 20 | * 21 | * 1. To use this OutputFormat three Strings need to be set in Hadoop's Configuration: 22 | * dbpedia.wiki.name - Config.wikiName, the wiki suffix (eg. 
wiki) 23 | * dbpedia.wiki.language.wikicode - Language wiki code of the input wiki dump 24 | * dbpedia.wiki.date - Wiki dump date in YYYYMMDD format 25 | * dbpedia.output.overwrite - Boolean, if set to true, output files will be overwritten if they already exist, 26 | * or else an IOException will be thrown (which is also the default behaviour) - this is actually for MultipleTextOutputFormat 27 | * dbpedia.config.properties - HDFS Path at which the extraction config properties file is stored 28 | * 29 | * 2. The extraction config properties file needs to be added to the distributed cache - the HDFS location should be 30 | * configured using dbpedia.config.properties. 31 | * 32 | * 3. Also, the output needs to be grouped by dataset such that each key is a Text representing the dataset 33 | * to which the Quads in the value belong to. Example key: article_categories 34 | * 35 | * NOTE: When using this with Spark set only one core per worker. 36 | * 37 | * Output will look like Hadoop leaf files (eg. part-r-00000) inside directories like enwiki-20140614-article-categories.tql. 38 | * The files will be compressed using the specified compression codec. 39 | * 40 | * @see DBpediaDatasetOutputFormat 41 | */ 42 | class DBpediaCompositeOutputFormat extends TextOutputFormat[Text, QuadSeqWritable] 43 | { 44 | private val CONFIG_PROPERTIES = "dbpedia.config.properties" 45 | private val WIKI = "dbpedia.wiki.name" 46 | private val LANGUAGE = "dbpedia.wiki.language.wikicode" 47 | private val DATE = "dbpedia.wiki.date" 48 | 49 | private class DBpediaCompositeRecordWriter(context: TaskAttemptContext) extends RecordWriter[Text, QuadSeqWritable] 50 | { 51 | private val recordWriters = mutable.Map[String, RecordWriter[Text, QuadSeqWritable]]() 52 | private val conf = context.getConfiguration 53 | private val configPropertiesDCPath = conf.get(CONFIG_PROPERTIES) 54 | private val wikiName = conf.get(WIKI) 55 | private val langCode = conf.get(LANGUAGE) 56 | private val date = conf.get(DATE) 57 | private val localConfigPropertiesFile = new Path("./config.properties") 58 | private val formatters = 59 | { 60 | // Deserialize the config Properties object to get the Formatters 61 | println(context.getCacheFiles.mkString("\n")) 62 | val configProperties = context.getCacheFiles.find(_.getPath == configPropertiesDCPath).get 63 | 64 | val fs = FileSystem.get(conf) 65 | // copy config file from distributed cache to raw local FS 66 | fs.copyToLocalFile(false, new Path(configProperties), localConfigPropertiesFile, true) 67 | 68 | val config = ConfigUtils.loadConfig(localConfigPropertiesFile.toString, "UTF-8") 69 | UriPolicy.parseFormats(config, "uri-policy", "format") 70 | } 71 | 72 | /** 73 | * Note: This method is not synchronized, keeping with the rest of the Hadoop code in this framework. 74 | * When using this with Spark set only one core per worker to ensure that only one thread accesses 75 | * this method per JVM. 76 | */ 77 | override def write(key: Text, value: QuadSeqWritable) 78 | { 79 | for ((suffix, format) <- formatters) 80 | { 81 | // Each RecordReader writes Quads to corresponding datasets depending upon the Text key. 82 | // See DBpediaDatasetOutputFormat and MultipleTextOutputFormat for details. 
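// ----------------------------------------------------------------------------
// [Editor's note: illustrative sketch, not part of the original sources; it
// only restates the configuration keys documented above. The output path, the
// grouped RDD and the way the config file reaches the distributed cache are
// assumptions, not code from this project (imports assumed).]

val conf = new org.apache.hadoop.conf.Configuration()
conf.set("dbpedia.wiki.name", "wiki")
conf.set("dbpedia.wiki.language.wikicode", "en")
conf.set("dbpedia.wiki.date", "20140614")
conf.setBoolean("dbpedia.output.overwrite", true)
conf.set("dbpedia.config.properties", "/user/dbpedia/config.properties") // must also be in the distributed cache, see point 2 above

// quadsByDataset: hypothetical RDD[(Text, QuadSeqWritable)], one entry per dataset name
quadsByDataset.saveAsNewAPIHadoopFile("hdfs:///dbpedia/output/enwiki-20140614",
  classOf[Text], classOf[QuadSeqWritable], classOf[DBpediaCompositeOutputFormat], conf)
// ----------------------------------------------------------------------------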
83 | val writer = recordWriters.getOrElseUpdate(suffix, new DBpediaDatasetOutputFormat( 84 | langCode, 85 | wikiName, 86 | date, 87 | suffix, 88 | format 89 | ).getRecordWriter(context)) 90 | writer.write(key, value) 91 | } 92 | } 93 | 94 | override def close(context: TaskAttemptContext) = recordWriters.foreach(_._2.close(context)) 95 | } 96 | 97 | override def getRecordWriter(context: TaskAttemptContext): RecordWriter[Text, QuadSeqWritable] = new DBpediaCompositeRecordWriter(context) 98 | 99 | override def checkOutputSpecs(job: JobContext) = () // allow overwriting output directory 100 | } 101 | -------------------------------------------------------------------------------- /extraction/src/main/scala/org/dbpedia/extraction/spark/io/output/DBpediaDatasetOutputFormat.scala: -------------------------------------------------------------------------------- 1 | package org.dbpedia.extraction.spark.io.output 2 | 3 | import org.apache.hadoop.io.{Text, NullWritable} 4 | import org.dbpedia.extraction.destinations.formatters.Formatter 5 | import org.apache.hadoop.mapreduce.{TaskAttemptContext, RecordWriter} 6 | import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat.LineRecordWriter 7 | import org.dbpedia.extraction.spark.io.QuadSeqWritable 8 | import java.io.DataOutputStream 9 | import org.apache.hadoop.io.compress.CompressionCodec 10 | 11 | /** 12 | * OutputFormat implementation that writes Quads to respective datasets depending upon the key, after applying 13 | * a given Formatter. This class extends MultipleTextOutputFormat which allows it to write to multiple locations 14 | * (for multiple datasets) depending upon custom criteria. 15 | * 16 | * The output needs to be grouped by dataset such that each key is a Text representing the dataset to which 17 | * the Quads in the value belong to. Example key: article_categories 18 | * 19 | * @param langWikiCode Language wiki code of the input wiki dump 20 | * @param wikiNameSuffix Config.wikiName (eg. wiki) 21 | * @param date Wiki dump date in YYYYMMDD format 22 | * @param outputSuffix Output suffix corresponding to formatter (eg. tql) 23 | * @param formatter Formatter object used to render the Quad objects according to a specific format 24 | */ 25 | class DBpediaDatasetOutputFormat(langWikiCode: String, 26 | wikiNameSuffix: String, 27 | date: String, 28 | outputSuffix: String, 29 | formatter: Formatter) extends MultipleTextOutputFormat[Text, QuadSeqWritable] 30 | { 31 | /** 32 | * Construct the underlying RecordWriter. By default creates a LineRecordWriter that is used by 33 | * TextOutputFormat by default. 34 | * 35 | * @param context TaskAttemptContext 36 | * @param out DataOutputStream where output data is written to 37 | * @param keyValueSeparator String separator between output key and value 38 | * @param codec Option[CompressionCodec] for handling compression 39 | * @return A RecordWriter object over the given DataOutputStream 40 | */ 41 | override protected def getBaseRecordWriter(context: TaskAttemptContext, 42 | out: DataOutputStream, 43 | keyValueSeparator: String, 44 | codec: Option[CompressionCodec] = None): RecordWriter[Text, QuadSeqWritable] = 45 | { 46 | // Get a LineRecordWriter (the usual RecordWriter used by TextOutputFormat) that ignores keys and writes Text outputs. 47 | val lineWriter = codec match 48 | { 49 | case Some(c) => 50 | // Have we an output compression codec? 
51 | new LineRecordWriter[NullWritable, Text]( 52 | new DataOutputStream(c.createOutputStream(out)), 53 | keyValueSeparator 54 | ) 55 | case _ => 56 | new LineRecordWriter[NullWritable, Text](out, keyValueSeparator) 57 | } 58 | 59 | new DBpediaDatasetRecordWriter(lineWriter) 60 | } 61 | 62 | /** 63 | * If inferCodecFromPathName is set to true, the output compression codec will be inferred from the suffix/extension 64 | * in pathName (eg. tql.gz implies GzipCodec is used), otherwise it uses Hadoop configuration settings. 65 | */ 66 | override protected val inferCodecFromPathName = true 67 | 68 | /** 69 | * Generate the output file name (the directory where the leaf part-* files will be written to) 70 | * based on the given key and value. The default behavior is that the file name does not depend on them. 71 | * That is, by default this method returns an empty String. 72 | * 73 | * @param key the key of the output data 74 | * @return generated file name 75 | */ 76 | override protected def generateFileNameForKeyValue(key: Text, value: QuadSeqWritable): String = 77 | { 78 | val datasetName = key.toString 79 | // eg. enwiki-20140614-article-categories.tql 80 | s"$langWikiCode$wikiNameSuffix-$date-${datasetName.replace('_', '-')}.$outputSuffix" 81 | } 82 | 83 | /** 84 | * RecordWriter that wraps a LineRecordWriter, applies the given Formatter on a Seq[Quad] and writes to 85 | * the LineRecordWriter. 86 | */ 87 | private class DBpediaDatasetRecordWriter(lineWriter: LineRecordWriter[NullWritable, Text]) extends RecordWriter[Text, QuadSeqWritable] 88 | { 89 | private val text = new Text("") 90 | private val nullKey = NullWritable.get() 91 | 92 | // Begin writing split with formatter header 93 | text.set(formatter.header.dropRight(1)) // remove newline from header 94 | lineWriter.write(nullKey, text) 95 | 96 | /** 97 | * Note: This method is not synchronized, keeping with the rest of the Hadoop code in this framework. 98 | * When using this with Spark, set only one core per worker to ensure that only one thread accesses 99 | * this method per JVM. 
100 | */ 101 | override def write(key: Text, value: QuadSeqWritable) = 102 | { 103 | for (quad <- value.get) 104 | { 105 | text.set(formatter.render(quad).dropRight(1)) // remove newline from rendered output 106 | lineWriter.write(nullKey, text) 107 | } 108 | } 109 | 110 | override def close(context: TaskAttemptContext) = 111 | { 112 | text.set(formatter.footer.dropRight(1)) // remove newline from footer 113 | lineWriter.write(nullKey, text) 114 | lineWriter.close(context) 115 | } 116 | } 117 | 118 | } 119 | -------------------------------------------------------------------------------- /extraction/src/main/scala/org/dbpedia/extraction/spark/io/output/MultipleTextOutputFormat.scala: -------------------------------------------------------------------------------- 1 | package org.dbpedia.extraction.spark.io.output 2 | 3 | import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat 4 | import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat._ 5 | import org.apache.hadoop.mapreduce.{TaskAttemptContext, RecordWriter} 6 | import scala.collection.mutable 7 | import org.apache.hadoop.fs.Path 8 | import org.apache.hadoop.util.ReflectionUtils 9 | import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat.LineRecordWriter 10 | import java.io.DataOutputStream 11 | import org.apache.hadoop.io.compress.{CompressionCodecFactory, CompressionCodec} 12 | 13 | /** 14 | * This class extends TextOutputFormat and allows writing output to multiple output files depending upon custom criteria. It filters 15 | * every key-value pair and routes them to the corresponding locations. 16 | * 17 | * Configuration variables: 18 | * dbpedia.output.overwrite - Boolean, if set to true, output files will be overwritten if they already exist, 19 | * or else an IOException will be thrown (which is also the default behaviour) 20 | */ 21 | class MultipleTextOutputFormat[K, V] extends TextOutputFormat[K, V] 22 | { 23 | private val OVERWRITE = "dbpedia.output.overwrite" 24 | 25 | private class MultipleTextRecordWriter(context: TaskAttemptContext) extends RecordWriter[K, V] 26 | { 27 | private val recordWriters = mutable.Map[String, RecordWriter[K, V]]() 28 | 29 | /** 30 | * Note: This method is not synchronized, keeping with the rest of the Hadoop code in this framework. 31 | * When using this with Spark, set only one core per worker to ensure that only one thread accesses 32 | * this method per JVM. 33 | */ 34 | override def write(key: K, value: V) 35 | { 36 | // Generate the path depending upon key-value pair 37 | val finalPath = generateFileNameForKeyValue(key, value) 38 | 39 | // Extract the actual key and value 40 | val actualKey = generateActualKey(key, value) 41 | val actualValue = generateActualValue(key, value) 42 | 43 | // Get the RecordWriter for finalPath or create one if needed 44 | val writer = recordWriters.getOrElseUpdate(finalPath, createRecordWriter(finalPath, context)) 45 | writer.write(actualKey, actualValue) 46 | } 47 | 48 | override def close(context: TaskAttemptContext) = recordWriters.foreach(_._2.close(context)) 49 | } 50 | 51 | override def getRecordWriter(context: TaskAttemptContext): RecordWriter[K, V] = new MultipleTextRecordWriter(context) 52 | 53 | /** 54 | * Create a new RecordWriter based on the modified output path and the RecordWriter implementation 55 | * returned by getBaseRecordWriter().
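// ----------------------------------------------------------------------------
// [Editor's note: illustrative sketch, not part of the original sources.]
// A minimal subclass that routes each record into a directory named after its
// key, using the generateFileNameForKeyValue hook defined further below in this
// class; the subclass name is hypothetical.

class KeyPartitionedTextOutputFormat extends MultipleTextOutputFormat[org.apache.hadoop.io.Text, org.apache.hadoop.io.Text]
{
  override protected def generateFileNameForKeyValue(key: org.apache.hadoop.io.Text, value: org.apache.hadoop.io.Text): String = key.toString
}
// ----------------------------------------------------------------------------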
56 | */ 57 | private def createRecordWriter(pathName: String, context: TaskAttemptContext): RecordWriter[K, V] = 58 | { 59 | val conf = context.getConfiguration 60 | val keyValueSeparator = conf.get(TextOutputFormat.SEPERATOR, "\t") 61 | // If overwriteOutput is set to true, output files will be overwritten if they already exist, 62 | // or else an IOException will be thrown (which is also the default behaviour) 63 | val overwriteOutput = conf.getBoolean(OVERWRITE, false) 64 | 65 | val (codec, file) = if (inferCodecFromPathName) 66 | { 67 | val extension = pathName.substring(pathName.lastIndexOf('.')) 68 | // Get modified suffixed path 69 | val file = getModifiedWorkFile(pathName, context, extension) 70 | // Returns Option[CompressionCodec] or None depending on file extension 71 | val codec = Option(new CompressionCodecFactory(conf).getCodec(file)) 72 | (codec, file) 73 | } 74 | else 75 | { 76 | val isCompressed = getCompressOutput(context) 77 | if (isCompressed) 78 | { 79 | // Get the CompressionCodec from job configuration 80 | val codecClass = getOutputCompressorClass(context, classOf[CompressionCodec]) 81 | val codec = ReflectionUtils.newInstance(codecClass, conf) 82 | val file = getModifiedWorkFile(pathName, context, codec.getDefaultExtension) 83 | (Some(codec), file) 84 | } 85 | else 86 | { 87 | val file = getModifiedWorkFile(pathName, context, "") 88 | (None, file) 89 | } 90 | } 91 | 92 | val fs = file.getFileSystem(conf) 93 | val fileOutputStream = fs.create(file, overwriteOutput) 94 | 95 | getBaseRecordWriter(context, fileOutputStream, keyValueSeparator, codec) 96 | } 97 | 98 | /** 99 | * Gets the default output path and inserts directoryName between the parent directory and leaf file (part-*). 100 | */ 101 | private def getModifiedWorkFile(directoryName: String, 102 | context: TaskAttemptContext, 103 | extension: String): Path = 104 | { 105 | val path = super.getDefaultWorkFile(context, extension) 106 | new Path(new Path(path.getParent, directoryName), path.getName) 107 | } 108 | 109 | /** 110 | * If inferCodecFromPathName is set to true, the output compression codec will be inferred from the suffix/extension 111 | * in pathName (eg. foobar.gz implies GzipCodec is used), otherwise it uses Hadoop configuration settings. 112 | * 113 | * The default behaviour is to use Hadoop configuration settings. 114 | */ 115 | protected val inferCodecFromPathName: Boolean = false 116 | 117 | /** 118 | * Construct the underlying RecordWriter. By default creates a LineRecordWriter that is used by 119 | * TextOutputFormat by default. 120 | * 121 | * @param context TaskAttemptContext 122 | * @param out DataOutputStream where output data is written to 123 | * @param keyValueSeparator String separator between output key and value 124 | * @param codec Option[CompressionCodec] for handling compression 125 | * @return A RecordWriter object over the given DataOutputStream 126 | */ 127 | protected def getBaseRecordWriter(context: TaskAttemptContext, 128 | out: DataOutputStream, 129 | keyValueSeparator: String, 130 | codec: Option[CompressionCodec] = None): RecordWriter[K, V] = 131 | { 132 | codec match 133 | { 134 | case Some(c) => 135 | // Have we an output compression codec? 
136 | new LineRecordWriter[K, V]( 137 | new DataOutputStream(c.createOutputStream(out)), 138 | keyValueSeparator 139 | ) 140 | case _ => 141 | new LineRecordWriter[K, V](out, keyValueSeparator) 142 | } 143 | } 144 | 145 | /** 146 | * Generate the output file name (the directory where the leaf part-* files will be written to) 147 | * based on the given key and value. The default behavior is that the file name does not depend on them. 148 | * That is, by default this method returns an empty String. 149 | * 150 | * @param key the key of the output data 151 | * @return generated file name 152 | */ 153 | protected def generateFileNameForKeyValue(key: K, value: V): String = "" 154 | 155 | /** 156 | * Generate the actual key from the given key/value. The default behavior is that 157 | * the actual key is equal to the given key. 158 | * 159 | * @param key the key of the output data 160 | * @param value the value of the output data 161 | * @return the actual key derived from the given key/value 162 | */ 163 | protected def generateActualKey(key: K, value: V): K = key 164 | 165 | /** 166 | * Generate the actual value from the given key and value. The default behavior is that 167 | * the actual value is equal to the given value. 168 | * 169 | * @param key the key of the output data 170 | * @param value the value of the output data 171 | * @return the actual value derived from the given key/value 172 | */ 173 | protected def generateActualValue(key: K, value: V): V = value 174 | } 175 | -------------------------------------------------------------------------------- /extraction/src/main/scala/org/dbpedia/extraction/spark/serialize/KryoExtractionRegistrator.scala: -------------------------------------------------------------------------------- 1 | package org.dbpedia.extraction.spark.serialize 2 | 3 | import org.apache.spark.serializer.KryoRegistrator 4 | import com.esotericsoftware.kryo.Kryo 5 | import scala.Console._ 6 | import org.dbpedia.extraction.sources.WikiPage 7 | import org.dbpedia.extraction.wikiparser.{Namespace, WikiTitle} 8 | import org.dbpedia.extraction.util.Language 9 | import java.util.logging.Logger 10 | import org.dbpedia.extraction.dataparser.ParserUtils 11 | 12 | /** 13 | * It's best to register the classes that will be serialized/deserialized with Kryo. 
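// ----------------------------------------------------------------------------
// [Editor's note: illustrative sketch, not part of the original sources.]
// A registrator such as the one below is normally wired into Spark through the
// standard configuration keys; how this project sets them is not shown here,
// so treat the snippet as an assumption about typical usage.

val sparkConf = new org.apache.spark.SparkConf()
  .set("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
  .set("spark.kryo.registrator", "org.dbpedia.extraction.spark.serialize.KryoExtractionRegistrator")
// ----------------------------------------------------------------------------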
14 | */ 15 | class KryoExtractionRegistrator extends KryoRegistrator 16 | { 17 | override def registerClasses(kryo: Kryo) 18 | { 19 | kryo.register(classOf[Array[Object]]) 20 | kryo.register(classOf[org.dbpedia.extraction.dataparser.GeoCoordinateParser]) 21 | kryo.register(classOf[org.dbpedia.extraction.dataparser.SingleGeoCoordinateParser]) 22 | kryo.register(classOf[org.dbpedia.extraction.destinations.Dataset]) 23 | kryo.register(classOf[org.dbpedia.extraction.destinations.Quad]) 24 | kryo.register(classOf[org.dbpedia.extraction.dump.extract.DistConfigLoader]) 25 | kryo.register(classOf[org.dbpedia.extraction.dump.extract.DumpExtractionContext]) 26 | kryo.register(classOf[org.dbpedia.extraction.dump.extract.DumpExtractionContextWrapper]) 27 | kryo.register(classOf[org.dbpedia.extraction.mappings.ArticleCategoriesExtractor]) 28 | kryo.register(classOf[org.dbpedia.extraction.mappings.ArticlePageExtractor]) 29 | kryo.register(classOf[org.dbpedia.extraction.mappings.ArticleTemplatesExtractor]) 30 | kryo.register(classOf[org.dbpedia.extraction.mappings.CategoryLabelExtractor]) 31 | kryo.register(classOf[org.dbpedia.extraction.mappings.CompositeParseExtractor]) 32 | kryo.register(classOf[org.dbpedia.extraction.mappings.DistRedirects]) 33 | kryo.register(classOf[org.dbpedia.extraction.mappings.ExternalLinksExtractor]) 34 | kryo.register(classOf[org.dbpedia.extraction.mappings.GeoExtractor]) 35 | kryo.register(classOf[org.dbpedia.extraction.mappings.InfoboxExtractor]) 36 | kryo.register(classOf[org.dbpedia.extraction.mappings.InterLanguageLinksExtractor]) 37 | kryo.register(classOf[org.dbpedia.extraction.mappings.LabelExtractor]) 38 | kryo.register(classOf[org.dbpedia.extraction.mappings.PageIdExtractor]) 39 | kryo.register(classOf[org.dbpedia.extraction.mappings.PageLinksExtractor]) 40 | kryo.register(classOf[org.dbpedia.extraction.mappings.ProvenanceExtractor]) 41 | kryo.register(classOf[org.dbpedia.extraction.mappings.RedirectExtractor]) 42 | kryo.register(classOf[org.dbpedia.extraction.mappings.Redirects]) 43 | kryo.register(classOf[org.dbpedia.extraction.mappings.RevisionIdExtractor]) 44 | kryo.register(classOf[org.dbpedia.extraction.mappings.RootExtractor]) 45 | kryo.register(classOf[org.dbpedia.extraction.mappings.SkosCategoriesExtractor]) 46 | kryo.register(classOf[org.dbpedia.extraction.dataparser.ParserUtils]) 47 | kryo.register(classOf[org.dbpedia.extraction.ontology.datatypes.Datatype]) 48 | kryo.register(classOf[org.dbpedia.extraction.ontology.OntologyClass]) 49 | kryo.register(classOf[org.dbpedia.extraction.ontology.OntologyDatatypeProperty]) 50 | kryo.register(classOf[org.dbpedia.extraction.ontology.OntologyObjectProperty]) 51 | kryo.register(classOf[org.dbpedia.extraction.ontology.OntologyProperty]) 52 | kryo.register(Class.forName("scala.collection.immutable.$colon$colon")) 53 | kryo.register(Class.forName("scala.collection.immutable.Map$EmptyMap$")) 54 | kryo.register(Class.forName("scala.collection.immutable.Nil$")) 55 | kryo.register(Class.forName("scala.collection.immutable.Set$EmptySet$")) 56 | kryo.register(classOf[scala.collection.mutable.ArrayBuffer[_]]) 57 | kryo.register(classOf[Array[scala.collection.Seq[_]]]) 58 | kryo.register(classOf[scala.runtime.BoxedUnit]) 59 | kryo.register(classOf[Array[scala.Tuple2[_,_]]]) 60 | kryo.register(classOf[scala.util.matching.Regex]) 61 | kryo.register(classOf[WikiPage], new WikiPageSerializer) 62 | kryo.register(classOf[WikiTitle], new WikiTitleSerializer) 63 | kryo.register(classOf[Namespace]) 64 | kryo.register(classOf[Language], 
new LanguageSerializer) 65 | kryo.register(classOf[Logger], new LoggerSerializer) 66 | kryo.register(classOf[ParserUtils], new ParserUtilsSerializer) 67 | } 68 | } -------------------------------------------------------------------------------- /extraction/src/main/scala/org/dbpedia/extraction/spark/serialize/KryoSerializationWrapper.scala: -------------------------------------------------------------------------------- 1 | package org.dbpedia.extraction.spark.serialize 2 | 3 | import scala.reflect.ClassTag 4 | 5 | /** 6 | * A wrapper around some unserializable objects that make them both Java 7 | * serializable. Internally, Kryo is used for serialization. 8 | * 9 | * Use KryoSerializationWrapper(value) to create a wrapper. 10 | */ 11 | class KryoSerializationWrapper[T: ClassTag] extends Serializable 12 | { 13 | 14 | @transient var value: T = _ 15 | 16 | private var valueSerialized: Array[Byte] = _ 17 | 18 | // The getter and setter for valueSerialized is used for XML serialization. 19 | def getValueSerialized(): Array[Byte] = 20 | { 21 | valueSerialized = KryoSerializer.serialize(value) 22 | valueSerialized 23 | } 24 | 25 | def setValueSerialized(bytes: Array[Byte]) = 26 | { 27 | valueSerialized = bytes 28 | value = KryoSerializer.deserialize[T](valueSerialized) 29 | } 30 | 31 | // Used for Java serialization. 32 | private def writeObject(out: java.io.ObjectOutputStream) 33 | { 34 | getValueSerialized() 35 | out.defaultWriteObject() 36 | } 37 | 38 | private def readObject(in: java.io.ObjectInputStream) 39 | { 40 | in.defaultReadObject() 41 | setValueSerialized(valueSerialized) 42 | } 43 | } 44 | 45 | 46 | object KryoSerializationWrapper 47 | { 48 | def apply[T: ClassTag](value: T): KryoSerializationWrapper[T] = 49 | { 50 | val wrapper = new KryoSerializationWrapper[T] 51 | wrapper.value = value 52 | wrapper 53 | } 54 | } -------------------------------------------------------------------------------- /extraction/src/main/scala/org/dbpedia/extraction/spark/serialize/KryoSerializer.scala: -------------------------------------------------------------------------------- 1 | package org.dbpedia.extraction.spark.serialize 2 | 3 | import java.nio.ByteBuffer 4 | 5 | import org.apache.spark.{SparkConf, SparkEnv} 6 | import org.apache.spark.serializer.{KryoSerializer => SparkKryoSerializer} 7 | import scala.reflect.ClassTag 8 | 9 | 10 | /** 11 | * Java object serialization using Kryo. This is much more efficient, but Kryo 12 | * sometimes is buggy to use. We use this mainly to serialize the object 13 | * inspectors. 
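// ----------------------------------------------------------------------------
// [Editor's note: illustrative sketch, not part of the original sources.]
// KryoSerializationWrapper above lets a Kryo-encoded payload travel through
// plain Java serialization (e.g. inside a Spark closure): writeObject captures
// the Kryo bytes, readObject restores `value`. A self-contained round trip,
// using an arbitrary array as the payload:

import java.io.{ByteArrayInputStream, ByteArrayOutputStream, ObjectInputStream, ObjectOutputStream}

val wrapped = KryoSerializationWrapper(Array("foo", "bar"))
val buffer = new ByteArrayOutputStream()
val oos = new ObjectOutputStream(buffer)
oos.writeObject(wrapped) // Kryo bytes are produced here via getValueSerialized()
oos.close()
val restored = new ObjectInputStream(new ByteArrayInputStream(buffer.toByteArray))
  .readObject().asInstanceOf[KryoSerializationWrapper[Array[String]]]
assert(restored.value.toSeq == Seq("foo", "bar"))
// ----------------------------------------------------------------------------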
14 | */ 15 | object KryoSerializer 16 | { 17 | 18 | @transient lazy val ser: SparkKryoSerializer = 19 | { 20 | val sparkConf = Option(SparkEnv.get).map(_.conf).getOrElse(new SparkConf()) 21 | new SparkKryoSerializer(sparkConf) 22 | } 23 | 24 | def serialize[T: ClassTag](o: T): Array[Byte] = 25 | { 26 | ser.newInstance().serialize(o).array() 27 | } 28 | 29 | def deserialize[T: ClassTag](bytes: Array[Byte]): T = 30 | { 31 | ser.newInstance().deserialize[T](ByteBuffer.wrap(bytes)) 32 | } 33 | } -------------------------------------------------------------------------------- /extraction/src/main/scala/org/dbpedia/extraction/spark/serialize/LanguageSerializer.scala: -------------------------------------------------------------------------------- 1 | package org.dbpedia.extraction.spark.serialize 2 | 3 | import com.esotericsoftware.kryo.{Kryo, Serializer} 4 | import com.esotericsoftware.kryo.io.{Input, Output} 5 | import org.dbpedia.extraction.util.Language 6 | 7 | /** 8 | * Kryo serializer for org.dbpedia.extraction.util.Language 9 | */ 10 | class LanguageSerializer extends Serializer[Language] 11 | { 12 | override def write(kryo: Kryo, output: Output, language: Language) 13 | { 14 | output.writeString(language.wikiCode) 15 | } 16 | 17 | override def read(kryo: Kryo, input: Input, languageClass: Class[Language]): Language = 18 | { 19 | val wikiCode = input.readString() 20 | Language(wikiCode) 21 | } 22 | } 23 | -------------------------------------------------------------------------------- /extraction/src/main/scala/org/dbpedia/extraction/spark/serialize/LocaleSerializer.scala: -------------------------------------------------------------------------------- 1 | package org.dbpedia.extraction.spark.serialize 2 | 3 | import com.esotericsoftware.kryo.{Kryo, Serializer} 4 | import com.esotericsoftware.kryo.io.{Output, Input} 5 | import java.util.Locale 6 | 7 | /** 8 | * Kryo serializer for java.util.Locale 9 | */ 10 | class LocaleSerializer extends Serializer[Locale] 11 | { 12 | override def write(kryo: Kryo, output: Output, locale: Locale) 13 | { 14 | output.writeAscii(locale.getLanguage) 15 | output.writeAscii(locale.getCountry) 16 | output.writeAscii(locale.getVariant) 17 | } 18 | 19 | override def read(kryo: Kryo, input: Input, localeClass: Class[Locale]): Locale = 20 | { 21 | new Locale(input.readString(), input.readString(), input.readString()) 22 | } 23 | } 24 | -------------------------------------------------------------------------------- /extraction/src/main/scala/org/dbpedia/extraction/spark/serialize/LoggerSerializer.scala: -------------------------------------------------------------------------------- 1 | package org.dbpedia.extraction.spark.serialize 2 | 3 | import com.esotericsoftware.kryo.{Kryo, Serializer} 4 | import com.esotericsoftware.kryo.io.{Input, Output} 5 | import org.dbpedia.extraction.util.Language 6 | import java.util.logging.Logger 7 | 8 | /** 9 | * Kryo serializer for org.dbpedia.extraction.util.Language 10 | */ 11 | class LoggerSerializer extends Serializer[Logger] 12 | { 13 | override def write(kryo: Kryo, output: Output, logger: Logger) 14 | { 15 | output.writeString(logger.getName) 16 | } 17 | 18 | override def read(kryo: Kryo, input: Input, loggerClass: Class[Logger]): Logger = 19 | { 20 | val className = input.readString() 21 | Logger.getLogger(className) 22 | } 23 | } 24 | -------------------------------------------------------------------------------- /extraction/src/main/scala/org/dbpedia/extraction/spark/serialize/ParserUtilsSerializer.scala: 
-------------------------------------------------------------------------------- 1 | package org.dbpedia.extraction.spark.serialize 2 | 3 | import com.esotericsoftware.kryo.{Kryo, Serializer} 4 | import com.esotericsoftware.kryo.io.{Input, Output} 5 | import org.dbpedia.extraction.dataparser.ParserUtils 6 | import org.dbpedia.extraction.util.Language 7 | 8 | /** 9 | * Kryo serializer for org.dbpedia.extraction.dataparser.ParserUtils 10 | */ 11 | class ParserUtilsSerializer extends Serializer[ParserUtils] 12 | { 13 | override def write(kryo: Kryo, output: Output, parserUtils: ParserUtils) { 14 | kryo.writeObjectOrNull(output, parserUtils.context.language, new LanguageSerializer) 15 | } 16 | 17 | override def read(kryo: Kryo, input: Input, parserUtilsClass: Class[ParserUtils]): ParserUtils = { 18 | val lang = kryo.readObjectOrNull(input, classOf[Language], new LanguageSerializer) 19 | new ParserUtils(new {def language: Language = lang}) 20 | } 21 | } -------------------------------------------------------------------------------- /extraction/src/main/scala/org/dbpedia/extraction/spark/serialize/WikiPageSerializer.scala: -------------------------------------------------------------------------------- 1 | package org.dbpedia.extraction.spark.serialize 2 | 3 | import com.esotericsoftware.kryo.{Kryo, Serializer} 4 | import org.dbpedia.extraction.sources.WikiPage 5 | import com.esotericsoftware.kryo.io.{Output, Input} 6 | import org.dbpedia.extraction.wikiparser.WikiTitle 7 | 8 | /** 9 | * Kryo serializer for org.dbpedia.extraction.sources.WikiPage 10 | */ 11 | class WikiPageSerializer extends Serializer[WikiPage] 12 | { 13 | override def write(kryo: Kryo, output: Output, wikiPage: WikiPage) 14 | { 15 | kryo.writeObjectOrNull(output, wikiPage.title, new WikiTitleSerializer) 16 | kryo.writeObjectOrNull(output, wikiPage.redirect, new WikiTitleSerializer) 17 | output.writeLong(wikiPage.id) 18 | output.writeLong(wikiPage.revision) 19 | output.writeLong(wikiPage.timestamp) 20 | output.writeLong(wikiPage.contributorID) 21 | output.writeString(wikiPage.contributorName) 22 | output.writeString(wikiPage.source) 23 | output.writeString(wikiPage.format) 24 | } 25 | 26 | override def read(kryo: Kryo, input: Input, wikiPageClass: Class[WikiPage]): WikiPage = 27 | { 28 | val title = kryo.readObjectOrNull(input, classOf[WikiTitle], new WikiTitleSerializer) 29 | val redirect = kryo.readObjectOrNull(input, classOf[WikiTitle], new WikiTitleSerializer) 30 | val id = input.readLong() 31 | val revision = input.readLong() 32 | val timestamp = input.readLong() 33 | val contributorID = input.readLong() 34 | val contributorName = input.readString() 35 | val source = input.readString() 36 | val format = input.readString() 37 | new WikiPage(title, redirect, id, revision, timestamp, contributorID, contributorName, source, format) 38 | } 39 | } 40 | -------------------------------------------------------------------------------- /extraction/src/main/scala/org/dbpedia/extraction/spark/serialize/WikiTitleSerializer.scala: -------------------------------------------------------------------------------- 1 | package org.dbpedia.extraction.spark.serialize 2 | 3 | import com.esotericsoftware.kryo.{Kryo, Serializer} 4 | import com.esotericsoftware.kryo.serializers.FieldSerializer 5 | import com.esotericsoftware.kryo.io.{Input, Output} 6 | import org.dbpedia.extraction.wikiparser.{Namespace, WikiTitle} 7 | import org.dbpedia.extraction.util.Language 8 | 9 | /** 10 | * Kryo serializer for org.dbpedia.extraction.wikiparser.WikiTitle 11 
| */ 12 | class WikiTitleSerializer extends Serializer[WikiTitle] 13 | { 14 | override def write(kryo: Kryo, output: Output, wikiTitle: WikiTitle) 15 | { 16 | output.writeString(wikiTitle.decoded) 17 | kryo.writeObjectOrNull(output, wikiTitle.language, new LanguageSerializer) 18 | kryo.writeObjectOrNull(output, wikiTitle.namespace, new FieldSerializer(kryo, classOf[Namespace])) 19 | output.writeBoolean(wikiTitle.isInterLanguageLink) 20 | output.writeString(wikiTitle.fragment) 21 | } 22 | 23 | override def read(kryo: Kryo, input: Input, wikiTitleClass: Class[WikiTitle]): WikiTitle = 24 | { 25 | val decoded = input.readString() 26 | val language = kryo.readObjectOrNull(input, classOf[Language], new LanguageSerializer) 27 | val namespace = kryo.readObjectOrNull(input, classOf[Namespace], new FieldSerializer(kryo, classOf[Namespace])) 28 | val isInterLanguageLink = input.readBoolean() 29 | val fragment = input.readString() 30 | new WikiTitle(decoded, namespace, language, isInterLanguageLink, fragment) 31 | } 32 | } 33 | -------------------------------------------------------------------------------- /extraction/src/main/scala/org/dbpedia/extraction/util/DistIOUtils.scala: -------------------------------------------------------------------------------- 1 | package org.dbpedia.extraction.util 2 | 3 | import com.esotericsoftware.kryo.Kryo 4 | import org.dbpedia.extraction.spark.serialize.KryoSerializer 5 | import org.apache.spark.SparkContext 6 | import org.apache.spark.rdd.RDD 7 | import org.apache.hadoop.io.{BytesWritable, NullWritable} 8 | import java.io.{ByteArrayInputStream, ByteArrayOutputStream} 9 | import com.esotericsoftware.kryo.io.{Input, Output} 10 | import org.apache.spark.SparkContext._ 11 | import scala.reflect.ClassTag 12 | import org.apache.hadoop.fs.Path 13 | import org.apache.hadoop.conf.Configuration 14 | import org.apache.hadoop.mapreduce.Job 15 | import org.apache.hadoop.mapreduce.lib.input.{SequenceFileInputFormat, FileInputFormat} 16 | 17 | /** 18 | * Kryo file operations helper methods 19 | */ 20 | object DistIOUtils 21 | { 22 | private val kryo: ThreadLocal[Kryo] = new ThreadLocal[Kryo] 23 | { 24 | override def initialValue = getNewKryo() 25 | } 26 | 27 | /** 28 | * @return returns a thread-local instance of Kryo 29 | */ 30 | def getKryoInstance: Kryo = kryo.get() 31 | 32 | /** 33 | * @return new Kryo instance. 34 | */ 35 | def getNewKryo(): Kryo = KryoSerializer.ser.newKryo() 36 | 37 | /** 38 | * Loads an RDD saved as a SequenceFile containing objects serialized by Kryo, 39 | * with NullWritable keys and BytesWritable values. 40 | * @param sc SparkContext 41 | * @param path String path to existing file. Can be on local file system or HDFS, S3 etc. See Spark docs. 42 | * @return deserialized RDD 43 | */ 44 | def loadRDD[T: ClassTag](sc: SparkContext, rddClass: Class[T], path: String): RDD[T] = 45 | { 46 | val arrayOfRddClass = Class.forName("[L" + rddClass.getName + ";") 47 | val serializedRDD = sc.sequenceFile(path, classOf[NullWritable], classOf[BytesWritable]) 48 | serializedRDD.values.flatMap(x => deserialize(x.getBytes, arrayOfRddClass).asInstanceOf[Array[T]]) 49 | } 50 | 51 | /** 52 | * Loads an RDD saved as a SequenceFile containing objects serialized by Kryo, 53 | * with NullWritable keys and BytesWritable values. 54 | * @param sc SparkContext 55 | * @param path String path to existing file. Can be on local file system or HDFS, S3 etc. See Spark docs. 
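// ----------------------------------------------------------------------------
// [Editor's note: illustrative sketch, not part of the original sources.]
// Typical round trip through the Kryo-encoded SequenceFile helpers; saveRDD is
// defined further below in this file. `sc`, `pages` (an RDD[WikiPage]) and the
// HDFS path are assumptions made for the example.

DistIOUtils.saveRDD(pages, "hdfs:///dbpedia/cache/enwiki-20140614-pages")
val restored = DistIOUtils.loadRDD(sc, classOf[WikiPage], "hdfs:///dbpedia/cache/enwiki-20140614-pages")
// ----------------------------------------------------------------------------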
56 | * @return deserialized RDD 57 | */ 58 | def loadRDD[T: ClassTag](sc: SparkContext, rddClass: Class[T], path: Path): RDD[T] = 59 | { 60 | val arrayOfRddClass = Class.forName("[L" + rddClass.getName + ";") 61 | val conf = new Configuration() 62 | val job = Job.getInstance(conf) 63 | FileInputFormat.addInputPath(job, path) 64 | val updatedConf = job.getConfiguration 65 | val serializedRDD = sc.newAPIHadoopRDD(updatedConf, classOf[SequenceFileInputFormat[NullWritable, BytesWritable]], classOf[NullWritable], classOf[BytesWritable]) 66 | serializedRDD.values.flatMap(x => deserialize(x.getBytes, arrayOfRddClass).asInstanceOf[Array[T]]) 67 | } 68 | 69 | /** 70 | * Saves an RDD as a SequenceFile containing objects serialized by Kryo, 71 | * with NullWritable keys and BytesWritable values. 72 | * @param rdd Spark RDD 73 | * @param path String path to existing file. Can be on local file system or HDFS, S3 etc. See Spark docs. 74 | */ 75 | def saveRDD(rdd: RDD[_ <: AnyRef], path: String) 76 | { 77 | rdd.mapPartitions(iter => iter.grouped(50).map(_.toArray)) 78 | .map(x => (NullWritable.get(), new BytesWritable(serialize(x)))).saveAsSequenceFile(path) 79 | } 80 | 81 | /** 82 | * Saves an RDD as a SequenceFile containing objects serialized by Kryo, 83 | * with NullWritable keys and BytesWritable values. 84 | * @param rdd Spark RDD 85 | * @param path String path to existing file. Can be on local file system or HDFS, S3 etc. See Spark docs. 86 | */ 87 | def saveRDD(rdd: RDD[_ <: AnyRef], path: Path) 88 | { 89 | rdd.mapPartitions(iter => iter.grouped(50).map(_.toArray)) 90 | .map(x => (NullWritable.get(), new BytesWritable(serialize(x)))).saveAsSequenceFile(path.toString) 91 | } 92 | 93 | // TODO: Add unit tests with code similar to: 94 | // /** 95 | // * Temporary method to test if serialization-deserialization works properly 96 | // */ 97 | // def testSerDe(rdd: RDD[_ <: AnyRef], path: String) { 98 | // val serialized = rdd.map(x => (NullWritable.get(), new BytesWritable(serialize(x)))) 99 | // serialized.saveAsSequenceFile(path) 100 | // 101 | // val deserialized : RDD[_ <: AnyRef] = serialized.values.map(x => { 102 | // deserialize(x.getBytes, classOf[WikiPage]).asInstanceOf[WikiPage] 103 | // }) 104 | // 105 | // //Assertions below to test if (de)serialization works properly. 106 | // assert(deserialized.first().toString == rdd.first().toString) 107 | // assert(deserialized.count() == rdd.count()) 108 | // } 109 | // 110 | // /** 111 | // * Temporary method to test if saveAsKryoFile() and openFromKryoFile() work consistently. 
112 | // */ 113 | // def testSaveOpen(sc: SparkContext, rdd: RDD[_ <: WikiPage], path: String) { 114 | // saveRDD(rdd, path) 115 | // val deserialized = loadRDD(sc, path) 116 | // 117 | // //Test to ensure we're saving as many WikiPages as we're retrieving after deserialization 118 | // assert(deserialized.count() == rdd.count()) 119 | // } 120 | 121 | /** 122 | * @param x Any object 123 | * @return serialized Array of Bytes 124 | */ 125 | def serialize(x: Any): Array[Byte] = 126 | { 127 | val stream = new ByteArrayOutputStream() 128 | val output = new Output(stream) 129 | getKryoInstance.writeObject(output, x) 130 | output.close() 131 | stream.toByteArray 132 | } 133 | 134 | /** 135 | * @param x Array of Bytes - serialized version of an object 136 | * @param c Class of the object 137 | * @return the object deserialized by Kryo 138 | */ 139 | def deserialize[T](x: Array[Byte], c: Class[T]) = 140 | { 141 | getKryoInstance.readObject(new Input(new ByteArrayInputStream(x)), c) 142 | } 143 | } -------------------------------------------------------------------------------- /extraction/src/main/scala/org/dbpedia/extraction/util/SparkUtils.scala: -------------------------------------------------------------------------------- 1 | package org.dbpedia.extraction.util 2 | 3 | import org.apache.spark.{Logging, SparkContext, SparkConf} 4 | import org.dbpedia.extraction.dump.extract.DistConfig 5 | import org.apache.log4j.{Logger, Level} 6 | import java.nio.file.{Paths, Files} 7 | import java.io.FileNotFoundException 8 | import scala.reflect.ClassTag 9 | import org.apache.spark.rdd.RDD 10 | import org.dbpedia.extraction.spark.serialize.KryoSerializationWrapper 11 | import org.apache.spark.ui.jobs.DBpediaJobProgressListener 12 | 13 | /** 14 | * Utility functions specific to Spark 15 | */ 16 | object SparkUtils 17 | { 18 | /** 19 | * Stores the SparkContext instance. 20 | */ 21 | private var sc: SparkContext = null 22 | 23 | /** 24 | * Set all loggers to the given log level. Returns a map of the value of every logger 25 | * @param level 26 | * @param loggers 27 | * @return 28 | */ 29 | def setLogLevels(level: org.apache.log4j.Level, loggers: TraversableOnce[String]) = 30 | { 31 | loggers.map 32 | { 33 | loggerName => 34 | val logger = Logger.getLogger(loggerName) 35 | val prevLevel = logger.getLevel() 36 | logger.setLevel(level) 37 | loggerName -> prevLevel 38 | }.toMap 39 | } 40 | 41 | /** 42 | * Sets log levels for Spark and its peripheral libraries to DistConfig.sparkLogLevel. 43 | */ 44 | def setSparkLogLevels(config: DistConfig) = 45 | { 46 | setLogLevels(config.sparkLogLevel, Seq("org.apache", "spark", "org.eclipse.jetty", "akka")) 47 | } 48 | 49 | /** 50 | * Creates and returns a new SparkContext taking configuration info from Config 51 | * @param config 52 | * @return 53 | */ 54 | def getSparkContext(config: DistConfig) = 55 | synchronized 56 | { 57 | if (sc == null) 58 | { 59 | val conf = new SparkConf().setMaster(config.sparkMaster).setAppName(config.sparkAppName) 60 | for ((property, value) <- config.sparkProperties) 61 | conf.set(property, value) 62 | conf.setSparkHome(config.sparkHome) 63 | val distJarName = if (Files.exists(Paths.get("target/extraction-4.1-SNAPSHOT.jar"))) 64 | { 65 | "target/extraction-4.1-SNAPSHOT.jar" 66 | } else if (Files.exists(Paths.get("extraction/target/extraction-4.1-SNAPSHOT.jar"))) 67 | { 68 | "extraction/target/extraction-4.1-SNAPSHOT.jar" 69 | } else 70 | { 71 | throw new FileNotFoundException("extraction-4.1-SNAPSHOT.jar cannot be found in extraction/target. 
Please run mvn install -Dmaven.test.skip=true to build JAR first.") 72 | } 73 | 74 | conf.setJars(List(distJarName)) 75 | conf.set("spark.serializer", "org.apache.spark.serializer.KryoSerializer") 76 | conf.set("spark.kryo.registrator", "org.dbpedia.extraction.spark.serialize.KryoExtractionRegistrator") 77 | conf.set("spark.kryoserializer.buffer.mb", "100") 78 | sc = new SparkContext(conf) 79 | // No logging is done upon omitting 'with Logging' - some package problem? 80 | setLogLevels(Level.INFO, Seq("org.apache.spark.ui.jobs.DBpediaJobProgressListener")) 81 | sc.addSparkListener(new DBpediaJobProgressListener(conf)) 82 | } 83 | sc 84 | } 85 | 86 | /** 87 | * Return an iterator that contains all of the elements in given RDD. 88 | * The iterator will consume as much memory as the largest partition in the RDD. 89 | * 90 | * @param rdd 91 | * @return iterator for rdd's elements 92 | */ 93 | def rddToLocalIterator[T: ClassTag](rdd: RDD[T]): Iterator[T] = 94 | { 95 | def collectPartition(p: Int): Array[T] = 96 | { 97 | sc.runJob(rdd, (iter: Iterator[T]) => iter.toArray, Seq(p), allowLocal = false).head 98 | } 99 | (0 until rdd.partitions.length).iterator.flatMap(i => collectPartition(i)) 100 | } 101 | 102 | /** 103 | * Returns the function object wrapped inside a KryoSerializationWrapper. 104 | * This is useful for having Kryo-serialization for Spark closures. 105 | * 106 | * @param function 107 | * @return 108 | */ 109 | def kryoWrapFunction[T, U](function: (T => U)): (T => U) = 110 | { 111 | def genMapper(kryoWrapper: KryoSerializationWrapper[(T => U)])(input: T): U = 112 | { 113 | kryoWrapper.value.apply(input) 114 | } 115 | 116 | genMapper(KryoSerializationWrapper(function)) _ 117 | } 118 | } 119 | -------------------------------------------------------------------------------- /extraction/src/test/resources/config.properties: -------------------------------------------------------------------------------- 1 | # download and extraction target dir 2 | # This can be a directory on HDFS or a local directory, depending on the Hadoop configuration files given in dist-config.properties 3 | base-dir=src/test/resources/data 4 | # Source file. If source file name ends with .gz or .bz2, it is unzipped on the fly. 5 | # Must exist in the directory xxwiki/yyyymmdd and have the prefix xxwiki-yyyymmdd- 6 | # where xx is the wiki code and yyyymmdd is the dump date. 7 | 8 | # default (prefer multistream bz2 files): 9 | source=pages-articles-multistream.xml.bz2 10 | 11 | # alternatives: 12 | # source=pages-articles.xml.gz 13 | # source=pages-articles.xml 14 | 15 | ###### Extract from part files ###### 16 | # 17 | # Please make sure that the regex actually matches the format used by ALL the wikis you are going to extract from!!!! 18 | # One that should work in all cases is 19 | # source=@pages-articles\\d*\\.xml(-p\\d+p\\d+)?\\.bz2 20 | # 21 | # NOTE: when using the above regex you should make sure you do not have part files AND regular dump files together 22 | # for the same wiki, e.g. frwiki-20131120-pages-articles1.xml.bz2 and frwiki-20131120-pages-articles.xml.bz2, as they 23 | # BOTH will match and that will result in duplicate output data 24 | # 25 | # Example: 26 | # enwiki => enwiki-latest-pages-articles1.xml-p000000010p000010000.bz2 hence @pages-articles\\d+\\.xml-p\\d+p\\d+\\.bz2 matches 27 | # frwiki => frwiki-latest-pages-articles1.xml.bz2 hence @pages-articles\\d+\\.xml\\.bz2 matches (the previous regex does not!) 28 | # commonswiki => it does not have part files! 
This is true for other wikis as well. 29 | # 30 | # source=@pages-articles\\d+\\.xml-p\\d+p\\d+\\.bz2 31 | 32 | # In case of multistream chunks 33 | # source=@pages-articles-multistream\\.xml\\.\\d+\\.bz2 34 | 35 | # use only directories that contain a 'download-complete' file? Default is false. 36 | require-download-complete=false 37 | 38 | # List of languages or article count ranges, e.g. 'en,de,fr' or '10000-20000' or '10000-', or '@mappings' 39 | languages=en 40 | # extractor class names starting with "." are prefixed by "org.dbpedia.extraction.mappings" 41 | 42 | extractors=.ArticleCategoriesExtractor,.ArticleTemplatesExtractor,.CategoryLabelExtractor,\ 43 | .ExternalLinksExtractor,.GeoExtractor,.InterLanguageLinksExtractor,.LabelExtractor,.PageIdExtractor,\ 44 | .PageLinksExtractor,.RedirectExtractor,.RevisionIdExtractor,.ProvenanceExtractor,.SkosCategoriesExtractor,\ 45 | .ArticlePageExtractor 46 | 47 | #extractors=.InfoboxExtractor 48 | 49 | # if ontology and mapping files are not given or do not exist, download info from mappings.dbpedia.org 50 | ontology=ontology.xml 51 | mappings=../mappings 52 | 53 | # Serialization URI policies and file formats. Quick guide: 54 | # uri-policy keys: uri, generic, xml-safe, reject-long 55 | # uri-policy position modifiers: -subjects, -predicates, -objects, -datatypes, -contexts 56 | # uri-policy values: comma-separated languages or '*' for all languages 57 | # format values: n-triples, n-quads, turtle-triples, turtle-quads, trix-triples, trix-quads 58 | # See http://git.io/DBpedia-serialization-format-properties for details. 59 | 60 | # For backwards compatibility, en uses generic URIs. All others use local IRIs. 61 | uri-policy.uri=uri:en; generic:en; xml-safe-predicates:*; reject-long:* 62 | uri-policy.iri=generic:en; xml-safe-predicates:*; reject-long:* 63 | 64 | # NT is unreadable anyway - might as well use URIs for en 65 | format.nt.gz=n-triples;uri-policy.uri 66 | format.nq.gz=n-quads;uri-policy.uri 67 | 68 | # Turtle is much more readable - use nice IRIs for all languages 69 | format.ttl.gz=turtle-triples;uri-policy.iri 70 | format.tql.gz=turtle-quads;uri-policy.iri 71 | -------------------------------------------------------------------------------- /extraction/src/test/resources/data/enwiki/20160407/enwiki-20160407-pages-articles-multistream.xml.bz2: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dbpedia/distributed-extraction-framework/ad039712889000a085dd3d0ab66a15ddde99573d/extraction/src/test/resources/data/enwiki/20160407/enwiki-20160407-pages-articles-multistream.xml.bz2 -------------------------------------------------------------------------------- /extraction/src/test/resources/dist-config.properties: -------------------------------------------------------------------------------- 1 | # The SPARK_HOME environment variable should be set to this, Spark's location 2 | spark-home=/home/user/engine/spark-0.9.1-bin-hadoop2/ 3 | 4 | # Paths to the Hadoop configuration files, if any. These are needed for HDFS. 5 | # hadoop-coresite-xml-path=/home/user/engine/hadoop-2.2.0/etc/hadoop/core-site.xml 6 | # hadoop-hdfssite-xml-path=/home/user/engine/hadoop-2.2.0/etc/hadoop/hdfs-site.xml 7 | # hadoop-mapredsite-xml-path=/home/user/engine/hadoop-2.2.0/etc/hadoop/mapred-site.xml 8 | 9 | # Refer to README.md for advice 10 | spark.executor.memory=2500m 11 | 12 | # Replace local[8] with something like spark://192.168.0.100 to go into distributed mode. 
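# For example, assuming the standalone Spark master runs on 192.168.0.100 and listens on the default port 7077:
# spark-master=spark://192.168.0.100:7077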
13 | spark-master=local[8] 14 | 15 | # When running on a distributed cluster, it is essential that you set spark.cores.max to N * M 16 | # where N = total no. of slave machines, M = SPARK_WORKER_INSTANCES (from spark-env.sh) 17 | # This is to ensure that Spark uses as many cores (over the entire cluster) as many workers there are. 18 | spark.cores.max=8 19 | 20 | # You can add more spark.* variables here. All variables starting with spark. will be provided to SparkConf 21 | 22 | # This is used for setting log levels for "org.apache", "spark", "org.eclipse.jetty" and "akka" using 23 | # SparkUtils.setLogLevels(). It is WARN by default to prevent excessive logging from Spark. 24 | # It is a good idea to set it to INFO while debugging/testing out the framework. 25 | # Refer to org.apache.log4j.Level for more details 26 | # logging-level=INFO 27 | 28 | # WARNING: If base-dir is set here, the base-dir in config.properties (the original DBpedia extraction configuration) is ignored. 29 | # base-dir=/data 30 | 31 | # Please refer to the source code for org.dbpedia.extraction.dump.extract.DistConfig for the complete set of configuration variables 32 | # TODO: Add info on all configuration variables here. -------------------------------------------------------------------------------- /extraction/src/test/scala/org/dbpedia/extraction/mappings/DistRedirectsTest.scala: -------------------------------------------------------------------------------- 1 | package org.dbpedia.extraction.mappings 2 | 3 | import org.junit.Assert._ 4 | import org.dbpedia.extraction.sources.{Source, XMLSource, WikiPage} 5 | import org.apache.spark.rdd.RDD 6 | import org.dbpedia.extraction.util._ 7 | import java.io.File 8 | import org.dbpedia.extraction.wikiparser.Namespace 9 | import org.dbpedia.extraction.dump.extract.{Config, DistConfig} 10 | import org.dbpedia.extraction.dump.download.Download 11 | import org.dbpedia.extraction.util.RichFile.wrapFile 12 | import org.scalatest.FunSuite 13 | import org.scalatest.junit.JUnitRunner 14 | import org.junit.runner.RunWith 15 | import org.apache.hadoop.conf.Configuration 16 | import org.apache.hadoop.fs.Path 17 | import org.dbpedia.extraction.util.RichHadoopPath.wrapPath 18 | 19 | /** 20 | * Unit Test for the DistRedirects class. 21 | * 22 | * This test expects a DBpedia extraction configuration properties file named "config.properties" and a distributed 23 | * framework configuration file named "dist-config.properties" to be present at the test/resources directory. 24 | * 25 | * It's better to use a small data dump like the liwiki dump to run the test. 26 | * 27 | * TODO: Add some wiki dump content to test upon rather than rely upon an external wiki dump file and config files. 
28 | */ 29 | @RunWith(classOf[JUnitRunner]) 30 | class DistRedirectsTest extends FunSuite 31 | { 32 | val CONFIG_FILE = "config.properties" 33 | val SPARK_CONFIG_FILE = "dist-config.properties" 34 | 35 | // Fixtures shared between all tests in this class 36 | val (distConfig: DistConfig, 37 | articleSource: Source, 38 | rdd: RDD[WikiPage], 39 | language: Language, 40 | date: String, 41 | distFinder: Finder[Path]) = try 42 | { 43 | val configFileResource = getClass.getClassLoader.getResource(CONFIG_FILE) 44 | val sparkConfigFileResource = getClass.getClassLoader.getResource(SPARK_CONFIG_FILE) 45 | 46 | //Check if the wiki-pages file and config.properties file are present 47 | assertNotNull("Test file %s missing from distributed/src/test/resources".format(CONFIG_FILE), configFileResource) 48 | assertNotNull("Test file %s missing from distributed/src/test/resources".format(SPARK_CONFIG_FILE), sparkConfigFileResource) 49 | 50 | val configProperties = ConfigUtils.loadConfig(configFileResource.toURI.getPath, "UTF-8") 51 | val distConfigProperties = ConfigUtils.loadConfig(sparkConfigFileResource.toURI.getPath, "UTF-8") 52 | val config = new Config(configProperties) 53 | val distConfig = new DistConfig(distConfigProperties, configProperties, configFileResource.toURI) 54 | implicit val hadoopConfiguration = distConfig.hadoopConf 55 | val lang = config.extractorClasses.iterator.next()._1 56 | 57 | val localFinder = new Finder[File](config.dumpDir, lang, config.wikiName) 58 | val distFinder = new Finder[Path](distConfig.dumpDir.get, lang, config.wikiName) 59 | val date = latestDate(config, localFinder) 60 | 61 | // Get the readers for the test dump files 62 | val articlesReaders = files(config.source, localFinder, date).map(x => () => IOUtils.reader(x)) 63 | 64 | // Get the article source for Redirects to load from 65 | val articleSource = XMLSource.fromReaders(articlesReaders, lang, 66 | title => title.namespace == Namespace.Main || title.namespace == Namespace.File || 67 | title.namespace == Namespace.Category || title.namespace == Namespace.Template) 68 | 69 | SparkUtils.setSparkLogLevels(distConfig) 70 | val sc = SparkUtils.getSparkContext(distConfig) 71 | // Generate RDD from the article source for DistRedirects to load from in parallel 72 | // Naively calls toArray on Seq, only for testing 73 | val rdd = sc.parallelize(articleSource.toSeq, 8) 74 | (distConfig, articleSource, rdd, lang, date, distFinder) 75 | } catch{ case ex:Exception => ex.printStackTrace(); (null, null,null, null,null, null)} 76 | 77 | implicit def hadoopConfiguration: Configuration = distConfig.hadoopConf 78 | 79 | test("Verify DistRedirects.loadFromRDD output") 80 | { 81 | val distRedirects = DistRedirects.loadFromRDD(rdd, language) 82 | val redirects = Redirects.loadFromSource(articleSource, language) 83 | assertEquals("Testing DistRedirects.loadFromRDD failed!", redirects.map, distRedirects.map) 84 | } 85 | 86 | test("Verify DistRedirects.load output") 87 | { 88 | val cache = distFinder.file(date, "template-redirects.obj") 89 | var distRedirects = DistRedirects.load(rdd, cache, language) 90 | var redirects = Redirects.loadFromSource(articleSource, language) 91 | assertEquals("Testing DistRedirects.loadFromRDD failed!", redirects.map, distRedirects.map) 92 | 93 | // Try again so that cache gets used 94 | distRedirects = DistRedirects.load(rdd, cache, language) 95 | redirects = Redirects.loadFromSource(articleSource, language) 96 | assertEquals("Testing DistRedirects.loadFromRDD failed!", redirects.map, distRedirects.map) 
97 | } 98 | 99 | // Taken from org.dbpedia.extraction.dump.extract.Config 100 | def latestDate(config: Config, finder: Finder[_]): String = 101 | { 102 | val isSourceRegex = config.source.startsWith("@") 103 | val source = if (isSourceRegex) config.source.substring(1) else config.source 104 | val fileName = if (config.requireComplete) Download.Complete else source 105 | finder.dates(fileName, isSuffixRegex = isSourceRegex).last 106 | } 107 | 108 | // Taken from org.dbpedia.extraction.dump.extract.Config 109 | def files(source: String, finder: Finder[File], date: String): List[File] = 110 | { 111 | 112 | val files = if (source.startsWith("@")) 113 | { 114 | // the articles source is a regex - we want to match multiple files 115 | finder.matchFiles(date, source.substring(1)) 116 | } else List(finder.file(date, source)) 117 | 118 | files 119 | } 120 | } -------------------------------------------------------------------------------- /extraction/src/test/scala/org/dbpedia/extraction/spark/io/QuadSeqWritableTest.scala: -------------------------------------------------------------------------------- 1 | package org.dbpedia.extraction.spark.io 2 | 3 | import org.junit.runner.RunWith 4 | import org.scalatest.junit.JUnitRunner 5 | import org.scalatest.FunSuiteLike 6 | import org.dbpedia.extraction.destinations.Quad 7 | import scala.util.Random 8 | import org.junit.Assert._ 9 | 10 | @RunWith(classOf[JUnitRunner]) 11 | class QuadSeqWritableTest extends WritableTest[QuadSeqWritable] with FunSuiteLike 12 | { 13 | test("Verify that serialization-deserialization works properly") 14 | { 15 | // Create random List[Quad] 16 | val sampleQuads = for (i <- (0 until 100).toList) yield new Quad(Random.nextString(10), 17 | Random.nextString(10), 18 | Random.nextString(10), 19 | Random.nextString(10), 20 | Random.nextString(10), 21 | Random.nextString(10), 22 | Random.nextString(10)) 23 | 24 | val writable1 = new QuadSeqWritable(sampleQuads) 25 | val writable2 = new QuadSeqWritable() 26 | 27 | performReadWriteRoundTrip(writable1, writable2) 28 | assertEquals(writable1.get, writable2.get) 29 | } 30 | } 31 | -------------------------------------------------------------------------------- /extraction/src/test/scala/org/dbpedia/extraction/spark/io/WikiPageWritableTest.scala: -------------------------------------------------------------------------------- 1 | package org.dbpedia.extraction.spark.io 2 | 3 | import org.junit.Assert._ 4 | import org.junit.runner.RunWith 5 | import org.scalatest.junit.JUnitRunner 6 | import org.scalatest.FunSuiteLike 7 | import org.dbpedia.extraction.sources.XMLSource 8 | import scala.xml.XML 9 | import org.dbpedia.extraction.util.Language 10 | 11 | @RunWith(classOf[JUnitRunner]) 12 | class WikiPageWritableTest extends WritableTest[WikiPageWritable] with FunSuiteLike 13 | { 14 | test("Verify that serialization-deserialization works properly") 15 | { 16 | val samplePage = 17 | """ 18 | | 19 | | Lèmburg 20 | | 0 21 | | 13 22 | | 23 | | 24 | | 196988 25 | | 5980 26 | | 2010-01-25T20:24:26Z 27 | | 28 | | PahlesBot 29 | | 458 30 | | 31 | | 32 | | Bot: automatisch tekst vervangen (-#redirect +#REDIRECT) 33 | | #REDIRECT [[Limburg]] 34 | | 2uewphqvpum37i9d7g5okf5c3m643c7 35 | | wikitext 36 | | text/x-wiki 37 | | 38 | | 39 | """.stripMargin 40 | 41 | val wikiPage = XMLSource.fromXML(XML.loadString("" + samplePage + ""), Language("li")).head 42 | val writable1 = new WikiPageWritable(wikiPage) 43 | val writable2 = new WikiPageWritable() 44 | 45 | performReadWriteRoundTrip(writable1, writable2) 46 | 
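// Compare string representations to check that the round trip preserved the page's contents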
assertEquals(writable1.get.toString, writable2.get.toString) 47 | } 48 | } 49 | -------------------------------------------------------------------------------- /extraction/src/test/scala/org/dbpedia/extraction/spark/io/WritableTest.scala: -------------------------------------------------------------------------------- 1 | package org.dbpedia.extraction.spark.io 2 | 3 | import org.apache.hadoop.io.Writable 4 | import java.io.{ByteArrayInputStream, DataInputStream, DataOutputStream, ByteArrayOutputStream} 5 | 6 | abstract class WritableTest[T <: Writable] 7 | { 8 | /** 9 | * Utility method that takes two Writables as parameters, writes the first Writable to a byte 10 | * array and reads it back into the second Writable. 11 | * 12 | * @param oldWritable Writable to be serialized and deserialized again 13 | * @param newWritable Writable that oldWritable is deserialized into after serialization. 14 | */ 15 | def performReadWriteRoundTrip(oldWritable: T, newWritable: T) = 16 | { 17 | val bos = new ByteArrayOutputStream 18 | val dos = new DataOutputStream(bos) 19 | oldWritable.write(dos) 20 | newWritable.readFields(new DataInputStream(new ByteArrayInputStream(bos.toByteArray))) 21 | } 22 | } 23 | -------------------------------------------------------------------------------- /gce/README.md: -------------------------------------------------------------------------------- 1 | Spark GCE 2 | ========= 3 | 4 | Spark GCE is like Spark EC2, but for those who run their cluster on Google Cloud. 5 | 6 | - Make sure you have installed and authenticated gcutils on the machine where you run this script. 7 | - Helps you launch a Spark cluster in the Google Cloud 8 | - Attaches a 100GB empty disk to all nodes in the cluster 9 | - Installs and configures Spark and HDFS automatically 10 | - Starts the Shark server automatically 11 | 12 | Spark GCE is a Python script that helps you launch a Spark cluster in the Google Cloud, much like the spark_ec2 script does for AWS. 13 | 14 | Usage 15 | ----- 16 | 17 | > ***spark_gce.py project-name number-of-slaves slave-type master-type identity-file zone cluster-name spark-mem workers-per-node cores-per-worker local-log-dir*** 18 | > 19 | >> 20 | >> - **project-name**: Name of the project where you are going to launch your Spark cluster. 21 | >> 22 | >> - **number-of-slaves**: Number of slaves that you want to launch. 23 | >> 24 | >> - **slave-type**: Instance type for the slave machines. 25 | >> 26 | >> - **master-type**: Instance type for the master node. 27 | >> 28 | >> - **identity-file**: Identity file to authenticate with your GCE instances. It usually resides at *~/.ssh/google_compute_engine* once you authenticate using gcutils. 29 | >> 30 | >> - **zone**: Specify the zone where you are going to launch the cluster. 31 | >> 32 | >> - **cluster-name**: Name of the cluster that you are going to launch. 33 | >> 34 | >> - **spark-mem**: Amount of memory per Spark worker (as a JVM memory string, e.g. 2500m, 2g) 35 | >> 36 | >> - **workers-per-node**: Number of workers to run on each slave node 37 | >> 38 | >> - **cores-per-worker**: Number of cores each worker should use (optional, 1 by default) 39 | >> 40 | >> - **local-log-dir**: A local directory to download nmon logs from all the nodes into (optional; empty by default, i.e. no logging) 41 | >> 42 | > 43 | > ***spark_gce.py project-name cluster-name [identity-file local-log-dir] destroy*** 44 | > 45 | >> - **project-name**: Name of the project where the Spark cluster is running.
46 | >> - **cluster-name**: Name of the cluster that you are going to destroy. 47 | >> - **NOTE**: If you had specified a local-log-dir while starting the cluster, provide it here too, along with the identity-file, else skip both. 48 | 49 | 50 | Installation 51 | -------------- 52 | 53 | ```sh 54 | git clone git@github.com:dbpedia/distributed-extraction-framework.git 55 | cd gce 56 | python spark_gce.py 57 | ``` 58 | -------------------------------------------------------------------------------- /install-run: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Shortcut for 4 | # mvn -f ../pom.xml install && mvn scala:run -Dlauncher=... -DaddArgs=... 5 | # Must be called with one of the modules (core/, dump/, ...) as current directory. 6 | # Example: 7 | # extraction_framework/core> ../install-run LAUNCHER ARG1 ARG2 ARG3 8 | # is equivalent to 9 | # extraction_framework/core> mvn -f ../pom.xml install && mvn scala:run "-Dlauncher=LAUNCHER" "-DaddArgs=ARG1|ARG2|ARG3" 10 | 11 | # if we're not on a terminal, use batch mode to avoid ugly log files 12 | [ ! -t 1 ] && BATCH="-B" 13 | mvn $BATCH -f ../pom.xml install && . ../run "$@" 14 | -------------------------------------------------------------------------------- /pom.xml: -------------------------------------------------------------------------------- 1 | 2 | 4 | 5 | 4.0.0 6 | 7 | org.dbpedia 8 | distributed-extraction 9 | pom 10 | 4.1-SNAPSHOT 11 | Parent POM of the DBpedia Distributed Extraction Framework 12 | 13 | 14 | UTF-8 15 | 1.7 16 | 2.11.4 17 | 2.2.0 18 | 1.3.0 19 | 2.11 20 | 2.2.4 21 | -Xmx1024m 22 | 23 | 24 | 25 | 26 | extraction 27 | download 28 | common 29 | 30 | 31 | 32 | 33 | 34 | 35 | net.alchim31.maven 36 | scala-maven-plugin 37 | 3.1.6 38 | 39 | 40 | org.apache.maven.plugins 41 | maven-compiler-plugin 42 | 3.1 43 | 44 | 45 | 46 | 47 | 48 | 49 | 53 | 54 | 55 | net.alchim31.maven 56 | scala-maven-plugin 57 | 58 | 59 | 60 | -unchecked 61 | -deprecation 62 | -feature 63 | 64 | 65 | ${scala.compiler.Xmx} 66 | 67 | 68 | 69 | 70 | 71 | compile 72 | 73 | compile 74 | 75 | compile 76 | 77 | 78 | 79 | test-compile 80 | 81 | testCompile 82 | 83 | test-compile 84 | 85 | 86 | 87 | process-resources 88 | 89 | compile 90 | 91 | process-resources 92 | 93 | 94 | 95 | 96 | 97 | 98 | org.apache.maven.plugins 99 | maven-compiler-plugin 100 | 101 | ${java.version} 102 | ${java.version} 103 | 104 | 105 | 106 | 107 | maven-enforcer-plugin 108 | 1.3.1 109 | 110 | 111 | 112 | enforce 113 | 114 | 115 | 116 | 117 | ${java.version} 118 | 119 | 120 | 121 | 122 | 123 | 124 | 125 | 126 | 127 | 128 | 129 | 130 | 131 | org.scala-lang 132 | scala-library 133 | ${scala.version} 134 | 135 | 136 | 137 | 138 | org.scala-lang 139 | scala-actors 140 | ${scala.version} 141 | 142 | 143 | 144 | 145 | org.scala-lang 146 | scala-reflect 147 | ${scala.version} 148 | 149 | 150 | 151 | 152 | org.scalatest 153 | scalatest_${scalatest.scala.version} 154 | ${scalatest.version} 155 | test 156 | 157 | 158 | 159 | 160 | 161 | 162 | 163 | 164 | 165 | 166 | 167 | 168 | 169 | 170 | 171 | 172 | 173 | 174 | 175 | incremental 176 | 177 | 178 | 179 | net.alchim31.maven 180 | scala-maven-plugin 181 | 182 | 183 | 184 | incremental 185 | 188 | true 189 | 190 | 191 | 192 | 193 | 194 | 195 | 196 | -------------------------------------------------------------------------------- /run: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Shortcut for mvn scala:run -Dlauncher=... 
-DaddArgs=... 4 | # Must be called with one of the modules (core/, dump/, ...) as current directory. 5 | # Example: 6 | # extraction_framework/core> ../run LAUNCHER ARG1 ARG2 ARG3 7 | # is equivalent to 8 | # extraction_framework/core> mvn scala:run "-Dlauncher=LAUNCHER" "-DaddArgs=ARG1|ARG2|ARG3" 9 | 10 | LAUNCHER="$1" 11 | 12 | ADD_ARGS="$2" 13 | for ARG in ${@:3} 14 | do 15 | ADD_ARGS="$ADD_ARGS|$ARG" 16 | done 17 | 18 | # export MAVEN_OPTS='-Xmx4096M -XX:MaxPermSize=1024M -XX:+HeapDumpOnOutOfMemoryError -XX:+PrintGC -XX:+PrintGCTimeStamps' 19 | # export MAVEN_DEBUG='-X -e' 20 | 21 | # if we're not on a terminal, use batch mode to avoid ugly log files 22 | [ ! -t 1 ] && BATCH="-B" 23 | mvn $MAVEN_DEBUG $BATCH scala:run "-Dlauncher=$LAUNCHER" "-DaddArgs=$ADD_ARGS" 24 | -------------------------------------------------------------------------------- /run-extraction-test: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Performs both normal sequential extraction and distributed extraction and compares the outputs. 4 | 5 | CONFIG_FILE="$1" 6 | SPARK_CONF_FILE="$2" 7 | 8 | echo "====================================================================" 9 | echo "Running sequential extraction" 10 | echo "====================================================================" 11 | ./run seq-extraction $CONFIG_FILE 12 | mkdir /tmp/dbpedia-test-seq-extraction 13 | mv `grep base-dir $CONFIG_FILE | sed -ne 's/^base-dir=//p'`/*wiki/*/*wiki*.gz /tmp/dbpedia-test-seq-extraction 14 | 15 | echo "====================================================================" 16 | echo "Running distributed extraction" 17 | echo "====================================================================" 18 | ./run extraction $CONFIG_FILE $SPARK_CONF_FILE 19 | mkdir /tmp/dbpedia-test-par-extraction 20 | cp -rf `grep base-dir $CONFIG_FILE | sed -ne 's/^base-dir=//p'`/*wiki/*/*wiki*.gz /tmp/dbpedia-test-par-extraction/ 21 | 22 | echo "====================================================================" 23 | echo "Computing diffs:" 24 | echo "====================================================================" 25 | diffs=`diff <(gzip -dc /tmp/dbpedia-test-seq-extraction/*.gz | grep -v "^#" | sort) <(gzip -dc /tmp/dbpedia-test-par-extraction/*wiki*.gz/part*.gz | grep -v "^#" | sort)` 26 | if [ -z "$diffs" ]; then 27 | echo "Outputs match!" 28 | else 29 | echo $diffs 30 | fi 31 | 32 | 33 | rm -rf /tmp/dbpedia-test-???-extraction 34 | --------------------------------------------------------------------------------