├── .gitignore ├── LICENSE ├── README.md ├── project └── plugins.sbt ├── sbin ├── download-dbpedia.sh ├── import-delete.sh └── import-merge-export.sh ├── simple.sbt └── src └── main └── scala ├── Configuration.scala └── DBpediaImporter.scala /.gitignore: -------------------------------------------------------------------------------- 1 | *.class 2 | *.log 3 | 4 | # sbt specific 5 | .cache/ 6 | .history/ 7 | .lib/ 8 | dist/* 9 | target/ 10 | lib_managed/ 11 | src_managed/ 12 | project/boot/ 13 | project/plugins/project/ 14 | 15 | # Scala-IDE specific 16 | .scala_dependencies 17 | .worksheet 18 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. 
For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. 
You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. 
You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "{}" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright {yyyy} {name of copyright owner} 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 202 | 203 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | 2 | Download DBpedia Neo4j Database Files 3 | ---------------- 4 | 5 | The results of the code used to import DBpedia into Neo4j are available as a Neo4j data store (ex. 
`path/to/neo4j/data/graph.db`):
6 | 
7 | * https://s3-us-west-1.amazonaws.com/neo4j-sample-datasets/dbpedia/dbpedia-store.tar.bz2
8 | 
9 | Extract the `graph.db` folder into your Neo4j `data` folder and make sure your configuration allows store upgrades (e.g. `allow_store_upgrade=true` in `conf/neo4j.properties`).
10 | 
11 | Import DBpedia into Neo4j
12 | ======================
13 | 
14 | This is a Spark application written in Scala that processes the flat-file RDF dumps of DBpedia.org and generates CSV files
15 | that are then used to build the Neo4j data store files.
16 | 
17 | ## File inputs
18 | 
19 | DBpedia URI mapped to Wikipedia URI:
20 | 
21 | Download: http://data.dws.informatik.uni-mannheim.de/dbpedia/2014/en/wikipedia_links_en.nt.bz2
22 | File size: bzip2 compressed archive (261 MB)
23 | Header: DBPEDIA_RESOURCE_URI, RDF_TYPE, WIKIPEDIA_PAGE_URI
24 | 
25 | Wikipedia page link graph:
26 | 
27 | Download: http://data.dws.informatik.uni-mannheim.de/dbpedia/2014/en/page_links_en.nt.bz2
28 | File size: bzip2 compressed archive (1.2 GB)
29 | Header: DBPEDIA_RESOURCE_SRC, RDF_TYPE, DBPEDIA_RESOURCE_DST
30 | 
31 | Page titles mapped to DBpedia URI:
32 | 
33 | Download: http://data.dws.informatik.uni-mannheim.de/dbpedia/2014/en/labels_en.nt.bz2
34 | File size: bzip2 compressed archive (155 MB)
35 | Header: DBPEDIA_RESOURCE_URI, RDF_TYPE, PAGE_NAME
36 | 
37 | DBpedia categories mapped to pages:
38 | 
39 | Download: http://data.dws.informatik.uni-mannheim.de/dbpedia/2014/en/article_categories_en.nt.bz2
40 | File size: bzip2 compressed archive (178 MB)
41 | Header: DBPEDIA_RESOURCE_URI, RDF_TYPE, DBPEDIA_CATEGORY_URI
42 | 
43 | DBpedia ontology mapped to pages:
44 | 
45 | Download: http://data.dws.informatik.uni-mannheim.de/dbpedia/2014/en/instance_types_en.nt.bz2
46 | File size: bzip2 compressed archive (117 MB)
47 | Header: DBPEDIA_RESOURCE_URI, RDF_TYPE, DBPEDIA_ONTOLOGY_URI
48 | 
49 | File outputs
50 | ------------
51 | 
52 | These file outputs are created as partitioned directories in HDFS. Each node file represents a node record in Neo4j with its properties and label. Each relationship file represents a relationship record that connects two nodes by their `id` property.
53 | 
54 | HDFS dir: /pagenodes
55 | Header: dbpedia, id, l:label, wikipedia, title
56 | 
57 | HDFS dir: /pagerels
58 | Header: start, end, type
59 | 
60 | HDFS dir: /categorynodes
61 | Header: dbpedia, id, l:label
62 | 
63 | HDFS dir: /categoryrels
64 | Header: start, end, type
65 | 
66 | HDFS dir: /ontologynodes
67 | Header: dbpedia, id, l:label, wikipedia, title
68 | 
69 | HDFS dir: /ontologyrels
70 | Header: start, end, type
71 | 
72 | HDFS file merge
73 | ---------------
74 | 
75 | To export the partitioned results from Hadoop 1.0.4, you can run the following
76 | HDFS file system commands from the $HADOOP_HOME directory.
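Each command below concatenates the `part-*` files for one output directory and writes the result back to HDFS as a single CSV file. As an aside (an assumed alternative, not what `sbin/import-merge-export.sh` does), `hadoop fs -getmerge` can combine the partitions and write the merged file straight to the local file system in one step:

    bin/hadoop fs -getmerge /pagenodes ~/neo4j-batch-importer/page_nodes.csv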
77 | 
78 | File name: page_nodes.csv
79 | Command: bin/hadoop fs -cat "/pagenodes/part-*" | bin/hadoop fs -put - /page_nodes.csv
80 | 
81 | File name: page_rels.csv
82 | Command: bin/hadoop fs -cat "/pagerels/part-*" | bin/hadoop fs -put - /page_rels.csv
83 | 
84 | File name: category_nodes.csv
85 | Command: bin/hadoop fs -cat "/categorynodes/part-*" | bin/hadoop fs -put - /category_nodes.csv
86 | 
87 | File name: category_rels.csv
88 | Command: bin/hadoop fs -cat "/categoryrels/part-*" | bin/hadoop fs -put - /category_rels.csv
89 | 
90 | File name: ontology_nodes.csv
91 | Command: bin/hadoop fs -cat "/ontologynodes/part-*" | bin/hadoop fs -put - /ontology_nodes.csv
92 | 
93 | File name: ontology_rels.csv
94 | Command: bin/hadoop fs -cat "/ontologyrels/part-*" | bin/hadoop fs -put - /ontology_rels.csv
95 | 
96 | HDFS file export
97 | ----------------
98 | 
99 | To copy the CSV files off HDFS onto your local file system, run the following commands:
100 | 
101 | bin/hadoop fs -copyToLocal /page_nodes.csv ~/neo4j-batch-importer/page_nodes.csv
102 | bin/hadoop fs -copyToLocal /page_rels.csv ~/neo4j-batch-importer/page_rels.csv
103 | bin/hadoop fs -copyToLocal /category_nodes.csv ~/neo4j-batch-importer/category_nodes.csv
104 | bin/hadoop fs -copyToLocal /category_rels.csv ~/neo4j-batch-importer/category_rels.csv
105 | bin/hadoop fs -copyToLocal /ontology_nodes.csv ~/neo4j-batch-importer/ontology_nodes.csv
106 | bin/hadoop fs -copyToLocal /ontology_rels.csv ~/neo4j-batch-importer/ontology_rels.csv
107 | 
108 | License
109 | ----------------
110 | 
111 | Apache License, Version 2.0
112 | 
--------------------------------------------------------------------------------
/project/plugins.sbt:
--------------------------------------------------------------------------------
1 | logLevel := Level.Warn
--------------------------------------------------------------------------------
/sbin/download-dbpedia.sh:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bash
2 | 
3 | wget http://data.dws.informatik.uni-mannheim.de/dbpedia/2014/en/article_categories_en.nt.bz2
4 | wget http://data.dws.informatik.uni-mannheim.de/dbpedia/2014/en/labels_en.nt.bz2
5 | wget http://data.dws.informatik.uni-mannheim.de/dbpedia/2014/en/page_links_en.nt.bz2
6 | wget http://data.dws.informatik.uni-mannheim.de/dbpedia/2014/en/wikipedia_links_en.nt.bz2
7 | wget http://data.dws.informatik.uni-mannheim.de/dbpedia/2014/en/skos_categories_en.nt.bz2
8 | wget http://data.dws.informatik.uni-mannheim.de/dbpedia/2014/en/instance_types_en.nt.bz2
9 | 
10 | bzip2 -d wikipedia_links_en.nt.bz2
11 | bzip2 -d labels_en.nt.bz2
12 | bzip2 -d page_links_en.nt.bz2
13 | bzip2 -d article_categories_en.nt.bz2
14 | bzip2 -d skos_categories_en.nt.bz2
15 | bzip2 -d instance_types_en.nt.bz2
16 | 
17 | /root/ephemeral-hdfs/bin/hadoop fs -copyFromLocal /data/wikipedia_links_en.nt /wikipedia_links_en.nt
18 | /root/ephemeral-hdfs/bin/hadoop fs -copyFromLocal /data/labels_en.nt /labels_en.nt
19 | /root/ephemeral-hdfs/bin/hadoop fs -copyFromLocal /data/page_links_en.nt /page_links_en.nt
20 | /root/ephemeral-hdfs/bin/hadoop fs -copyFromLocal /data/article_categories_en.nt /article_categories_en.nt
21 | /root/ephemeral-hdfs/bin/hadoop fs -copyFromLocal /data/skos_categories_en.nt /skos_categories_en.nt
22 | /root/ephemeral-hdfs/bin/hadoop fs -copyFromLocal /data/instance_types_en.nt /instance_types_en.nt
23 | 
--------------------------------------------------------------------------------
/sbin/import-delete.sh:
-------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | ./hadoop fs -rmr /pagenodes 4 | ./hadoop fs -rmr /pagerels 5 | ./hadoop fs -rmr /categorynodes 6 | ./hadoop fs -rmr /categoryrels 7 | ./hadoop fs -rmr /*.csv 8 | ./hadoop fs -rmr /categoryrels-stage 9 | ./hadoop fs -rmr /ontologynodes 10 | ./hadoop fs -rmr /ontologyrels-stage 11 | ./hadoop fs -rmr /ontologyrels 12 | -------------------------------------------------------------------------------- /sbin/import-merge-export.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | ./hadoop fs -cat "/pagenodes/part-*" | ./hadoop fs -put - /page_nodes.csv 4 | ./hadoop fs -cat "/pagerels/part-*" | ./hadoop fs -put - /page_rels.csv 5 | ./hadoop fs -cat "/categorynodes/part-*" | ./hadoop fs -put - /category_nodes.csv 6 | ./hadoop fs -cat "/categoryrels/part-*" | ./hadoop fs -put - /category_rels.csv 7 | ./hadoop fs -cat "/ontologynodes/part-*" | ./hadoop fs -put - /ontology_nodes.csv 8 | ./hadoop fs -cat "/ontologyrels/part-*" | ./hadoop fs -put - /ontology_rels.csv 9 | 10 | rm ~/neo4j-batch-importer/*_*.csv 11 | 12 | ./hadoop fs -copyToLocal /page_nodes.csv ~/neo4j-batch-importer/page_nodes.csv 13 | ./hadoop fs -copyToLocal /page_rels.csv ~/neo4j-batch-importer/page_rels.csv 14 | ./hadoop fs -copyToLocal /category_nodes.csv ~/neo4j-batch-importer/category_nodes.csv 15 | ./hadoop fs -copyToLocal /category_rels.csv ~/neo4j-batch-importer/category_rels.csv 16 | ./hadoop fs -copyToLocal /ontology_nodes.csv ~/neo4j-batch-importer/ontology_nodes.csv 17 | ./hadoop fs -copyToLocal /ontology_rels.csv ~/neo4j-batch-importer/ontology_rels.csv 18 | -------------------------------------------------------------------------------- /simple.sbt: -------------------------------------------------------------------------------- 1 | name := "dbpedia-neo4j-importer" 2 | 3 | version := "1.0" 4 | 5 | scalaVersion := "2.10.4" 6 | 7 | libraryDependencies += "org.apache.spark" %% "spark-core" % "1.1.0" 8 | 9 | libraryDependencies += "org.apache.hadoop" % "hadoop-client" % "1.0.4" 10 | 11 | resolvers += "Akka Repository" at "http://repo.akka.io/releases/" 12 | -------------------------------------------------------------------------------- /src/main/scala/Configuration.scala: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright (C) 2014 Kenny Bastani 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except 5 | * in compliance with the License. You may obtain a copy of the License at 6 | * 7 | * http://www.apache.org/licenses/LICENSE-2.0 8 | * 9 | * Unless required by applicable law or agreed to in writing, software distributed under the License 10 | * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express 11 | * or implied. See the License for the specific language governing permissions and limitations under 12 | * the License. 
13 | */ 14 | object Configuration { 15 | 16 | def HDFS_HOST: String = "hdfs://localhost:9000/" 17 | 18 | def PRIMARY_TOPIC_URL: String = "" 19 | 20 | def RDF_LABEL_URL: String = "" 21 | 22 | def RDF_ONTOLOGY_URL: String = "" 23 | 24 | def RDF_CATEGORY_URL: String = "" 25 | 26 | def CATEGORY_SKOS_URL: String = "" 27 | 28 | def WIKI_PAGE_LINK_URL: String = "http://dbpedia.org/ontology/wikiPageWikiLink" 29 | 30 | def EXCLUDE_FILE_PATTERN: String = "http://dbpedia.org/resource/File:" 31 | 32 | def EXCLUDE_CATEGORY_PATTERN: String = "http://dbpedia.org/resource/Category:" 33 | 34 | def WIKI_LINKS_FILE_NAME: String = HDFS_HOST + "wikipedia_links_en.nt" 35 | 36 | def WIKI_NAMES_FILE_NAME: String = HDFS_HOST + "labels_en.nt" 37 | 38 | def PAGE_LINKS_FILE_NAME: String = HDFS_HOST + "page_links_en.nt" 39 | 40 | def CATEGORIES_FILE_NAME: String = HDFS_HOST + "article_categories_en.nt" 41 | 42 | def CATEGORY_SKOS_FILE_NAME: String = HDFS_HOST + "skos_categories_en.nt" 43 | 44 | def INSTANCE_TYPES_FILE_NAME: String = HDFS_HOST + "instance_types_en.nt" 45 | 46 | def PAGE_NODES_CSV_HEADER: String = "dbpedia\tid\tl:label\twikipedia\ttitle"; 47 | 48 | def CATEGORY_NODES_CSV_HEADER: String = "id\tl:label\tdbpedia\ttitle"; 49 | 50 | def ONTOLOGY_NODES_CSV_HEADER: String = "id\tl:label\tdbpedia\ttitle"; 51 | 52 | def PAGE_LINKS_CSV_HEADER: String = "start\tend\ttype"; 53 | } 54 | -------------------------------------------------------------------------------- /src/main/scala/DBpediaImporter.scala: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright (C) 2014 Kenny Bastani 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except 5 | * in compliance with the License. You may obtain a copy of the License at 6 | * 7 | * http://www.apache.org/licenses/LICENSE-2.0 8 | * 9 | * Unless required by applicable law or agreed to in writing, software distributed under the License 10 | * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express 11 | * or implied. See the License for the specific language governing permissions and limitations under 12 | * the License. 13 | */ 14 | 15 | import java.net.URLDecoder 16 | 17 | import org.apache.spark.SparkContext._ 18 | import org.apache.spark.rdd.RDD 19 | import org.apache.spark.{SparkConf, SparkContext} 20 | 21 | import scala.util.Try 22 | 23 | /** 24 | * This is a Spark application that processes flat file RDF dumps of DBpedia.org and generates CSV files 25 | * that are used to generate Neo4j data store files. 26 | */ 27 | object DBpediaImporter { 28 | 29 | // This requires at least 50gb of system memory to run. You've been warned. Use EC2. 
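// Hedged note (not from the original source): "total-executor-cores" and "driver-memory"
// are spark-submit flag names rather than SparkConf keys, so the two .set(...) calls below
// that use them are silently ignored; the corresponding configuration keys are
// "spark.cores.max" and "spark.driver.memory". Driver memory also cannot be raised after the
// JVM has already started, so for a real run it is safer to pass it on the command line, e.g.:
//   spark-submit --class DBpediaImporter --master local[8] --driver-memory 50g \
//     target/scala-2.10/dbpedia-neo4j-importer_2.10-1.0.jar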
30 | val conf = new SparkConf() 31 | .setAppName("DBpedia Transform") 32 | .setMaster("local[8]") 33 | .set("total-executor-cores", "8") 34 | .set("driver-memory", "50g") 35 | .set("spark.executor.memory", "50g") 36 | .set("spark.driver.memory", "50g") 37 | 38 | 39 | val sc = new SparkContext(conf) 40 | 41 | 42 | 43 | def main(args: Array[String]) { 44 | 45 | // Import the page nodes and link graph 46 | val pageIndex: collection.Map[String, Long] = importPageNodesAndLinks() 47 | 48 | // Import the category nodes 49 | val lastPointer: Long = importCategoryNodesAndLinks(pageIndex) 50 | 51 | // Import the ontology graph 52 | importOntologyNodesAndLinks(lastPointer, pageIndex) 53 | 54 | } 55 | 56 | def importOntologyNodesAndLinks(lastIndexPointer: Long, pageIndex: collection.Map[String, Long]) { 57 | // Load ontology file 58 | val ontologyFile = sc.textFile(Configuration.INSTANCE_TYPES_FILE_NAME) 59 | 60 | // Process and prepare the ontology nodes 61 | val ontologyMap = processOntology(ontologyFile) 62 | 63 | // Step 1: Get a distinct list of ontology and generate a node index 64 | val ontologyNodeData = ontologyMap.map(ont => ont._1) 65 | .zipWithUniqueId() 66 | .map(a => (a._1, a._2 + lastIndexPointer)) 67 | 68 | // Generate the ontology node rows with property name and id 69 | val ontologyNodeRows = generateOntologyNodes(ontologyNodeData) 70 | 71 | // Save the ontology nodes CSV 72 | ontologyNodeRows.saveAsTextFile(Configuration.HDFS_HOST + "ontologynodes") 73 | 74 | val ontologyIndex = ontologyNodeData.collectAsMap() 75 | val relHeader = sc.parallelize(Seq(Configuration.PAGE_LINKS_CSV_HEADER).toList) 76 | val ontologyRelationshipRows = ontologyMap.map(row => { 77 | row._2.map(a => { 78 | (if(ontologyIndex.contains(row._1)) ontologyIndex(row._1) else "-1") + "\t" + (if(pageIndex.contains(a)) pageIndex(a) else "-1") + "\tHAS_ONTOLOGY" 79 | }).mkString("\n") 80 | }) 81 | 82 | // Unions and header 83 | val relResult = relHeader.union(ontologyRelationshipRows) 84 | relResult.saveAsTextFile(Configuration.HDFS_HOST + "ontologyrels-stage") 85 | 86 | // Reload it and filter out bad data 87 | val ontologyMappedRows = sc.textFile(Configuration.HDFS_HOST + "ontologyrels-stage").filter(line => !line.contains("-1")) 88 | 89 | // Save it to HDFS 90 | ontologyMappedRows.saveAsTextFile(Configuration.HDFS_HOST + "ontologyrels") 91 | 92 | } 93 | 94 | /** 95 | * Import category mappings. 96 | * @param pageIndex The hash map of page names to their corresponding import id. 97 | * @return Returns the last index pointer to continue the import process. 
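 * Category node ids are generated with zipWithUniqueId and offset by the highest page node id taken from pageIndex.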
98 | */ 99 | def importCategoryNodesAndLinks(pageIndex: collection.Map[String, Long]) : Long = { 100 | // We need the last unique id, which will be used to offset the id for category nodes 101 | val lastIndexPointer = pageIndex.toList.sortBy(a => (a._2, a._1)).last._2: Long 102 | 103 | // Load categories file 104 | val categoriesFile = sc.textFile(Configuration.CATEGORIES_FILE_NAME) 105 | 106 | // Process and prepare the categories for creating the nodes file 107 | val categoriesMap = processCategories(categoriesFile) 108 | 109 | // Generate a categories and then join it to the pageIndex 110 | 111 | // Step 1: Get a distinct list of categories and generate a node index 112 | val categoryNodeData = categoriesMap.map(cat => cat._1) 113 | .zipWithUniqueId() 114 | .map(a => (a._1, a._2 + lastIndexPointer)) 115 | 116 | // Generate the category node rows with property name and id 117 | val categoryNodeRows = generateCategoryNodes(categoryNodeData) 118 | 119 | // Save the category nodes CSV 120 | categoryNodeRows.saveAsTextFile(Configuration.HDFS_HOST + "categorynodes") 121 | 122 | val categoryIndex = categoryNodeData.collectAsMap() 123 | val relHeader = sc.parallelize(Seq(Configuration.PAGE_LINKS_CSV_HEADER).toList) 124 | val categoryRelationshipRows = categoriesMap.map(row => { 125 | row._2.map(a => { 126 | (if(categoryIndex.contains(row._1)) categoryIndex(row._1) else "-1") + "\t" + (if(pageIndex.contains(a)) pageIndex(a) else "-1") + "\tHAS_CATEGORY" 127 | }).mkString("\n") 128 | }) 129 | 130 | // Load categories skos broader concept file 131 | val categoriesSkosFile = sc.textFile(Configuration.CATEGORY_SKOS_FILE_NAME) 132 | val categoriesSkosMap = processCategories(categoriesSkosFile) 133 | val categorySkosRelationshipRows = categoriesSkosMap.map(row => { 134 | row._2.map(a => { 135 | (if(categoryIndex.contains(row._1)) categoryIndex(row._1) else "-1") + "\t" + (if(categoryIndex.contains(a)) categoryIndex(a) else "-1") + "\tHAS_CATEGORY" 136 | }).mkString("\n") 137 | }) 138 | 139 | // Unions and header 140 | val categoryRelationshipMappingResult = categorySkosRelationshipRows.union(categoryRelationshipRows) 141 | val relResult = relHeader.union(categoryRelationshipMappingResult) 142 | relResult.saveAsTextFile(Configuration.HDFS_HOST + "categoryrels-stage") 143 | 144 | // Reload it and filter out bad data 145 | val categoryMappedRows = sc.textFile(Configuration.HDFS_HOST + "categoryrels-stage").filter(line => !line.contains("-1")) 146 | 147 | // Save it to HDFS 148 | categoryMappedRows.saveAsTextFile(Configuration.HDFS_HOST + "categoryrels") 149 | 150 | val categoryLastIndexPointer = categoryIndex.toList.sortBy(a => (a._2, a._1)).last._2: Long 151 | 152 | categoryLastIndexPointer 153 | } 154 | 155 | def processCategories(categoriesFile: RDD[String]): RDD[(String, Iterable[String])] = { 156 | val categoriesMap = categoriesFile 157 | .filter(line => line.contains(Configuration.RDF_CATEGORY_URL) || line.contains(Configuration.CATEGORY_SKOS_URL)) 158 | .map(e => { 159 | e.split("^<|>\\s<|\\>\\s\\\"|>\\s\\.$") 160 | .filter(!_.isEmpty) 161 | .filter(a => !a.contains(Configuration.RDF_CATEGORY_URL.replace("<", "").replace(">", "")) 162 | && !a.contains(Configuration.CATEGORY_SKOS_URL.replace("<", "").replace(">", ""))) }) 163 | .map(uri => (uri(1), uri(0))) 164 | .groupByKey() 165 | 166 | categoriesMap 167 | } 168 | 169 | def processOntology(ontologyFile: RDD[String]): RDD[(String, Iterable[String])] = { 170 | val ontologyMap = ontologyFile 171 | .filter(line => 
line.contains(Configuration.RDF_ONTOLOGY_URL)) 172 | .map(e => { 173 | e.split("^<|>\\s<|\\>\\s\\\"|>\\s\\.$") 174 | .filter(!_.isEmpty) 175 | .filter(a => !a.contains(Configuration.RDF_ONTOLOGY_URL.replace("<", "").replace(">", ""))) }) 176 | .map(uri => (uri(1), uri(0))) 177 | .groupByKey() 178 | 179 | ontologyMap 180 | } 181 | 182 | def importPageNodesAndLinks(): scala.collection.Map[String, Long] = { 183 | // Load the text files 184 | val wikiLinksFile = sc.textFile(Configuration.WIKI_LINKS_FILE_NAME) 185 | val wikiNamesFile = sc.textFile(Configuration.WIKI_NAMES_FILE_NAME) 186 | val pageLinksFile = sc.textFile(Configuration.PAGE_LINKS_FILE_NAME) 187 | 188 | // First stage: Join the Wikipedia map file and the names map file into a single RDD 189 | // Process and prepare the Wikipedia links file to join on the DBpedia key 190 | val wikiLinksMap = processWikiLinks(wikiLinksFile) 191 | 192 | // Process and prepare the page names to join on the DBpedia key 193 | val pageNamesMap = processPageNames(wikiNamesFile) 194 | 195 | // Join the Wikipedia map and the names map on the DBpedia key 196 | val pageNodeData = joinNamesToLinks(wikiLinksMap, pageNamesMap) 197 | 198 | // Take the union of the two datasets and generate a CSV as an RDD 199 | val pageNodeRows = generatePageNodes(pageNodeData) 200 | 201 | // Second stage: Encode each value in the page links file with the 202 | // unique node id generated during the last stage 203 | 204 | // Create an in-memory hash table to lookup DBpedia keys and return the 205 | // encoded unique node id 206 | val pageNodeIndex = pageNodeData.map(r => { 207 | r._1 208 | }).zipWithUniqueId().collectAsMap() 209 | 210 | // Process and prepare the page links file to be encoded on the DBpedia key 211 | val pageLinkRelationshipData = processPageLinks(pageLinksFile) 212 | 213 | // Encode each DBpedia key with the Neo4j node id located in the pageNodeIndex table 214 | val pageLinkRelationshipRows = encodePageLinks(pageLinkRelationshipData, pageNodeIndex) 215 | 216 | // Final stage: Save the page nodes and relationship results to HDFS 217 | val pageNodeRels = generatePageLinkRelationships(pageLinkRelationshipRows) 218 | 219 | // Save the page nodes CSV 220 | pageNodeRows.saveAsTextFile(Configuration.HDFS_HOST + "pagenodes") 221 | 222 | // Save the page rels CSV 223 | pageNodeRels.saveAsTextFile(Configuration.HDFS_HOST + "pagerels") 224 | 225 | pageNodeIndex 226 | } 227 | 228 | /** 229 | * Process Wikipedia Links RDF file 230 | * @param wikiLinksFile 231 | * @return Returns an RDD[String] map of filtered lines for import into Neo4j 232 | */ 233 | def processWikiLinks(wikiLinksFile: RDD[String]): RDD[String] = { 234 | val wikiLinksMap = wikiLinksFile.filter(line => 235 | line.contains(Configuration.PRIMARY_TOPIC_URL) && 236 | !line.contains(Configuration.EXCLUDE_FILE_PATTERN)) 237 | .map(e => { 238 | e.split("(?<=>)\\s(?=<)|\\s\\.$") 239 | .filter(a => { 240 | !a.contains(Configuration.PRIMARY_TOPIC_URL) 241 | }) 242 | }) 243 | .map(uri => { 244 | (uri(1), uri(0)) 245 | }) 246 | .map(line => { 247 | line._1 + " " + line._2 248 | }) 249 | 250 | wikiLinksMap 251 | } 252 | 253 | /** 254 | * 255 | * @param wikiNamesFile 256 | * @return 257 | */ 258 | def processPageNames(wikiNamesFile: RDD[String]): RDD[String] = { 259 | val wikiNamesMap = wikiNamesFile.filter(line => line.contains(Configuration.RDF_LABEL_URL)) 260 | .filter(line => !line.contains(Configuration.EXCLUDE_FILE_PATTERN)) 261 | .map(e => { 262 | e.split("(?<=>)\\s(?=<)|(?<=>)\\s(?=\\\")|@en\\s\\.$") 263 | 
.filter(a => { !a.contains(Configuration.RDF_LABEL_URL) }) 264 | }) 265 | .map(uri => { (uri(0), uri(1)) }) 266 | .map(line => { line._1 + " " + Try(URLDecoder.decode(line._2)).getOrElse("") }) 267 | 268 | wikiNamesMap 269 | } 270 | 271 | /** 272 | * 273 | * @param wikiLinksMap 274 | * @param wikiNamesMap 275 | * @return 276 | */ 277 | def joinNamesToLinks(wikiLinksMap: RDD[String], wikiNamesMap: RDD[String]): RDD[(String, Iterable[String])] = { 278 | val joinedList = wikiLinksMap.union(wikiNamesMap).map(line => { 279 | val items = line.split("^<|>\\s<|\\>\\s\\\"|\\\"$|>$").filter(!_.isEmpty) 280 | val mapResult = if (items.length >= 2) (items(0), items(1)) else ("N/A", "N/A") 281 | mapResult 282 | }).filter(items => items._1 != "N/A").map(a => (a._1, a._2)).groupByKey() 283 | 284 | joinedList 285 | } 286 | 287 | /** 288 | * 289 | * @param pageNodeData 290 | * @return 291 | */ 292 | def generatePageNodes(pageNodeData: RDD[(String, Iterable[String])]): RDD[String] = { 293 | val header = sc.parallelize(Seq(Configuration.PAGE_NODES_CSV_HEADER).toList) 294 | val rows = pageNodeData.zipWithUniqueId().map(e => { 295 | e._1._1 + "\t" + e._2 + "\tPage\t" + e._1._2.toList.mkString("\t") 296 | }) 297 | 298 | val result = header.union(rows) 299 | 300 | result 301 | } 302 | 303 | def generateCategoryNodes(categoryNodeData: RDD[(String, Long)]): RDD[String] = { 304 | val namePattern = """(?<=Category\:).*$""".r 305 | val header = sc.parallelize(Seq(Configuration.CATEGORY_NODES_CSV_HEADER).toList) 306 | val rows = categoryNodeData.map(line => line._2 + "\tCategory\t" + line._1 + "\t" + Try(URLDecoder.decode((namePattern findFirstIn line._1).getOrElse("").replace("_", " "))).getOrElse("") ) 307 | val result = header.union(rows) 308 | 309 | result 310 | } 311 | 312 | def generateOntologyNodes(ontologyNodeData: RDD[(String, Long)]): RDD[String] = { 313 | val namePattern = """(?<=[\/\#])[^\/\#]*$""".r 314 | val header = sc.parallelize(Seq(Configuration.ONTOLOGY_NODES_CSV_HEADER).toList) 315 | val rows = ontologyNodeData.map(line => line._2 + "\tOntology\t" + line._1 + "\t" + Try(URLDecoder.decode((namePattern findFirstIn line._1).getOrElse("").replace("_", " "))).getOrElse("") ) 316 | val result = header.union(rows) 317 | 318 | result 319 | } 320 | 321 | /** 322 | * 323 | * @param pageLinks 324 | * @param pageNodeIndex 325 | * @return 326 | */ 327 | def encodePageLinks(pageLinks: RDD[String], pageNodeIndex: scala.collection.Map[String, Long]): RDD[(Long, Long)] = { 328 | val matchPattern = """([^\s]+)""".r 329 | 330 | // Filter out bad links 331 | val encodedPageLinksResult = pageLinks.map(uri => { 332 | val matches = for (m <- matchPattern findAllMatchIn uri) yield m group 1 333 | val uris:List[String] = matches.toList.take(2) 334 | (pageNodeIndex.getOrElse(uris(0), -1) :Long, pageNodeIndex.getOrElse(uris(1), -1) :Long) 335 | }).filter(uri => { 336 | !(uri._1 == -1 || uri._2 == -1) 337 | }) 338 | 339 | encodedPageLinksResult 340 | } 341 | 342 | /** 343 | * 344 | * @param pageLinksFile 345 | * @return 346 | */ 347 | def processPageLinks(pageLinksFile: RDD[String]): RDD[String] = { 348 | val pageLinks = pageLinksFile.filter(line => 349 | line.contains(Configuration.WIKI_PAGE_LINK_URL) && 350 | !line.contains(Configuration.EXCLUDE_FILE_PATTERN) && 351 | !line.contains(Configuration.EXCLUDE_CATEGORY_PATTERN)) 352 | .map(e => { 353 | e.split("^<|>\\s<|\\>\\s\\\"|>\\s\\.$") 354 | .filter(!_.isEmpty) 355 | .filter(a => { !a.contains(Configuration.WIKI_PAGE_LINK_URL) }) 356 | }) 357 | .map(uri => { 358 | (uri(0), 
uri(1)) 359 | }) 360 | .map(line => { 361 | line._1 + " " + line._2 362 | }) 363 | 364 | pageLinks 365 | } 366 | 367 | /** 368 | * 369 | * @param pageLinkResults 370 | * @return 371 | */ 372 | def generatePageLinkRelationships(pageLinkResults: RDD[(Long, Long)]): RDD[String] = { 373 | val relHeader = sc.parallelize(Seq(Configuration.PAGE_LINKS_CSV_HEADER).toList) 374 | val relRows = pageLinkResults.map(line => { line._1 + "\t" + line._2 + "\tHAS_LINK" }) 375 | val relResult = relHeader.union(relRows) 376 | 377 | relResult 378 | } 379 | } 380 | 381 | 382 | --------------------------------------------------------------------------------