├── src
│   └── main
│       └── java
│           └── wip
│               └── wikidata_neo4j_importer
│                   ├── Config.java
│                   ├── Util.java
│                   ├── ItemImporter.java
│                   ├── Runner.java
│                   └── EdgeImporter.java
├── pom.xml
└── README.md
/src/main/java/wip/wikidata_neo4j_importer/Config.java:
--------------------------------------------------------------------------------
1 | package wip.wikidata_neo4j_importer;
2 |
/**
 * Tunable settings shared by the node and edge import passes.
 */
public class Config {

    /** When importing nodes: neo4j is restarted after every {@code restartNodeNum} items. */
    public static int restartNodeNum = 2 * 1000 * 1000;

    /** When importing nodes: progress is printed after every {@code printNodeNum} items. */
    public static int printNodeNum = 5 * 1000;

    /** When importing edges: neo4j is restarted after every {@code restartEdgeNum} items. */
    public static int restartEdgeNum = 2 * 1000 * 1000;

    /** When importing edges: progress is printed after every {@code printEdgeNum} items. */
    public static int printEdgeNum = 5 * 1000;

    /*
     * Every node is either a wikidata item or a wikidata property, and each node has 2 ids:
     *   1. the id used inside neo4j
     *   2. the original wikidata id
     * An item Qxx gets the neo4j id 1xx (e.g. Q1 becomes 11).
     * A property Pxx gets the neo4j id 2xx (e.g. P31 becomes 231).
     * This conversion makes the import procedure easier.
     */

    /** Leading digit placed in front of an item's numeric id to form its neo4j id. */
    public static int itemPrefix = 1;

    /** Leading digit placed in front of a property's numeric id to form its neo4j id. */
    public static int propPrefix = 2;
}
21 |
--------------------------------------------------------------------------------
/pom.xml:
--------------------------------------------------------------------------------
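1 | <?xml version="1.0" encoding="UTF-8"?>
2 | <project xmlns="http://maven.apache.org/POM/4.0.0"
3 |          xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
4 |          xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
5 |     <modelVersion>4.0.0</modelVersion>
6 |
7 |     <groupId>wip</groupId>
8 |     <artifactId>wikidata_neo4j_importer</artifactId>
9 |     <version>1.0-SNAPSHOT</version>
10 |
11 |     <dependencies>
12 |         <dependency>
13 |             <groupId>org.neo4j</groupId>
14 |             <artifactId>neo4j</artifactId>
15 |             <version>2.3.1</version>
16 |         </dependency>
17 |         <dependency>
18 |             <groupId>org.neo4j</groupId>
19 |             <artifactId>neo4j-kernel</artifactId>
20 |             <version>2.3.1</version>
21 |         </dependency>
22 |
23 |         <dependency>
24 |             <groupId>org.apache.commons</groupId>
25 |             <artifactId>commons-compress</artifactId>
26 |             <version>1.10</version>
27 |         </dependency>
28 |
29 |         <dependency>
30 |             <groupId>org.json</groupId>
31 |             <artifactId>json</artifactId>
32 |             <version>20151123</version>
33 |         </dependency>
34 |     </dependencies>
35 |
36 | </project>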
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Introduction
2 | This project imports a wikidata json.bz2 dump into neo4j.
3 |
4 | Note that this project only imports English data, and it ignores references, relations between properties, and properties whose range is not a wikidata item. For example, we ignore properties like "image(P18)", because its range is a string.
5 |
6 | However, it is easy to extend this project and add this information if you want.
7 |
8 | # Implementation Overview
9 | commons-compress (from org.apache.commons) to read bz2 file
10 |
11 | BatchInserter (from the neo4j Java interface) to import data
12 |
13 | json (from json) to process json strings
14 |
15 | Our implementation is a two-step procedure:
16 |
17 | 1. Import all nodes (items and properties)
18 |
19 | 2. Import all edges (properties) between them
20 |
21 | # Statistics
22 | The 20160118 dump (wikidata-20160118-all.json.bz2), which is around 4G, takes 4.3G after being imported into neo4j. (Note: not all data is imported; see the Introduction section.)
23 |
24 | It processes around 2500 items per second and takes about 3 hours to import the whole dump (5113 seconds to import nodes, 6018 seconds to import edges).
25 |
26 | It seems that parsing json strings is very time consuming compared to importing data. So, one possible improvement is probably to create nodes and add edges in one pass.
27 |
28 | # Usage
29 | 1. Import this project.
30 |
31 | 2. Use maven to download dependencies.
32 |
33 | 3. Run the Runner class, parameters are:
34 |
35 | param 1: path of wikidata dump
36 |
37 | param 2: path of neo4j database directory
38 |
39 | param 3: enter 'node' or 'edge', indicating whether to import nodes or edges
40 |
41 |
42 |
--------------------------------------------------------------------------------
/src/main/java/wip/wikidata_neo4j_importer/Util.java:
--------------------------------------------------------------------------------
1 | package wip.wikidata_neo4j_importer;
2 |
3 | import org.apache.commons.compress.compressors.CompressorException;
4 | import org.apache.commons.compress.compressors.CompressorInputStream;
5 | import org.apache.commons.compress.compressors.CompressorStreamFactory;
6 |
7 | import java.io.*;
8 |
9 | public class Util {
10 |
11 | /**
12 | * Get Buffered Reader for Compressed File (e.g. bz2 file)
13 | *
14 | * @param fileIn input file path string
15 | * @return
16 | * @throws FileNotFoundException
17 | * @throws CompressorException
18 | */
19 | public static BufferedReader getBufferedReaderForCompressedFile(String fileIn) throws FileNotFoundException, CompressorException {
20 | FileInputStream fin = new FileInputStream(fileIn);
21 | BufferedInputStream bis = new BufferedInputStream(fin);
22 | CompressorInputStream input = new CompressorStreamFactory().createCompressorInputStream(bis);
23 | BufferedReader br2 = new BufferedReader(new InputStreamReader(input));
24 | return br2;
25 | }
26 |
27 | /**
28 | * Find the base number of a number.
29 | * e.g. the base number of 200 with base 10 is 3
30 | *
31 | * @param num input number
32 | * @param base base
33 | * @return
34 | */
35 | public static long findBaseNumber(long num, long base) {
36 | if (num == 0) return 1;
37 |
38 | long baseNum = 0;
39 | while (num != 0) {
40 | baseNum += 1;
41 | num = num / base;
42 | }
43 | return baseNum;
44 | }
45 |
46 | /**
47 | * Add prefix to a long number.
48 | * e.g. if we want to add prefix 2 to an input number 31 with base 10, then we get 231
49 | *
50 | * @param num input number
51 | * @param prefix prefix to add
52 | * @param base base
53 | * @return
54 | */
55 | public static long addPrefixToLong(long num, long prefix, long base) {
56 | long baseNum = findBaseNumber(num, base);
57 | while (baseNum > 0) {
58 | baseNum -= 1;
59 | prefix *= base;
60 | }
61 | return prefix + num;
62 | }
63 | }
64 |
--------------------------------------------------------------------------------
/src/main/java/wip/wikidata_neo4j_importer/ItemImporter.java:
--------------------------------------------------------------------------------
1 | package wip.wikidata_neo4j_importer;
2 |
3 | import org.json.JSONArray;
4 | import org.json.JSONObject;
5 | import org.neo4j.graphdb.DynamicLabel;
6 | import org.neo4j.graphdb.Label;
7 | import org.neo4j.unsafe.batchinsert.BatchInserter;
8 | import org.neo4j.unsafe.batchinsert.BatchInserters;
9 |
10 | import java.io.File;
11 | import java.io.IOException;
12 | import java.io.PrintWriter;
13 | import java.util.HashMap;
14 | import java.util.Map;
15 |
16 | public class ItemImporter {
17 |
18 | Label labelItem; // group nodes of items
19 | Label labelProp; // group nodes of properties
20 |
21 | // Write all properties to a file.
22 | // We don't use them during importing actually.
23 | // But it can be convenient and efficient to use this dump,
24 | // rather than original json file if you want to add additional information to edges.
25 | PrintWriter propertyWriter;
26 |
27 | BatchInserter inserter;
28 |
29 | public ItemImporter(String pathNeo4jDatabase, String propDumpPath) throws IOException {
30 | labelItem = DynamicLabel.label("Item");
31 | labelProp = DynamicLabel.label("Property");
32 | propertyWriter = new PrintWriter(propDumpPath);
33 |
34 | initializeInserter(pathNeo4jDatabase);
35 | }
36 |
37 | public void initializeInserter(String pathNeo4jDatabase) throws IOException {
38 | inserter = BatchInserters.inserter(new File(pathNeo4jDatabase));
39 | // inserter.createDeferredSchemaIndex(labelItem).on("wikidataId").create();
40 | }
41 |
42 | public void importItem(String itemDocStr, Boolean isItem) {
43 | // Extract key information from json string
44 | JSONObject obj = new JSONObject(itemDocStr);
45 | String wikidataId = obj.getString("id");
46 | String datatype = getDatatype(obj); // only exists in property
47 | String label = getEnLabel(obj);
48 | String description = getEnDescription(obj);
49 | String aliasStr = getEnAliases(obj);
50 |
51 | // Construct property map of current node
52 | Map properties = new HashMap();
53 | properties.put("wikidataId", wikidataId);
54 | if (!datatype.equals("")) properties.put("datatype", datatype);
55 | if (!label.equals("")) properties.put("label", label);
56 | if (!description.equals("")) properties.put("description", description);
57 | if (!aliasStr.equals("")) properties.put("aliases", aliasStr);
58 |
59 | // Generate id of current node and insert it
60 | long nodeId = Long.parseLong(wikidataId.substring(1));
61 | if (isItem) {
62 | nodeId = Util.addPrefixToLong(nodeId, Config.itemPrefix, 10);
63 | if (!inserter.nodeExists(nodeId))
64 | inserter.createNode(nodeId, properties, labelItem);
65 | } else {
66 | nodeId = Util.addPrefixToLong(nodeId, Config.propPrefix, 10);
67 | if (!inserter.nodeExists(nodeId))
68 | inserter.createNode(nodeId, properties, labelProp);
69 | }
70 |
71 | // If current document is a property, dump it
72 | if (!isItem) {
73 | JSONObject resObj = new JSONObject();
74 | resObj.put("wikidataId", wikidataId);
75 | resObj.put("label", label);
76 | resObj.put("datatype", datatype);
77 | resObj.put("description", description);
78 | resObj.put("aliases", aliasStr);
79 |
80 | propertyWriter.write(resObj.toString() + "\n");
81 | }
82 | }
83 |
84 | public void shutDownNeo4j(){
85 | inserter.shutdown();
86 | }
87 |
88 | public void close() {
89 | inserter.shutdown();
90 | propertyWriter.close();
91 | }
92 |
93 | private String getDatatype(JSONObject obj) {
94 | if (!obj.has("datatype")) return "";
95 | return obj.getString("datatype");
96 | }
97 |
98 | private String getEnLabel(JSONObject obj) {
99 | if (!obj.has("labels")) return "";
100 | if (!obj.getJSONObject("labels").has("en")) return "";
101 | return obj.getJSONObject("labels").getJSONObject("en").getString("value");
102 | }
103 |
104 | private String getEnDescription(JSONObject obj) {
105 | if (!obj.has("descriptions")) return "";
106 | if (!obj.getJSONObject("descriptions").has("en")) return "";
107 | return obj.getJSONObject("descriptions").getJSONObject("en").getString("value");
108 | }
109 |
110 | private String getEnAliases(JSONObject obj) {
111 | if (!obj.has("aliases")) return "";
112 | if (!obj.getJSONObject("aliases").has("en")) return "";
113 |
114 | JSONArray aliases = obj.getJSONObject("aliases").getJSONArray("en");
115 | String aliasStr = "";
116 | for (Object aliasObj : aliases) {
117 | String tempAlias = ((JSONObject) aliasObj).getString("value");
118 | aliasStr += tempAlias + "\n";
119 | }
120 | return aliasStr.substring(0, aliases.length()-1);
121 | }
122 |
123 | }
124 |
--------------------------------------------------------------------------------
/src/main/java/wip/wikidata_neo4j_importer/Runner.java:
--------------------------------------------------------------------------------
1 | package wip.wikidata_neo4j_importer;
2 |
3 | import java.io.BufferedReader;
4 | import java.io.IOException;
5 |
6 |
7 | public class Runner {
8 | public static void main(String[] args) throws IOException {
9 |
10 | if (args.length < 3){
11 | System.out.println("Usage: java Runner wikidata_dump_path neo4j_db_dir [node/edge]\n" +
12 | "param 1: path of wikidata dump\n" +
13 | "param 2: path of neo4j database directory\n" +
14 | "param 3: enter 'node' or 'edge', indicating whether import node or edge");
15 | System.exit(1);
16 | }
17 |
18 | String dataPath = args[0];
19 | String pathNeo4jDatabase = args[1];
20 | String mode = args[2]; // whether we are inserting nodes or edges
21 | String propertyDumPath = "./propertyDump.txt"; // dump all property information in this file
22 | String edgeLogPath = "./edgeLog.txt"; // log uncreated nodes when adding edges
23 |
24 | int itemCounter = 0; // number of items processed
25 | int propCounter = 0; // number of properties processed
26 | long startMilli = System.currentTimeMillis();
27 |
28 | // Note: we don't add any index while importing
29 | // If you need one, you can manually create them with cypher commands like:
30 | // "create index on :Item(wikidataId)"
31 | if (mode.equals("node")) {
32 | ItemImporter itemImporter = new ItemImporter(pathNeo4jDatabase, propertyDumPath);
33 | try{
34 | // Reader of compressed wikidata dump
35 | BufferedReader inputReader = Util.getBufferedReaderForCompressedFile(dataPath);
36 |
37 | while (true) {
38 | // for (int i = 0; i < 100; i++) { // this line is for test
39 | if (itemCounter!=0 && itemCounter % Config.printNodeNum == 0) {
40 | long tempMilli = System.currentTimeMillis();
41 | System.out.printf("Processed %d nodes and %d properties. Used %d secondes.\n",
42 | itemCounter, propCounter, (tempMilli-startMilli)/1000);
43 | }
44 | if (itemCounter!=0 && itemCounter % Config.restartNodeNum == 0) {
45 | System.out.println("Restarting Neo4j...");
46 |
47 | itemImporter.shutDownNeo4j();
48 | itemImporter.initializeInserter(pathNeo4jDatabase);
49 | System.out.println("Neo4j is back!\n");
50 | }
51 |
52 | String tempDocStr = inputReader.readLine();
53 | if (tempDocStr==null || tempDocStr.trim().equals("")) break;
54 | if (tempDocStr.startsWith("{\"type\":\"property\"")){
55 | propCounter += 1;
56 | itemImporter.importItem(tempDocStr, false);
57 | } else if (tempDocStr.startsWith("{\"type\":\"item\"")){
58 | itemCounter += 1;
59 | itemImporter.importItem(tempDocStr, true);
60 | }
61 | }
62 | } catch (Exception e) {
63 | e.printStackTrace();
64 | } finally {
65 | long tempMilli = System.currentTimeMillis();
66 | System.out.printf("Processed %d nodes and %d properties. Used %d secondes.\n",
67 | itemCounter, propCounter, (tempMilli - startMilli) / 1000);
68 | System.out.println("Shutting Down Neo4j...");
69 |
70 | itemImporter.close();
71 | }
72 | } else if (mode.equals("edge")) {
73 | EdgeImporter edgeImporter = new EdgeImporter(pathNeo4jDatabase, propertyDumPath, edgeLogPath);
74 | try{
75 | // Reader of compressed wikidata dump
76 | BufferedReader inputReader = Util.getBufferedReaderForCompressedFile(dataPath);
77 |
78 | while (true) {
79 | // for (int i = 0; i < 100; i++) { // this line is for test
80 | if (itemCounter!=0 && itemCounter % Config.printEdgeNum == 0) {
81 | long tempMilli = System.currentTimeMillis();
82 | System.out.printf("Processed %d items and %d properties. Created %d nodes. Used %d secondes.\n",
83 | itemCounter, propCounter, edgeImporter.nodeCreatedCnt, (tempMilli-startMilli)/1000);
84 | }
85 | if (itemCounter!=0 && itemCounter % Config.restartEdgeNum == 0) {
86 | System.out.println("Restarting Neo4j...");
87 |
88 | edgeImporter.shutDownNeo4j();
89 | edgeImporter.initializeInserter(pathNeo4jDatabase);
90 | System.out.println("Neo4j is back!\n");
91 | }
92 |
93 | String tempDocStr = inputReader.readLine();
94 | if (tempDocStr==null || tempDocStr.trim().equals("")) break;
95 | // ignore property when importing edges
96 | if (tempDocStr.startsWith("{\"type\":\"property\"")){
97 | propCounter += 1;
98 | } else if (tempDocStr.startsWith("{\"type\":\"item\"")){
99 | itemCounter += 1;
100 | edgeImporter.importEdge(tempDocStr);
101 | }
102 | }
103 | } catch (Exception e) {
104 | e.printStackTrace();
105 | } finally {
106 | long tempMilli = System.currentTimeMillis();
107 | System.out.printf("Processed %d items and %d properties. Created %d nodes. Used %d secondes.\n",
108 | itemCounter, propCounter, edgeImporter.nodeCreatedCnt, (tempMilli-startMilli)/1000);
109 | System.out.println("Shutting Down Neo4j...");
110 |
111 | edgeImporter.close();
112 | }
113 | }
114 |
115 | }
116 | }
117 |
--------------------------------------------------------------------------------
/src/main/java/wip/wikidata_neo4j_importer/EdgeImporter.java:
--------------------------------------------------------------------------------
1 | package wip.wikidata_neo4j_importer;
2 |
3 | import org.json.JSONArray;
4 | import org.json.JSONObject;
5 | import org.neo4j.graphdb.DynamicLabel;
6 | import org.neo4j.graphdb.DynamicRelationshipType;
7 | import org.neo4j.graphdb.Label;
8 | import org.neo4j.graphdb.RelationshipType;
9 | import org.neo4j.unsafe.batchinsert.BatchInserter;
10 | import org.neo4j.unsafe.batchinsert.BatchInserters;
11 |
12 | import java.io.File;
13 | import java.io.IOException;
14 | import java.io.PrintWriter;
15 | import java.nio.file.Paths;
16 | import java.util.HashMap;
17 | import java.util.Map;
18 | import java.util.Scanner;
19 |
20 | public class EdgeImporter {
21 |
22 | Map propDic = new HashMap();
23 | PrintWriter logWriter; // log all created new nodes when importing edges
24 | BatchInserter inserter;
25 |
26 | Label labelItem; // group nodes of items
27 | Label labelProp; // group nodes of properties
28 |
29 | public int nodeCreatedCnt = 0; // number of nodes created when importing edges
30 |
31 | public EdgeImporter(String pathNeo4jDatabase, String propDumpPath, String logPath) throws IOException {
32 | labelItem = DynamicLabel.label("Item");
33 | labelProp = DynamicLabel.label("Property");
34 | logWriter = new PrintWriter(logPath);
35 |
36 | // readPropDic(propDumpPath); // you can uncomment this line if you want to use the property dump.
37 |
38 | initializeInserter(pathNeo4jDatabase);
39 | }
40 |
41 | public void initializeInserter(String pathNeo4jDatabase) throws IOException {
42 | inserter = BatchInserters.inserter(new File(pathNeo4jDatabase));
43 | }
44 |
45 | public void importEdge(String itemDocStr) {
46 | JSONObject obj = new JSONObject(itemDocStr);
47 | String subjWikidataId = obj.getString("id");
48 | long subjNodeId = Util.addPrefixToLong(Long.parseLong(subjWikidataId.substring(1)), Config.itemPrefix, 10);
49 |
50 | if (!obj.has("claims")) return;
51 | JSONObject claimObj = obj.getJSONObject("claims");
52 | for (String propId : claimObj.keySet()) {
53 | JSONArray valueArray = claimObj.getJSONArray(propId);
54 | String snakType = valueArray.getJSONObject(0).getJSONObject("mainsnak").getString("snaktype");
55 | if (!snakType.equals("value")) continue; // ignore some value and no value
56 | String valueType = valueArray.getJSONObject(0).getJSONObject("mainsnak")
57 | .getJSONObject("datavalue").getString("type");
58 |
59 | // Only import edges between entities
60 | // Ignore other property values
61 | if (valueType.equals("wikibase-entityid")) {
62 | RelationshipType tempPropType = DynamicRelationshipType.withName(propId);
63 |
64 | for (int i = 0; i < valueArray.length(); i++) {
65 | JSONObject mainSnakObj = valueArray.getJSONObject(i).getJSONObject("mainsnak");
66 | if (!mainSnakObj.getString("snaktype").equals("value")) continue; // ignore novalue and somevalue
67 | if (!mainSnakObj.getJSONObject("datavalue").getString("type").equals("wikibase-entityid"))
68 | continue;
69 |
70 | String nodeType = mainSnakObj.getJSONObject("datavalue").
71 | getJSONObject("value").getString("entity-type");
72 | long objNodeId = mainSnakObj.getJSONObject("datavalue")
73 | .getJSONObject("value").getLong("numeric-id");
74 | String objWikidataId = objNodeId + "";
75 | if (nodeType.equals("item")){
76 | objNodeId = Util.addPrefixToLong(objNodeId, Config.itemPrefix, 10);
77 | objWikidataId = "Q" + objWikidataId;
78 | } else {
79 | objNodeId = Util.addPrefixToLong(objNodeId, Config.propPrefix, 10);
80 | objWikidataId = "P" + objWikidataId;
81 | }
82 |
83 |
84 | // Create subject node if not exist (normally, this shouldn't be triggered)
85 | if (!inserter.nodeExists(subjNodeId)) {
86 | Map nodeProperties = new HashMap();
87 | nodeProperties.put("wikidataId", subjWikidataId);
88 | inserter.createNode(subjNodeId, nodeProperties, labelItem);
89 |
90 | nodeCreatedCnt += 1;
91 | logWriter.write("Inserted: " + subjWikidataId+ ", from " + subjWikidataId + "\n");
92 | }
93 | // Create object node if not exist (normally, this shouldn't be triggered)
94 | if (!inserter.nodeExists(objNodeId)) {
95 | Map nodeProperties = new HashMap();
96 | nodeProperties.put("wikidataId", objWikidataId);
97 | inserter.createNode(objNodeId, nodeProperties, labelItem);
98 |
99 | nodeCreatedCnt += 1;
100 | logWriter.write("Inserted: " + objWikidataId + ", from " + subjWikidataId + "\n");
101 | }
102 |
103 | inserter.createRelationship(subjNodeId, objNodeId, tempPropType, null);
104 | }
105 | }
106 | }
107 | }
108 |
109 | public void shutDownNeo4j(){
110 | inserter.shutdown();
111 | }
112 |
113 | public void close() {
114 | inserter.shutdown();
115 | logWriter.close();
116 | }
117 |
118 | /**
119 | * Read the property dump produced when importing nodes.
120 | * @param propDumpPath dump file path
121 | * @throws IOException
122 | */
123 | private void readPropDic(String propDumpPath) throws IOException {
124 | Scanner scanner = new Scanner(Paths.get(propDumpPath));
125 | while (scanner.hasNext()) {
126 | String propDocStr = scanner.nextLine();
127 | if (propDocStr.equals("")) break;
128 |
129 | JSONObject obj = new JSONObject(propDocStr);
130 | String wikidataId = obj.getString("wikidataId");
131 | String label = obj.getString("label");
132 | propDic.put(wikidataId, label);
133 | }
134 | }
135 | }
136 |
--------------------------------------------------------------------------------