├── .travis.yml ├── src ├── main │ ├── java │ │ └── org │ │ │ └── wikidata │ │ │ └── history │ │ │ ├── sparql │ │ │ ├── NotSupportedValueException.java │ │ │ ├── TripleArrayUtils.java │ │ │ ├── ExprValueConverter.java │ │ │ ├── LongRangeUtils.java │ │ │ ├── SimpleQueryPreparer.java │ │ │ ├── RocksRevisionLoader.java │ │ │ ├── HistoryEvaluationStatistics.java │ │ │ ├── PropertyPathOptimizer.java │ │ │ ├── Vocabulary.java │ │ │ ├── RocksTripleLoader.java │ │ │ └── HistoryRepository.java │ │ │ ├── web │ │ │ ├── NotAcceptableResponse.java │ │ │ ├── QueryLogger.java │ │ │ ├── Main.java │ │ │ ├── ContentNegotiation.java │ │ │ └── SparqlEndpoint.java │ │ │ ├── preprocessor │ │ │ ├── HistoryOutput.java │ │ │ ├── FileHistoryOutput.java │ │ │ ├── WikidataPropertyInformation.java │ │ │ ├── WikibaseValueHasher.java │ │ │ └── RevisionFileConverter.java │ │ │ └── Main.java │ └── resources │ │ ├── prefixes.json │ │ └── index.html └── test │ ├── resources │ ├── entities │ │ ├── P3.json │ │ ├── P2.json │ │ ├── Q3.json │ │ ├── Q6.json │ │ ├── Q4.json │ │ ├── Q8.json │ │ └── Q7.json │ ├── rdf │ │ ├── Q3.nt │ │ ├── P2.nt │ │ ├── P3.nt │ │ ├── Q6.nt │ │ ├── Q7.nt │ │ └── Q4.nt │ └── dump_file_sample.xml │ └── java │ └── org │ └── wikidata │ └── history │ ├── web │ └── ContentNegotiationTest.java │ ├── preprocessor │ ├── RevisionFileConverterTest.java │ └── RdfBuilderTest.java │ └── sparql │ ├── NumericValueFactoryTest.java │ └── RocksTripleSourceTest.java ├── download_wd_history.sh ├── README.md └── pom.xml /.travis.yml: -------------------------------------------------------------------------------- 1 | sudo: false 2 | language: java 3 | jdk: 4 | - openjdk8 5 | 6 | cache: 7 | directories: 8 | - $HOME/.m2 -------------------------------------------------------------------------------- /src/main/java/org/wikidata/history/sparql/NotSupportedValueException.java: -------------------------------------------------------------------------------- 1 | package org.wikidata.history.sparql; 2 | 3 | class NotSupportedValueException extends Exception { 4 | NotSupportedValueException(String msg) { 5 | super(msg); 6 | } 7 | } 8 | -------------------------------------------------------------------------------- /download_wd_history.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | curl https://dumps.wikimedia.org/wikidatawiki/latest/ | grep -Po "wikidatawiki-latest-pages-meta-history[0-9]+\.xml-[p0-9]+\.bz2" | while read -r url ; do 4 | echo $url 5 | wget -c "https://dumps.wikimedia.org/wikidatawiki/latest/$url" 6 | done 7 | -------------------------------------------------------------------------------- /src/test/resources/entities/P3.json: -------------------------------------------------------------------------------- 1 | { 2 | "type": "property", 3 | "datatype": "time", 4 | "id": "P3", 5 | "labels": { 6 | "en": { 7 | "language": "en", 8 | "value": "Time" 9 | } 10 | }, 11 | "descriptions": {}, 12 | "aliases": {}, 13 | "claims": {} 14 | } 15 | -------------------------------------------------------------------------------- /src/test/resources/entities/P2.json: -------------------------------------------------------------------------------- 1 | { 2 | "type": "property", 3 | "datatype": "string", 4 | "id": "P2", 5 | "labels": { 6 | "en": { 7 | "language": "en", 8 | "value": "Instance of" 9 | } 10 | }, 11 | "descriptions": {}, 12 | "aliases": {}, 13 | "claims": {} 14 | } 15 | -------------------------------------------------------------------------------- 
/src/test/resources/entities/Q3.json: -------------------------------------------------------------------------------- 1 | { 2 | "id": "Q3", 3 | "type": "item", 4 | "sitelinks": { 5 | "enwiki": { 6 | "site": "enwiki", 7 | "title": "San Francisco", 8 | "badges": [ 9 | ] 10 | }, 11 | "ruwiki": { 12 | "site": "ruwiki", 13 | "title": "\u0421\u0430\u043d \u0424\u0440\u0430\u043d\u0446\u0438\u0441\u043a\u043e", 14 | "badges": [ 15 | ] 16 | } 17 | } 18 | } 19 | -------------------------------------------------------------------------------- /src/main/java/org/wikidata/history/web/NotAcceptableResponse.java: -------------------------------------------------------------------------------- 1 | package org.wikidata.history.web; 2 | 3 | import io.javalin.http.HttpResponseException; 4 | import org.jetbrains.annotations.NotNull; 5 | 6 | import java.util.Collections; 7 | import java.util.Map; 8 | 9 | class NotAcceptableResponse extends HttpResponseException { 10 | 11 | NotAcceptableResponse(@NotNull String msg, @NotNull Map details) { 12 | super(406, msg, details); 13 | } 14 | 15 | NotAcceptableResponse(@NotNull String msg) { 16 | this(msg, Collections.emptyMap()); 17 | } 18 | } 19 | -------------------------------------------------------------------------------- /src/main/java/org/wikidata/history/preprocessor/HistoryOutput.java: -------------------------------------------------------------------------------- 1 | package org.wikidata.history.preprocessor; 2 | 3 | import org.eclipse.rdf4j.model.IRI; 4 | import org.eclipse.rdf4j.model.Resource; 5 | import org.eclipse.rdf4j.model.Value; 6 | 7 | import java.io.IOException; 8 | import java.time.Instant; 9 | 10 | interface HistoryOutput extends AutoCloseable { 11 | void addRevision( 12 | long revisionId, long parentRevisionId, String entityId, Instant timestamp, String contributorName, String comment 13 | ) throws IOException; 14 | 15 | void addTriple(Resource subject, IRI predicate, Value object, long... 
revisionIds) throws IOException; 16 | } 17 | -------------------------------------------------------------------------------- /src/test/java/org/wikidata/history/web/ContentNegotiationTest.java: -------------------------------------------------------------------------------- 1 | package org.wikidata.history.web; 2 | 3 | 4 | import org.junit.jupiter.api.Assertions; 5 | import org.junit.jupiter.api.Test; 6 | 7 | import java.util.Arrays; 8 | import java.util.List; 9 | import java.util.Optional; 10 | 11 | class ContentNegotiationTest { 12 | 13 | @Test 14 | void testNegotiateAccept() { 15 | testAcceptNegotiation(null, Optional.of("application/ld+json")); 16 | testAcceptNegotiation("application/ld+json", Optional.of("application/ld+json")); 17 | testAcceptNegotiation("application/*", Optional.of("application/ld+json")); 18 | testAcceptNegotiation("*/*", Optional.of("application/ld+json")); 19 | testAcceptNegotiation("application/json", Optional.of("application/json")); 20 | testAcceptNegotiation("application/ld+json; charset=UTF-8", Optional.of("application/ld+json")); 21 | testAcceptNegotiation("application/*; charset=UTF-8", Optional.of("application/ld+json")); 22 | testAcceptNegotiation("*/*; charset=UTF-8", Optional.of("application/ld+json")); 23 | testAcceptNegotiation("application/json; charset=UTF-8", Optional.of("application/json")); 24 | testAcceptNegotiation("application/xml", Optional.empty()); 25 | 26 | } 27 | 28 | private void testAcceptNegotiation(String header, Optional expected) { 29 | List possibles = Arrays.asList("application/ld+json", "application/json"); 30 | Assertions.assertEquals(expected, ContentNegotiation.negotiateAccept(header, possibles)); 31 | } 32 | } 33 | 34 | -------------------------------------------------------------------------------- /src/test/resources/rdf/Q3.nt: -------------------------------------------------------------------------------- 1 | "wikipedia" . 2 | . 3 | "en" . 4 | . 5 | "San Francisco"@en . 6 | . 7 | . 8 | "ru" . 9 | . 10 | "\u0421\u0430\u043D \u0424\u0440\u0430\u043D\u0446\u0438\u0441\u043A\u043E"@ru . 11 | . 12 | "wikipedia" . 13 | . 
14 | -------------------------------------------------------------------------------- /src/main/resources/prefixes.json: -------------------------------------------------------------------------------- 1 | { 2 | "cc": "http://creativecommons.org/ns#", 3 | "dct": "http://purl.org/dc/terms/", 4 | "geo": "http://www.opengis.net/ont/geosparql#", 5 | "hist": "http://wikiba.se/history/ontology#", 6 | "ontolex": "http://www.w3.org/ns/lemon/ontolex#", 7 | "owl": "http://www.w3.org/2002/07/owl#", 8 | "p": "http://www.wikidata.org/prop/", 9 | "pq": "http://www.wikidata.org/prop/qualifier/", 10 | "pqn": "http://www.wikidata.org/prop/qualifier/value-normalized/", 11 | "pqv": "http://www.wikidata.org/prop/qualifier/value/", 12 | "pr": "http://www.wikidata.org/prop/reference/", 13 | "prn": "http://www.wikidata.org/prop/reference/value-normalized/", 14 | "prov": "http://www.w3.org/ns/prov#", 15 | "prv": "http://www.wikidata.org/prop/reference/value/", 16 | "ps": "http://www.wikidata.org/prop/statement/", 17 | "psn": "http://www.wikidata.org/prop/statement/value-normalized/", 18 | "psv": "http://www.wikidata.org/prop/statement/value/", 19 | "rdf": "http://www.w3.org/1999/02/22-rdf-syntax-ns#", 20 | "rdfs": "http://www.w3.org/2000/01/rdf-schema#", 21 | "schema": "http://schema.org/", 22 | "skos": "http://www.w3.org/2004/02/skos/core#", 23 | "wdata": "http://www.wikidata.org/wiki/Special:EntityData/", 24 | "wd": "http://www.wikidata.org/entity/", 25 | "wdno": "http://www.wikidata.org/prop/novalue/", 26 | "wdref": "http://www.wikidata.org/reference/", 27 | "wds": "http://www.wikidata.org/entity/statement/", 28 | "wdt": "http://www.wikidata.org/prop/direct/", 29 | "wdtn": "http://www.wikidata.org/prop/direct-normalized/", 30 | "wdv": "http://www.wikidata.org/value/", 31 | "wikibase": "http://wikiba.se/ontology#", 32 | "xsd": "http://www.w3.org/2001/XMLSchema#" 33 | } -------------------------------------------------------------------------------- /src/main/java/org/wikidata/history/sparql/TripleArrayUtils.java: -------------------------------------------------------------------------------- 1 | package org.wikidata.history.sparql; 2 | 3 | 4 | final class TripleArrayUtils { 5 | 6 | static long[] addToSortedArray(long[] array, long[] triple) { 7 | int position = 0; 8 | while (position < array.length && (array[position] < triple[0] || (array[position] == triple[0] && (array[position + 1] < triple[1] || (array[position + 1] == triple[1] && array[position + 2] < triple[2]))))) { 9 | position += 3; 10 | } 11 | 12 | //Case where it is already in 13 | if (position < array.length && array[position] == triple[0] && array[position + 1] == triple[1] && array[position + 2] == triple[2]) { 14 | return array; 15 | } 16 | 17 | //We build the new array 18 | long[] newArray = new long[array.length + 3]; 19 | System.arraycopy(array, 0, newArray, 0, position); 20 | System.arraycopy(triple, 0, newArray, position, 3); 21 | System.arraycopy(array, position, newArray, position + 3, array.length - position); 22 | 23 | return newArray; 24 | } 25 | 26 | static long[] removeFromSortedArray(long[] array, long[] triple) { 27 | int position = 0; 28 | while (position < array.length && (array[position] != triple[0] || array[position + 1] != triple[1] || array[position + 2] != triple[2])) { 29 | position += 3; 30 | } 31 | 32 | //Case where it is not in 33 | if (position >= array.length) { 34 | return array; 35 | } 36 | 37 | //We build the new array 38 | long[] newArray = new long[array.length - 3]; 39 | System.arraycopy(array, 0, newArray, 0, 
position); 40 | System.arraycopy(array, position + 3, newArray, position, array.length - position - 3); 41 | return newArray; 42 | } 43 | } 44 | -------------------------------------------------------------------------------- /src/main/java/org/wikidata/history/sparql/ExprValueConverter.java: -------------------------------------------------------------------------------- 1 | package org.wikidata.history.sparql; 2 | 3 | import org.eclipse.rdf4j.model.*; 4 | import org.eclipse.rdf4j.query.BindingSet; 5 | import org.eclipse.rdf4j.query.Dataset; 6 | import org.eclipse.rdf4j.query.algebra.TupleExpr; 7 | import org.eclipse.rdf4j.query.algebra.Var; 8 | import org.eclipse.rdf4j.query.algebra.evaluation.QueryOptimizer; 9 | import org.eclipse.rdf4j.query.algebra.helpers.AbstractQueryModelVisitor; 10 | 11 | final class ExprValueConverter implements QueryOptimizer { 12 | 13 | private final ValueFactory valueFactory; 14 | 15 | ExprValueConverter(ValueFactory valueFactory) { 16 | this.valueFactory = valueFactory; 17 | } 18 | 19 | public void optimize(TupleExpr tupleExpr, Dataset dataset, BindingSet bindings) { 20 | if (valueFactory instanceof NumericValueFactory) { 21 | tupleExpr.visit(new ClosureVisitor((NumericValueFactory) valueFactory)); 22 | } 23 | } 24 | 25 | protected static class ClosureVisitor extends AbstractQueryModelVisitor { 26 | 27 | private final NumericValueFactory valueFactory; 28 | 29 | ClosureVisitor(NumericValueFactory valueFactory) { 30 | this.valueFactory = valueFactory; 31 | } 32 | 33 | @Override 34 | public void meet(Var var) throws RuntimeException { 35 | Value value = var.getValue(); 36 | if (value instanceof IRI) { 37 | var.setValue(valueFactory.createIRI((IRI) value)); 38 | } else if (value instanceof BNode) { 39 | var.setValue(valueFactory.createBNode((BNode) value)); 40 | } else if (value instanceof Literal) { 41 | var.setValue(valueFactory.createLiteral((Literal) value)); 42 | } 43 | } 44 | } 45 | } 46 | -------------------------------------------------------------------------------- /src/test/resources/dump_file_sample.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | Q1 4 | 0 5 | 1 6 | 7 | 2 8 | 2004-03-22T10:24:41Z 9 | 10 | Foo 11 | 1 12 | 13 | root 14 | {"type": "item", "id": "Q1", "labels": {"fr": {"value": "foo", "language": "fr"}, "en": {"value": 15 | "bar", "language": "en"}, "de": {"value": "bar", "language": "de"}}} 16 | 17 | 18 | 19 | 9 20 | 2 21 | 2004-03-22T10:56:13Z 22 | 23 | Bar 24 | 2 25 | 26 | 27 | intro 28 | {"type": "item", "id": "Q1", "labels": {"fr": {"value": "foo", "language": "fr"}, "en": {"value": 29 | "foo", "language": "en"}, "es": {"value": "foo", "language": "es"}}} 30 | 31 | 32 | 33 | 11 34 | 2 35 | 2004-03-22T10:58:13Z 36 | 37 | Bar 38 | 2 39 | 40 | 41 | intro 42 | {"type": "item", "id": "Q1", "labels": {"fr": {"value": "foo", "language": "fr"}, "en": {"value": 43 | "foo", "language": "en"}, "de": {"value": "bar", "language": "de"}, "es": {"value": "foo", "language": 44 | "es"}}} 45 | 46 | 47 | 48 | -------------------------------------------------------------------------------- /src/main/java/org/wikidata/history/web/QueryLogger.java: -------------------------------------------------------------------------------- 1 | package org.wikidata.history.web; 2 | 3 | import org.slf4j.Logger; 4 | import org.slf4j.LoggerFactory; 5 | 6 | import java.io.BufferedWriter; 7 | import java.io.IOException; 8 | import java.nio.file.Files; 9 | import java.nio.file.Path; 10 | import 
java.nio.file.StandardOpenOption; 11 | import java.time.LocalDate; 12 | import java.util.ArrayList; 13 | import java.util.Collections; 14 | import java.util.List; 15 | 16 | class QueryLogger implements AutoCloseable { 17 | private final static Logger LOGGER = LoggerFactory.getLogger(QueryLogger.class); 18 | private final static int MAX_CACHE_SIZE = 128; 19 | 20 | private final Path logDirectory; 21 | private final List buffer = new ArrayList<>(); 22 | private LocalDate bufferLocalDate = LocalDate.now(); 23 | 24 | QueryLogger(Path logDirectory) throws IOException { 25 | if (!Files.exists(logDirectory)) { 26 | Files.createDirectory(logDirectory); 27 | } 28 | if (!Files.isDirectory(logDirectory)) { 29 | throw new IOException("The element " + logDirectory + " is not a directory."); 30 | } 31 | this.logDirectory = logDirectory; 32 | } 33 | 34 | synchronized void logQuery(String query) { 35 | LocalDate date = LocalDate.now(); 36 | if (!bufferLocalDate.equals(date)) { 37 | writeCache(); 38 | bufferLocalDate = date; 39 | } 40 | buffer.add(query.replaceAll("\\s+", " ")); 41 | if (buffer.size() > MAX_CACHE_SIZE) { 42 | writeCache(); 43 | } 44 | } 45 | 46 | private void writeCache() { 47 | if (buffer.isEmpty()) { 48 | return; 49 | } 50 | 51 | Collections.shuffle(buffer); 52 | 53 | Path logFile = logDirectory.resolve(bufferLocalDate.toString() + ".txt"); 54 | try (BufferedWriter writer = Files.newBufferedWriter(logFile, StandardOpenOption.CREATE, StandardOpenOption.APPEND)) { 55 | for (String query : buffer) { 56 | writer.write(query); 57 | writer.write('\n'); 58 | } 59 | } catch (IOException e) { 60 | LOGGER.error(e.getMessage(), e); 61 | } 62 | 63 | buffer.clear(); 64 | } 65 | 66 | @Override 67 | public void close() { 68 | writeCache(); 69 | } 70 | } 71 | -------------------------------------------------------------------------------- /src/test/resources/rdf/P2.nt: -------------------------------------------------------------------------------- 1 | . 2 | . 3 | . 4 | . 5 | . 6 | . 7 | . 8 | . 9 | . 10 | . 11 | . 12 | _:genid1 . 13 | . 14 | . 15 | . 16 | _:genid1 . 17 | _:genid1 . 18 | _:genid1 . 19 | "Instance of"@en . 
-------------------------------------------------------------------------------- /src/main/java/org/wikidata/history/preprocessor/FileHistoryOutput.java: -------------------------------------------------------------------------------- 1 | package org.wikidata.history.preprocessor; 2 | 3 | import org.eclipse.rdf4j.model.IRI; 4 | import org.eclipse.rdf4j.model.Resource; 5 | import org.eclipse.rdf4j.model.Value; 6 | import org.eclipse.rdf4j.rio.helpers.NTriplesUtil; 7 | 8 | import java.io.BufferedWriter; 9 | import java.io.IOException; 10 | import java.io.OutputStreamWriter; 11 | import java.io.Writer; 12 | import java.nio.file.Files; 13 | import java.nio.file.Path; 14 | import java.time.Instant; 15 | import java.util.Arrays; 16 | import java.util.stream.Collectors; 17 | import java.util.zip.GZIPOutputStream; 18 | 19 | public class FileHistoryOutput implements HistoryOutput { 20 | 21 | private final Writer revisionsWriter; 22 | private final Writer triplesWriter; 23 | 24 | 25 | public FileHistoryOutput(Path directory) throws IOException { 26 | revisionsWriter = gzipWriter(directory.resolve("revisions.tsv.gz")); 27 | triplesWriter = gzipWriter(directory.resolve("triples.tsv.gz")); 28 | } 29 | 30 | private Writer gzipWriter(Path path) throws IOException { 31 | return new BufferedWriter(new OutputStreamWriter(new GZIPOutputStream(Files.newOutputStream(path)))); 32 | } 33 | 34 | public synchronized void addRevision( 35 | long revisionId, long parentRevisionId, String entityId, Instant timestamp, String contributorName, String comment 36 | ) throws IOException { 37 | revisionsWriter 38 | .append(Long.toString(revisionId)).append('\t') 39 | .append(Long.toString(parentRevisionId)).append('\t') 40 | .append(entityId).append('\t') 41 | .append(Long.toString(timestamp.getEpochSecond())).append('\t') 42 | .append(contributorName).append('\t') 43 | .append((comment == null) ? "" : NTriplesUtil.escapeString(comment)).append('\n'); 44 | } 45 | 46 | public synchronized void addTriple(Resource subject, IRI predicate, Value object, long... revisionIds) throws IOException { 47 | triplesWriter.append(NTriplesUtil.toNTriplesString(subject)).append('\t') 48 | .append(NTriplesUtil.toNTriplesString(predicate)).append('\t') 49 | .append(NTriplesUtil.toNTriplesString(object)).append('\t') 50 | .append(Arrays.stream(revisionIds).mapToObj(Long::toString).collect(Collectors.joining(" "))).append("\n"); 51 | } 52 | 53 | @Override 54 | public void close() throws IOException { 55 | revisionsWriter.close(); 56 | triplesWriter.close(); 57 | } 58 | } 59 | -------------------------------------------------------------------------------- /src/main/resources/index.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | Wikidata History Query Service 6 | 7 | 9 | 10 | 11 | 16 | 17 | 18 |
19 | 37 |
38 |
39 |
40 |
41 |
42 | 43 | 44 | 62 | -------------------------------------------------------------------------------- /src/main/java/org/wikidata/history/sparql/LongRangeUtils.java: -------------------------------------------------------------------------------- 1 | package org.wikidata.history.sparql; 2 | 3 | import org.eclipse.collections.impl.list.mutable.primitive.LongArrayList; 4 | 5 | final class LongRangeUtils { 6 | 7 | static boolean isInRange(long element, long[] range) { 8 | for (int i = 0; i < range.length; i += 2) { 9 | if (range[i] <= element && element < range[i + 1]) { 10 | return true; 11 | } 12 | } 13 | return false; 14 | } 15 | 16 | static boolean isRangeStart(long element, long[] range) { 17 | for (int i = 0; i < range.length; i += 2) { 18 | if (range[i] == element) { 19 | return true; 20 | } 21 | } 22 | return false; 23 | } 24 | 25 | static boolean isRangeEnd(long element, long[] range) { 26 | for (int i = 1; i < range.length; i += 2) { 27 | if (range[i] == element) { 28 | return true; 29 | } 30 | } 31 | return false; 32 | } 33 | 34 | static long[] union(long[] a, long[] b) { 35 | if (a.length == 0) { 36 | return b; 37 | } else if (b.length == 0) { 38 | return a; 39 | } else if (a.length == 2 && b.length == 2) { 40 | //Simple case optimization 41 | if (a[1] < b[0]) { 42 | return new long[]{a[0], a[1], b[0], b[1]}; 43 | } else if (b[1] < a[0]) { 44 | return new long[]{b[0], b[1], a[0], a[1]}; 45 | } else { 46 | return new long[]{ 47 | Math.min(a[0], b[0]), 48 | Math.max(a[1], b[1]) 49 | }; 50 | } 51 | } else { 52 | LongArrayList result = new LongArrayList(); 53 | for (int i = 0, j = 0; i < a.length || j < b.length; ) { 54 | if (i < a.length && (j >= b.length || a[i] <= b[j])) { 55 | if (!result.isEmpty() && result.getLast() >= a[i]) { 56 | result.set(result.size() - 1, Math.max(result.getLast(), a[i + 1])); 57 | } else { 58 | result.add(a[i]); 59 | result.add(a[i + 1]); 60 | } 61 | i += 2; 62 | } else { 63 | if (!result.isEmpty() && result.getLast() >= b[j]) { 64 | result.set(result.size() - 1, Math.max(result.getLast(), b[j + 1])); 65 | } else { 66 | result.add(b[j]); 67 | result.add(b[j + 1]); 68 | } 69 | j += 2; 70 | } 71 | } 72 | return result.toArray(); 73 | } 74 | } 75 | 76 | static boolean isSorted(long[] array) { 77 | for (int i = 1; i < array.length; i++) { 78 | if (array[i] <= array[i - 1]) { 79 | return false; 80 | } 81 | } 82 | return true; 83 | } 84 | } 85 | -------------------------------------------------------------------------------- /src/main/java/org/wikidata/history/web/Main.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (C) 2017 Simple WD Developers 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 
15 | */ 16 | 17 | package org.wikidata.history.web; 18 | 19 | import io.javalin.Javalin; 20 | import org.apache.commons.cli.*; 21 | import org.wikidata.history.sparql.RocksTripleSource; 22 | 23 | import java.io.IOException; 24 | import java.nio.file.Path; 25 | import java.nio.file.Paths; 26 | 27 | public class Main { 28 | 29 | public static void main(String[] args) throws ParseException, IOException { 30 | Options options = new Options(); 31 | options.addOption("i", "index", true, "Directory where the index data are."); 32 | options.addOption("h", "host", true, "Host name"); 33 | options.addOption("p", "port", true, "Port to listen on"); 34 | options.addOption("l", "logFile", true, "Directory where the query logs are stored. By default query-log"); 35 | 36 | CommandLineParser parser = new DefaultParser(); 37 | CommandLine line = parser.parse(options, args); 38 | Path indexPath = Paths.get(line.getOptionValue("index", "wd-history-index")); 39 | Path queryLog = Paths.get(line.getOptionValue("logFile", "query-log")); 40 | 41 | String portString = line.getOptionValue("port", System.getenv("PORT")); 42 | int port = (portString != null) ? Integer.parseInt(portString) : 7000; 43 | 44 | RocksTripleSource tripleSource = new RocksTripleSource(indexPath); 45 | QueryLogger queryLogger = new QueryLogger(queryLog); 46 | SparqlEndpoint sparqlEndpoint = new SparqlEndpoint(tripleSource, queryLogger); 47 | Javalin javalin = Javalin.create() 48 | .get("", ctx -> ctx.contentType("text/html").result(Main.class.getResourceAsStream("/index.html"))) 49 | .get("/sparql", sparqlEndpoint::get) 50 | .post("/sparql", sparqlEndpoint::post) 51 | .get("/prefixes", ctx -> ctx.contentType("application/json").result(Main.class.getResourceAsStream("/prefixes.json"))) 52 | .start(port); 53 | 54 | Runtime.getRuntime().addShutdownHook(new Thread(() -> { 55 | javalin.stop(); 56 | queryLogger.close(); 57 | tripleSource.close(); 58 | })); 59 | } 60 | } 61 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | SPARQL endpoint for Wikidata history 2 | ==================================== 3 | 4 | This repository provides a SPARQL endpoint for the Wikidata edit history, allowing queries like "count the number of humans in Wikidata in 2015" or "how many contributors have added values for the sex or gender property". 5 | 6 | A [documentation page is available on Wikidata.org](https://www.wikidata.org/wiki/Wikidata:History_Query_Service). 7 | 8 | [![Build Status](https://travis-ci.org/Tpt/wikidata-sparql-history.svg?branch=master)](https://travis-ci.org/Tpt/wikidata-sparql-history) 9 | 10 | ## Developer documentation 11 | 12 | To set up a working endpoint: 13 | 14 | * Compile the Java program `mvn package` 15 | * Download the Wikidata history dumps to a directory `mkdir dumps && cd dumps && bash ../download_wd_history.sh`. Warning: it requires around 600GB of disk space. 16 | * Preprocess the dumps to get all revision metadata and triples annotated with their insertions and deletions (takes a few days and all your CPU cores): `java -server -jar target/sparql-endpoint-0.1-SNAPSHOT.jar -preprocess` 17 | * Build database indexes: `java -server -jar target/sparql-endpoint-0.1-SNAPSHOT.jar -load`. This task is mostly I/O bound. A (big) fast SSD helps a lot.
18 | * Start the web server `java -server -classpath target/sparql-endpoint-0.1-SNAPSHOT.jar org.wikidata.history.web.Main` 19 | 20 | ## License 21 | 22 | Copyright (C) 2019 Thomas Pellissier Tanon. 23 | 24 | This program is free software: you can redistribute it and/or modify it under the terms of the GNU Affero General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. 25 | 26 | This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more details. 27 | 28 | ## Citation 29 | 30 | If you want to cite this software in an academic publication, please use: 31 | 32 | Pellissier Tanon T., Suchanek F. (2019) Querying the Edit History of Wikidata. In: Hitzler P. et al. (eds) The Semantic Web: ESWC 2019 Satellite Events. ESWC 2019. Lecture Notes in Computer Science, vol 11762. Springer, Cham 33 | 34 | In BibTex: 35 | ```bibtex 36 | @inproceedings{wikdiataHistoryQueryService, 37 | author = {Thomas Pellissier Tanon and Fabian M. Suchanek}, 38 | title = {Querying the Edit History of Wikidata}, 39 | booktitle = {The Semantic Web: {ESWC} 2019 Satellite Events - {ESWC} 2019 Satellite 40 | Events, Portoro{\v{z}}, Slovenia, June 2-6, 2019, Revised Selected 41 | Papers}, 42 | pages = {161--166}, 43 | year = {2019}, 44 | doi = {10.1007/978-3-030-32327-1\_32} 45 | } 46 | ``` 47 | -------------------------------------------------------------------------------- /src/main/java/org/wikidata/history/preprocessor/WikidataPropertyInformation.java: -------------------------------------------------------------------------------- 1 | package org.wikidata.history.preprocessor; 2 | 3 | import org.eclipse.rdf4j.model.IRI; 4 | import org.eclipse.rdf4j.query.AbstractTupleQueryResultHandler; 5 | import org.eclipse.rdf4j.query.BindingSet; 6 | import org.eclipse.rdf4j.query.TupleQueryResultHandlerException; 7 | import org.eclipse.rdf4j.repository.RepositoryConnection; 8 | import org.eclipse.rdf4j.repository.sparql.SPARQLRepository; 9 | import org.wikidata.wdtk.datamodel.helpers.Datamodel; 10 | import org.wikidata.wdtk.datamodel.interfaces.DatatypeIdValue; 11 | import org.wikidata.wdtk.datamodel.interfaces.PropertyIdValue; 12 | 13 | import java.util.Collections; 14 | import java.util.HashMap; 15 | import java.util.Map; 16 | 17 | class WikidataPropertyInformation { 18 | 19 | private static final String WDQS_ENDPOINT = "https://query.wikidata.org/sparql"; 20 | private static final String USER_AGENT = "WikidataHistoryLoader/0.1"; 21 | private static final String QUERY = "SELECT ?property ?datatype ?uriPattern WHERE { ?property wikibase:propertyType ?datatype . 
OPTIONAL { ?property wdt:P1921 ?uriPattern }}"; 22 | 23 | private final Map datatypes = new HashMap<>(); 24 | private final Map uriPatterns = new HashMap<>(); 25 | 26 | WikidataPropertyInformation() { 27 | SPARQLRepository repository = new SPARQLRepository(WDQS_ENDPOINT); 28 | repository.setAdditionalHttpHeaders(Collections.singletonMap("User-Agent", USER_AGENT)); 29 | repository.init(); 30 | try (RepositoryConnection connection = repository.getConnection()) { 31 | connection.prepareTupleQuery(QUERY).evaluate(new AbstractTupleQueryResultHandler() { 32 | @Override 33 | public void handleSolution(BindingSet bindingSet) throws TupleQueryResultHandlerException { 34 | IRI propertyIRI = (IRI) bindingSet.getValue("property"); 35 | PropertyIdValue property = Datamodel.makePropertyIdValue(propertyIRI.getLocalName(), propertyIRI.getNamespace()); 36 | datatypes.put(property, Datamodel.makeDatatypeIdValue(bindingSet.getValue("datatype").stringValue())); 37 | if (bindingSet.hasBinding("uriPattern")) { 38 | uriPatterns.put(property, bindingSet.getValue("uriPattern").stringValue()); 39 | } 40 | } 41 | }); 42 | } 43 | repository.shutDown(); 44 | } 45 | 46 | DatatypeIdValue getDatatype(PropertyIdValue propertyId) { 47 | return datatypes.get(propertyId); 48 | } 49 | 50 | String getDatatypeIRI(PropertyIdValue propertyId) { 51 | DatatypeIdValue datatype = datatypes.get(propertyId); 52 | return datatype == null ? DatatypeIdValue.DT_STRING : datatype.getIri(); 53 | } 54 | 55 | String getUriPattern(PropertyIdValue propertyId) { 56 | return uriPatterns.get(propertyId); 57 | } 58 | } 59 | -------------------------------------------------------------------------------- /src/main/java/org/wikidata/history/web/ContentNegotiation.java: -------------------------------------------------------------------------------- 1 | package org.wikidata.history.web; 2 | 3 | import java.util.Arrays; 4 | import java.util.Comparator; 5 | import java.util.List; 6 | import java.util.Optional; 7 | 8 | /** 9 | * TODO: does not support parameters (charset...) 
10 | */ 11 | class ContentNegotiation { 12 | 13 | static Optional<String> negotiateAccept(String acceptHeader, List<String> available) throws IllegalArgumentException { 14 | if (acceptHeader == null) { 15 | acceptHeader = "*/*"; 16 | } 17 | 18 | return Arrays.stream(acceptHeader.split(",")) 19 | .map(MediaRange::parse) 20 | .sorted(Comparator.reverseOrder()) 21 | .flatMap(r -> available.stream().filter(r::match)) 22 | .findFirst(); 23 | } 24 | 25 | private static final class MediaRange implements Comparable<MediaRange> { 26 | final String type; 27 | final String subType; 28 | final float q; 29 | 30 | private MediaRange(String type, String subType, float q) { 31 | this.type = type; 32 | this.subType = subType; 33 | this.q = q; 34 | } 35 | 36 | static MediaRange parse(String mediaRange) throws IllegalArgumentException { 37 | float q = 1; 38 | 39 | String[] parts = mediaRange.split(";"); 40 | String[] mime = parts[0].split("/"); 41 | if (mime.length != 2) { 42 | throw new IllegalArgumentException("Invalid mime type: " + parts[0]); 43 | } 44 | String type = mime[0].trim(); 45 | String subType = mime[1].trim(); 46 | for (int i = 1; i < parts.length; i++) { 47 | String[] parameterParts = parts[i].split("="); 48 | if (parameterParts.length != 2) { 49 | throw new IllegalArgumentException("Invalid parameter: " + parts[i]); 50 | } 51 | String name = parameterParts[0].trim(); 52 | String value = parameterParts[1].trim(); 53 | if ("q".equals(name)) { 54 | try { 55 | q = Float.parseFloat(value); 56 | } catch (IllegalArgumentException e) { 57 | throw new IllegalArgumentException("q parameter value should be a float, found " + value); 58 | } 59 | if (q < 0 || q > 1) { 60 | throw new IllegalArgumentException("q parameter value should be between 0 and 1, found " + value); 61 | } 62 | } 63 | } 64 | return new MediaRange(type, subType, q); 65 | } 66 | 67 | boolean match(String mime) { 68 | String[] parts = mime.split("/", 2); 69 | return (type.equals("*") || type.equals(parts[0])) && (subType.equals("*") || subType.equals(parts[1])); 70 | } 71 | 72 | @Override 73 | public int compareTo(MediaRange mediaRange) { 74 | return Float.compare(q, mediaRange.q); 75 | } 76 | 77 | @Override 78 | public String toString() { 79 | return type + '/' + subType + "; q=" + q; 80 | } 81 | } 82 | } 83 | -------------------------------------------------------------------------------- /src/test/resources/rdf/P3.nt: -------------------------------------------------------------------------------- 1 | . 2 | . 3 | . 4 | . 5 | . 6 | . 7 | . 8 | . 9 | . 10 | . 11 | . 12 | . 13 | . 14 | . 15 | _:genid1 . 16 | . 17 | . 18 | . 19 | . 20 | . 21 | . 22 | _:genid1 . 23 | _:genid1 . 24 | _:genid1 . 25 | "Time"@en . 
-------------------------------------------------------------------------------- /src/main/java/org/wikidata/history/sparql/SimpleQueryPreparer.java: -------------------------------------------------------------------------------- 1 | package org.wikidata.history.sparql; 2 | 3 | import org.eclipse.rdf4j.common.iteration.CloseableIteration; 4 | import org.eclipse.rdf4j.query.BindingSet; 5 | import org.eclipse.rdf4j.query.Dataset; 6 | import org.eclipse.rdf4j.query.QueryEvaluationException; 7 | import org.eclipse.rdf4j.query.UpdateExecutionException; 8 | import org.eclipse.rdf4j.query.algebra.QueryRoot; 9 | import org.eclipse.rdf4j.query.algebra.TupleExpr; 10 | import org.eclipse.rdf4j.query.algebra.UpdateExpr; 11 | import org.eclipse.rdf4j.query.algebra.evaluation.AbstractQueryPreparer; 12 | import org.eclipse.rdf4j.query.algebra.evaluation.EvaluationStrategy; 13 | import org.eclipse.rdf4j.query.algebra.evaluation.QueryOptimizer; 14 | import org.eclipse.rdf4j.query.algebra.evaluation.TripleSource; 15 | import org.eclipse.rdf4j.query.algebra.evaluation.impl.*; 16 | import org.eclipse.rdf4j.repository.sparql.federation.SPARQLServiceResolver; 17 | 18 | public final class SimpleQueryPreparer extends AbstractQueryPreparer { 19 | private static final SPARQLServiceResolver SPARQL_SERVICE_RESOLVER = new SPARQLServiceResolver(); 20 | private static final EvaluationStatistics EVALUATION_STATISTICS = new HistoryEvaluationStatistics(); 21 | private static final QueryOptimizer[] SIMPLE_OPTIMIZERS = new QueryOptimizer[]{ 22 | new PropertyPathOptimizer(), 23 | new BindingAssigner(), 24 | new CompareOptimizer(), 25 | new ConjunctiveConstraintSplitter(), 26 | new DisjunctiveConstraintOptimizer(), 27 | new SameTermFilterOptimizer(), 28 | new QueryModelNormalizer(), 29 | new IterativeEvaluationOptimizer(), 30 | new FilterOptimizer(), 31 | new OrderLimitOptimizer(), 32 | new QueryJoinOptimizer(EVALUATION_STATISTICS) 33 | }; 34 | 35 | public SimpleQueryPreparer(TripleSource tripleSource) { 36 | super(tripleSource); 37 | } 38 | 39 | @Override 40 | protected CloseableIteration evaluate( 41 | TupleExpr tupleExpr, Dataset dataset, BindingSet bindings, boolean includeInferred, int maxExecutionTime 42 | ) throws QueryEvaluationException { 43 | tupleExpr = tupleExpr.clone(); 44 | if (!(tupleExpr instanceof QueryRoot)) { 45 | tupleExpr = new QueryRoot(tupleExpr); 46 | } 47 | 48 | EvaluationStrategy strategy = new ExtendedEvaluationStrategy(getTripleSource(), dataset, SPARQL_SERVICE_RESOLVER, 0L, EVALUATION_STATISTICS); 49 | 50 | for (QueryOptimizer optimizer : SIMPLE_OPTIMIZERS) { 51 | optimizer.optimize(tupleExpr, dataset, bindings); 52 | } 53 | new ConstantOptimizer(strategy).optimize(tupleExpr, dataset, bindings); 54 | 55 | new ExprValueConverter(getTripleSource().getValueFactory()).optimize(tupleExpr, dataset, bindings); 56 | 57 | //System.out.println("Query plan:\n" + tupleExpr); 58 | 59 | return strategy.evaluate(tupleExpr, bindings); 60 | } 61 | 62 | @Override 63 | protected void execute( 64 | UpdateExpr updateExpr, Dataset dataset, BindingSet bindings, boolean includeInferred, int maxExecutionTime 65 | ) throws UpdateExecutionException { 66 | throw new UpdateExecutionException("This repository is read only"); 67 | } 68 | } -------------------------------------------------------------------------------- /src/main/java/org/wikidata/history/sparql/RocksRevisionLoader.java: -------------------------------------------------------------------------------- 1 | package org.wikidata.history.sparql; 2 | 3 | import 
org.apache.commons.lang3.tuple.Pair; 4 | import org.slf4j.Logger; 5 | import org.slf4j.LoggerFactory; 6 | 7 | import java.io.BufferedReader; 8 | import java.io.IOException; 9 | import java.io.InputStreamReader; 10 | import java.nio.file.Files; 11 | import java.nio.file.Path; 12 | import java.util.Arrays; 13 | import java.util.Map; 14 | import java.util.zip.GZIPInputStream; 15 | 16 | public final class RocksRevisionLoader implements AutoCloseable { 17 | private static final Logger LOGGER = LoggerFactory.getLogger(RocksRevisionLoader.class); 18 | private static final long[] EMPTY_ARRAY = new long[]{}; 19 | 20 | private final NumericValueFactory valueFactory; 21 | private final RocksStore store; 22 | 23 | public RocksRevisionLoader(Path path) { 24 | LOGGER.info("Loading revision data to " + path); 25 | 26 | valueFactory = new NumericValueFactory(new NumericValueFactory.EmptyStringStore()); 27 | store = new RocksStore(path, false); 28 | } 29 | 30 | public void load(Path file) throws IOException { 31 | RocksStore.Index<Long, Long> revisionDateOutput = store.revisionDateIndex(); 32 | RocksStore.Index<Long, long[]> dateRevisionsOutput = store.dateRevisionsIndex(); 33 | RocksStore.Index<Long, Long> parentRevisionOutput = store.parentRevisionIndex(); 34 | RocksStore.Index<Long, Long> childRevisionOutput = store.childRevisionIndex(); 35 | RocksStore.Index<Long, Long> revisionTopicOutput = store.revisionTopicIndex(); 36 | RocksStore.Index<Long, long[]> topicRevisionsOutput = store.topicRevisionIndex(); 37 | RocksStore.Index<Long, String> revisionContributorOutput = store.revisionContributorIndex(); 38 | RocksStore.Index<Pair<String, Long>, Object> contributorRevisionsIndex = store.contributorRevisionsIndex(); 39 | 40 | try (BufferedReader reader = gzipReader(file)) { 41 | reader.lines().parallel().forEach(line -> { 42 | String[] parts = line.split("\t"); 43 | long revisionId = Long.parseLong(parts[0]); 44 | long parentRevisionId = Long.parseLong(parts[1]); 45 | long timestamp = Long.parseLong(parts[3]); 46 | String contributor = parts[4]; 47 | 48 | if (parentRevisionId >= 0) { 49 | parentRevisionOutput.put(revisionId, parentRevisionId); 50 | childRevisionOutput.put(parentRevisionId, revisionId); 51 | } 52 | 53 | try { 54 | long entity = valueFactory.encodeValue(valueFactory.createIRI(Vocabulary.WD_NAMESPACE, parts[2])); 55 | revisionTopicOutput.put(revisionId, entity); 56 | addToMultipleValuesIndex(topicRevisionsOutput, entity, revisionId); 57 | } catch (NotSupportedValueException e) { 58 | LOGGER.error(e.getMessage(), e); 59 | } 60 | 61 | revisionDateOutput.put(revisionId, timestamp); 62 | addToMultipleValuesIndex(dateRevisionsOutput, timestamp, revisionId); 63 | 64 | revisionContributorOutput.put(revisionId, contributor); 65 | contributorRevisionsIndex.put(Pair.of(contributor, revisionId), null); 66 | }); 67 | } 68 | 69 | LOGGER.info("Compacting store"); 70 | store.compact(); 71 | } 72 | 73 | private BufferedReader gzipReader(Path path) throws IOException { 74 | return new BufferedReader(new InputStreamReader(new GZIPInputStream(Files.newInputStream(path)))); 75 | } 76 | 77 | private <K> void addToMultipleValuesIndex(RocksStore.Index<K, long[]> index, K key, long value) { 78 | long[] otherValues = index.getOrDefault(key, EMPTY_ARRAY); 79 | long[] allValues = Arrays.copyOfRange(otherValues, 0, otherValues.length + 1); 80 | allValues[otherValues.length] = value; 81 | index.put(key, allValues); 82 | } 83 | 84 | @Override 85 | public void close() { 86 | valueFactory.close(); 87 | store.close(); 88 | } 89 | } 90 | -------------------------------------------------------------------------------- 
/src/main/java/org/wikidata/history/sparql/HistoryEvaluationStatistics.java: -------------------------------------------------------------------------------- 1 | package org.wikidata.history.sparql; 2 | 3 | import org.eclipse.rdf4j.model.IRI; 4 | import org.eclipse.rdf4j.model.Value; 5 | import org.eclipse.rdf4j.query.algebra.StatementPattern; 6 | import org.eclipse.rdf4j.query.algebra.evaluation.impl.EvaluationStatistics; 7 | 8 | import java.util.Arrays; 9 | import java.util.List; 10 | 11 | public class HistoryEvaluationStatistics extends EvaluationStatistics { 12 | 13 | private static final List REVISION_ATTRIBUTES = Arrays.asList( 14 | Vocabulary.HISTORY_GLOBAL_STATE, 15 | Vocabulary.HISTORY_ADDITIONS, 16 | Vocabulary.HISTORY_DELETIONS, 17 | Vocabulary.HISTORY_REVISION_ID, 18 | Vocabulary.HISTORY_PREVIOUS_REVISION, 19 | Vocabulary.HISTORY_NEXT_REVISION, 20 | Vocabulary.SCHEMA_DATE_CREATED, 21 | Vocabulary.SCHEMA_IS_BASED_ON 22 | ); 23 | 24 | @Override 25 | protected CardinalityCalculator createCardinalityCalculator() { 26 | return new HistoryCardinalityCalculator(); 27 | } 28 | 29 | protected static class HistoryCardinalityCalculator extends CardinalityCalculator { 30 | 31 | @Override 32 | public double getCardinality(StatementPattern sp) { 33 | Value context = (sp.getContextVar() == null) ? null : sp.getContextVar().getValue(); 34 | 35 | if (context != null) { 36 | if (!(context instanceof IRI)) { 37 | return 0; 38 | } 39 | String contextNamespace = ((IRI) context).getNamespace(); 40 | switch (contextNamespace) { 41 | case Vocabulary.REVISION_ADDITIONS_NAMESPACE: 42 | case Vocabulary.REVISION_DELETIONS_NAMESPACE: 43 | return 1; // The number of triple is always small, let's use it first 44 | case Vocabulary.REVISION_GLOBAL_STATE_NAMESPACE: 45 | return getDataTripleCardinality(sp.getSubjectVar().getValue(), sp.getPredicateVar().getValue(), sp.getObjectVar().getValue()); 46 | default: 47 | return 0; // Does not exists 48 | } 49 | } 50 | 51 | Value subject = sp.getSubjectVar().getValue(); 52 | Value predicate = sp.getPredicateVar().getValue(); 53 | Value object = sp.getObjectVar().getValue(); 54 | 55 | if (predicate != null && sp.getContextVar() == null) { 56 | if (REVISION_ATTRIBUTES.contains(predicate)) { 57 | return (subject != null || object != null) ? 1 : Integer.MAX_VALUE; 58 | } 59 | if (predicate.equals(Vocabulary.SCHEMA_ABOUT)) { 60 | if (subject != null) { 61 | return 1; 62 | } else if (object != null) { 63 | return 100; 64 | } else { 65 | return Integer.MAX_VALUE; 66 | } 67 | } 68 | if (predicate.equals(Vocabulary.SCHEMA_AUTHOR)) { 69 | if (subject != null) { 70 | return 1; 71 | } else if (object != null) { 72 | return 10000; 73 | } else { 74 | return Integer.MAX_VALUE; 75 | } 76 | } 77 | if (predicate.equals(Vocabulary.HISTORY_GLOBAL_STATE_AT)) { 78 | if (object != null) { 79 | return 1; 80 | } else { 81 | return Integer.MAX_VALUE; 82 | } 83 | } 84 | } 85 | 86 | //We are querying revision data without predicate 87 | if ( 88 | (subject instanceof IRI && Vocabulary.REVISION_NAMESPACE.equals(((IRI) subject).getNamespace())) || 89 | (object instanceof IRI && Vocabulary.REVISION_NAMESPACE.equals(((IRI) object).getNamespace())) 90 | ) { 91 | return 10; 92 | } 93 | 94 | return getDataTripleCardinality(subject, predicate, object); 95 | } 96 | 97 | private double getDataTripleCardinality(Value subject, Value predicate, Value object) { 98 | //TODO: improve with statistics 99 | if (subject != null) { 100 | return (predicate != null || object != null) ? 
1 : 100; 101 | } 102 | if (object != null) { 103 | return 100_000; 104 | } 105 | return 1_000_000_000; 106 | } 107 | } 108 | } 109 | -------------------------------------------------------------------------------- /src/main/java/org/wikidata/history/sparql/PropertyPathOptimizer.java: -------------------------------------------------------------------------------- 1 | package org.wikidata.history.sparql; 2 | 3 | import org.eclipse.rdf4j.query.BindingSet; 4 | import org.eclipse.rdf4j.query.Dataset; 5 | import org.eclipse.rdf4j.query.algebra.ArbitraryLengthPath; 6 | import org.eclipse.rdf4j.query.algebra.Join; 7 | import org.eclipse.rdf4j.query.algebra.StatementPattern; 8 | import org.eclipse.rdf4j.query.algebra.TupleExpr; 9 | import org.eclipse.rdf4j.query.algebra.evaluation.QueryOptimizer; 10 | import org.eclipse.rdf4j.query.algebra.helpers.AbstractQueryModelVisitor; 11 | 12 | import java.util.Objects; 13 | 14 | /** 15 | * Improves the property paths. Currently rewrites: 16 | * - p / p* to p+ 17 | * - p* / p to p+ 18 | */ 19 | final class PropertyPathOptimizer implements QueryOptimizer { 20 | 21 | public void optimize(TupleExpr tupleExpr, Dataset dataset, BindingSet bindings) { 22 | tupleExpr.visit(new ClosureVisitor()); 23 | } 24 | 25 | protected static class ClosureVisitor extends AbstractQueryModelVisitor { 26 | 27 | @Override 28 | public void meet(Join join) { 29 | super.meet(join); 30 | 31 | TupleExpr left = join.getLeftArg(); 32 | TupleExpr right = join.getRightArg(); 33 | 34 | if (left instanceof StatementPattern && right instanceof ArbitraryLengthPath) { 35 | StatementPattern leftPattern = (StatementPattern) left; 36 | ArbitraryLengthPath rightClosure = (ArbitraryLengthPath) right; 37 | 38 | if (rightClosure.getPathExpression() instanceof StatementPattern) { 39 | StatementPattern rightPattern = (StatementPattern) rightClosure.getPathExpression(); 40 | if (Objects.equals(leftPattern.getPredicateVar(), rightPattern.getPredicateVar()) && 41 | Objects.equals(leftPattern.getContextVar(), rightPattern.getContextVar()) && 42 | leftPattern.getScope() == rightPattern.getScope()) { 43 | if (leftPattern.getObjectVar().equals(rightPattern.getSubjectVar())) { 44 | join.replaceWith(new ArbitraryLengthPath( 45 | rightClosure.getScope(), 46 | leftPattern.getSubjectVar(), 47 | new StatementPattern( 48 | leftPattern.getScope(), 49 | leftPattern.getSubjectVar(), 50 | leftPattern.getPredicateVar(), 51 | rightClosure.getObjectVar(), 52 | rightClosure.getContextVar() 53 | ), 54 | rightClosure.getObjectVar(), 55 | rightClosure.getContextVar(), 56 | rightClosure.getMinLength() + 1 57 | )); 58 | } 59 | } 60 | } 61 | } else if (left instanceof ArbitraryLengthPath && right instanceof StatementPattern) { 62 | ArbitraryLengthPath leftClosure = (ArbitraryLengthPath) left; 63 | StatementPattern rightPattern = (StatementPattern) right; 64 | 65 | if (leftClosure.getPathExpression() instanceof StatementPattern) { 66 | StatementPattern leftPattern = (StatementPattern) leftClosure.getPathExpression(); 67 | if (Objects.equals(leftPattern.getPredicateVar(), rightPattern.getPredicateVar()) && 68 | Objects.equals(leftPattern.getContextVar(), rightPattern.getContextVar()) && 69 | leftPattern.getScope() == rightPattern.getScope()) { 70 | if (leftPattern.getObjectVar().equals(rightPattern.getSubjectVar())) { 71 | join.replaceWith(new ArbitraryLengthPath( 72 | leftClosure.getScope(), 73 | leftPattern.getSubjectVar(), 74 | new StatementPattern( 75 | leftPattern.getScope(), 76 | leftPattern.getSubjectVar(), 77 | 
leftPattern.getPredicateVar(), 78 | rightPattern.getObjectVar(), 79 | rightPattern.getContextVar() 80 | ), 81 | rightPattern.getObjectVar(), 82 | leftClosure.getContextVar(), 83 | leftClosure.getMinLength() + 1 84 | )); 85 | } 86 | } 87 | } 88 | } 89 | } 90 | } 91 | } 92 | -------------------------------------------------------------------------------- /src/test/java/org/wikidata/history/preprocessor/RevisionFileConverterTest.java: -------------------------------------------------------------------------------- 1 | package org.wikidata.history.preprocessor; 2 | 3 | import org.apache.commons.compress.compressors.bzip2.BZip2CompressorOutputStream; 4 | import org.apache.commons.io.IOUtils; 5 | import org.eclipse.rdf4j.model.*; 6 | import org.eclipse.rdf4j.model.impl.SimpleValueFactory; 7 | import org.eclipse.rdf4j.model.vocabulary.RDF; 8 | import org.eclipse.rdf4j.model.vocabulary.RDFS; 9 | import org.junit.jupiter.api.Assertions; 10 | import org.junit.jupiter.api.Test; 11 | import org.wikidata.history.sparql.Vocabulary; 12 | 13 | import java.io.IOException; 14 | import java.io.InputStream; 15 | import java.io.OutputStream; 16 | import java.nio.file.Files; 17 | import java.nio.file.Path; 18 | import java.time.Instant; 19 | import java.util.HashMap; 20 | import java.util.Map; 21 | 22 | class RevisionFileConverterTest { 23 | 24 | private static final Map EXPECTED_TRIPLES = new HashMap<>(); 25 | 26 | static { 27 | ValueFactory vf = SimpleValueFactory.getInstance(); 28 | EXPECTED_TRIPLES.put(vf.createStatement( 29 | vf.createIRI(Vocabulary.WD_NAMESPACE, "Q1"), 30 | RDF.TYPE, 31 | vf.createIRI(Vocabulary.WB_NAMESPACE, "Item") 32 | ), new long[]{2, Long.MAX_VALUE}); 33 | EXPECTED_TRIPLES.put(vf.createStatement( 34 | vf.createIRI(Vocabulary.WD_NAMESPACE, "Q1"), 35 | RDFS.LABEL, 36 | vf.createLiteral("foo", "fr") 37 | ), new long[]{2, Long.MAX_VALUE}); 38 | EXPECTED_TRIPLES.put(vf.createStatement( 39 | vf.createIRI(Vocabulary.WD_NAMESPACE, "Q1"), 40 | RDFS.LABEL, 41 | vf.createLiteral("bar", "en") 42 | ), new long[]{2, 9}); 43 | EXPECTED_TRIPLES.put(vf.createStatement( 44 | vf.createIRI(Vocabulary.WD_NAMESPACE, "Q1"), 45 | RDFS.LABEL, 46 | vf.createLiteral("foo", "en") 47 | ), new long[]{9, Long.MAX_VALUE}); 48 | EXPECTED_TRIPLES.put(vf.createStatement( 49 | vf.createIRI(Vocabulary.WD_NAMESPACE, "Q1"), 50 | RDFS.LABEL, 51 | vf.createLiteral("bar", "de") 52 | ), new long[]{2, 9, 11, Long.MAX_VALUE}); 53 | EXPECTED_TRIPLES.put(vf.createStatement( 54 | vf.createIRI(Vocabulary.WD_NAMESPACE, "Q1"), 55 | RDFS.LABEL, 56 | vf.createLiteral("foo", "es") 57 | ), new long[]{9, Long.MAX_VALUE}); 58 | } 59 | 60 | @Test 61 | void test() throws IOException, InterruptedException { 62 | ListHistoryOutput output = new ListHistoryOutput(); 63 | RevisionFileConverter revisionFileConverter = new RevisionFileConverter(output); 64 | revisionFileConverter.process(makeDumpFile()); 65 | assertMapEquals(EXPECTED_TRIPLES, output.triples); 66 | } 67 | 68 | private Path makeDumpFile() throws IOException { 69 | Path file = Files.createTempFile("foo", ".xml.bz2"); 70 | try ( 71 | InputStream input = getClass().getResourceAsStream("/dump_file_sample.xml"); 72 | OutputStream output = new BZip2CompressorOutputStream(Files.newOutputStream(file)) 73 | ) { 74 | IOUtils.copy(input, output); 75 | 76 | } 77 | return file; 78 | } 79 | 80 | private static final class ListHistoryOutput implements HistoryOutput { 81 | 82 | private final Map triples = new HashMap<>(); 83 | 84 | @Override 85 | public void addRevision(long revisionId, long 
parentRevisionId, String entityId, Instant timestamp, String contributorName, String comment) { 86 | } 87 | 88 | @Override 89 | public void addTriple(Resource subject, IRI predicate, Value object, long... revisionIds) { 90 | triples.put(SimpleValueFactory.getInstance().createStatement(subject, predicate, object), revisionIds); 91 | } 92 | 93 | @Override 94 | public void close() { 95 | 96 | } 97 | } 98 | 99 | private static void assertMapEquals(Map expected, Map actual) { 100 | Assertions.assertEquals(expected.size(), actual.size()); 101 | for (Map.Entry e : expected.entrySet()) { 102 | if (e.getValue() instanceof long[]) { 103 | Assertions.assertArrayEquals((long[]) e.getValue(), (long[]) actual.get(e.getKey())); 104 | } else { 105 | Assertions.assertEquals(e.getValue(), actual.get(e.getKey())); 106 | } 107 | } 108 | } 109 | } 110 | -------------------------------------------------------------------------------- /src/main/java/org/wikidata/history/sparql/Vocabulary.java: -------------------------------------------------------------------------------- 1 | package org.wikidata.history.sparql; 2 | 3 | import com.google.common.collect.Sets; 4 | import org.eclipse.rdf4j.model.IRI; 5 | import org.eclipse.rdf4j.model.impl.SimpleValueFactory; 6 | import org.eclipse.rdf4j.query.QueryEvaluationException; 7 | 8 | import java.util.Set; 9 | 10 | public final class Vocabulary { 11 | enum SnapshotType { 12 | NONE, 13 | GLOBAL_STATE, 14 | ADDITIONS, 15 | DELETIONS 16 | } 17 | 18 | private static final SimpleValueFactory VALUE_FACTORY = SimpleValueFactory.getInstance(); 19 | 20 | public static final IRI SCHEMA_ABOUT = VALUE_FACTORY.createIRI("http://schema.org/about"); 21 | public static final IRI SCHEMA_AUTHOR = VALUE_FACTORY.createIRI("http://schema.org/author"); 22 | public static final IRI SCHEMA_DATE_CREATED = VALUE_FACTORY.createIRI("http://schema.org/dateCreated"); 23 | public static final IRI SCHEMA_IS_BASED_ON = VALUE_FACTORY.createIRI("http://schema.org/isBasedOn"); 24 | 25 | public static final String WB_NAMESPACE = "http://wikiba.se/ontology#"; 26 | public static final String WBHISTORY_NAMESPACE = "http://wikiba.se/history/ontology#"; 27 | public static final IRI HISTORY_ADDITION = VALUE_FACTORY.createIRI(WBHISTORY_NAMESPACE, "addition"); 28 | public static final IRI HISTORY_DELETION = VALUE_FACTORY.createIRI(WBHISTORY_NAMESPACE, "deletion"); 29 | public static final IRI HISTORY_ADDITIONS = VALUE_FACTORY.createIRI(WBHISTORY_NAMESPACE, "additions"); 30 | public static final IRI HISTORY_DELETIONS = VALUE_FACTORY.createIRI(WBHISTORY_NAMESPACE, "deletions"); 31 | public static final IRI HISTORY_PREVIOUS_REVISION = VALUE_FACTORY.createIRI(WBHISTORY_NAMESPACE, "previousRevision"); 32 | public static final IRI HISTORY_NEXT_REVISION = VALUE_FACTORY.createIRI(WBHISTORY_NAMESPACE, "nextRevision"); 33 | public static final IRI HISTORY_GLOBAL_STATE = VALUE_FACTORY.createIRI(WBHISTORY_NAMESPACE, "globalState"); 34 | public static final IRI HISTORY_REVISION_ID = VALUE_FACTORY.createIRI(WBHISTORY_NAMESPACE, "revisionId"); 35 | public static final IRI HISTORY_GLOBAL_STATE_AT = VALUE_FACTORY.createIRI(WBHISTORY_NAMESPACE, "globalStateAt"); 36 | 37 | public static final String WD_NAMESPACE = "http://www.wikidata.org/entity/"; 38 | public static final String WDS_NAMESPACE = "http://www.wikidata.org/entity/statement/"; 39 | public static final String WDV_NAMESPACE = "http://www.wikidata.org/value/"; 40 | public static final String WDREF_NAMESPACE = "http://www.wikidata.org/reference/"; 41 | public static final 
String WDT_NAMESPACE = "http://www.wikidata.org/prop/direct/"; 42 | public static final String P_NAMESPACE = "http://www.wikidata.org/prop/"; 43 | public static final String WDNO_NAMESPACE = "http://www.wikidata.org/prop/novalue/"; 44 | public static final String PS_NAMESPACE = "http://www.wikidata.org/prop/statement/"; 45 | public static final String PSV_NAMESPACE = "http://www.wikidata.org/prop/statement/value/"; 46 | public static final String PQ_NAMESPACE = "http://www.wikidata.org/prop/qualifier/"; 47 | public static final String PQV_NAMESPACE = "http://www.wikidata.org/prop/qualifier/value/"; 48 | public static final String PR_NAMESPACE = "http://www.wikidata.org/prop/reference/"; 49 | public static final String PRV_NAMESPACE = "http://www.wikidata.org/prop/reference/value/"; 50 | 51 | public static final String REVISION_NAMESPACE = "http://www.wikidata.org/revision/"; 52 | public static final String REVISION_ADDITIONS_NAMESPACE = REVISION_NAMESPACE + "additions/"; 53 | public static final String REVISION_DELETIONS_NAMESPACE = REVISION_NAMESPACE + "deletions/"; 54 | public static final String REVISION_GLOBAL_STATE_NAMESPACE = REVISION_NAMESPACE + "global/"; 55 | 56 | public static final IRI CURRENT_GLOBAL_STATE = VALUE_FACTORY.createIRI(REVISION_GLOBAL_STATE_NAMESPACE, Long.toString(Long.MAX_VALUE / 256)); 57 | 58 | private static final Set REVISION_NAMESPACES = Sets.newHashSet(REVISION_NAMESPACE, REVISION_GLOBAL_STATE_NAMESPACE, REVISION_ADDITIONS_NAMESPACE, REVISION_DELETIONS_NAMESPACE); 59 | 60 | public static IRI toDirectProperty(IRI propertyIri) { 61 | if (!propertyIri.getNamespace().equals(WD_NAMESPACE) || !propertyIri.getLocalName().startsWith("P")) { 62 | throw new IllegalArgumentException("Not valid property IRI: " + propertyIri); 63 | } 64 | return VALUE_FACTORY.createIRI(WDT_NAMESPACE, propertyIri.getLocalName()); 65 | } 66 | 67 | public static IRI toGlobalState(IRI revisionIRI) { 68 | assertsInRevisionNamespace(revisionIRI); 69 | return VALUE_FACTORY.createIRI(Vocabulary.REVISION_GLOBAL_STATE_NAMESPACE, revisionIRI.getLocalName()); 70 | } 71 | 72 | public static IRI previousRevision(IRI revisionIRI) { 73 | assertsInRevisionNamespace(revisionIRI); 74 | return VALUE_FACTORY.createIRI(Vocabulary.REVISION_NAMESPACE, Integer.toString(Integer.parseInt(revisionIRI.getLocalName()) - 1)); 75 | } 76 | 77 | private static void assertsInRevisionNamespace(IRI revisionIRI) { 78 | if (!REVISION_NAMESPACES.contains(revisionIRI.getNamespace())) { 79 | throw new QueryEvaluationException("Not supported revision IRI: " + revisionIRI); 80 | } 81 | } 82 | } 83 | -------------------------------------------------------------------------------- /src/test/java/org/wikidata/history/preprocessor/RdfBuilderTest.java: -------------------------------------------------------------------------------- 1 | package org.wikidata.history.preprocessor; 2 | 3 | import com.fasterxml.jackson.databind.ObjectMapper; 4 | import org.eclipse.rdf4j.model.*; 5 | import org.eclipse.rdf4j.model.impl.LinkedHashModel; 6 | import org.eclipse.rdf4j.model.impl.SimpleValueFactory; 7 | import org.eclipse.rdf4j.model.util.Models; 8 | import org.eclipse.rdf4j.rio.RDFFormat; 9 | import org.eclipse.rdf4j.rio.Rio; 10 | import org.eclipse.rdf4j.rio.helpers.NTriplesUtil; 11 | import org.junit.jupiter.api.Assertions; 12 | import org.junit.jupiter.api.Test; 13 | import org.wikidata.wdtk.datamodel.helpers.Datamodel; 14 | import org.wikidata.wdtk.datamodel.helpers.DatamodelMapper; 15 | import 
org.wikidata.wdtk.datamodel.implementation.EntityDocumentImpl; 16 | import org.wikidata.wdtk.datamodel.interfaces.EntityDocument; 17 | import org.wikidata.wdtk.datamodel.interfaces.Sites; 18 | import org.wikidata.wdtk.dumpfiles.DumpProcessingController; 19 | 20 | import java.io.IOException; 21 | import java.util.Arrays; 22 | import java.util.List; 23 | import java.util.stream.Collectors; 24 | 25 | class RdfBuilderTest { 26 | 27 | private static final ObjectMapper OBJECT_MAPPER = new DatamodelMapper(Datamodel.SITE_WIKIDATA); 28 | private static final List HASHED_NAMESPACES = Arrays.asList( 29 | "http://www.wikidata.org/value/", 30 | "http://www.wikidata.org/reference/" 31 | ); 32 | 33 | private final Sites sites; 34 | private final WikidataPropertyInformation PROPERTY_INFORMATION; 35 | 36 | RdfBuilderTest() throws IOException { 37 | sites = (new DumpProcessingController("wikidatawiki")).getSitesInformation(); 38 | PROPERTY_INFORMATION = new WikidataPropertyInformation(); 39 | } 40 | 41 | @Test 42 | void testItemConversion() throws IOException { 43 | testEntityConversion("Q3"); 44 | testEntityConversion("Q4"); 45 | testEntityConversion("Q6"); 46 | testEntityConversion("Q7"); 47 | //TODO: date normalization testEntityConversion("Q8"); 48 | } 49 | 50 | @Test 51 | void testPropertyConversion() throws IOException { 52 | testEntityConversion("P2"); 53 | testEntityConversion("P3"); 54 | } 55 | 56 | private void testEntityConversion(String entityId) throws IOException { 57 | EntityDocument entity = OBJECT_MAPPER.readValue(getClass().getResource("/entities/" + entityId + ".json"), EntityDocumentImpl.class); 58 | Model expected = makeHashedBlankNodes(Rio.parse(getClass().getResourceAsStream("/rdf/" + entityId + ".nt"), "", RDFFormat.NTRIPLES)); 59 | ModelRdfOutput output = new ModelRdfOutput(); 60 | RdfBuilder rdfBuilder = new RdfBuilder(output, sites, PROPERTY_INFORMATION); 61 | rdfBuilder.addEntityDocument(entity); 62 | Model actual = makeHashedBlankNodes(output.getModel()); 63 | if (!Models.isomorphic(expected, actual)) { 64 | Assertions.fail("Mapping failed." + diff(expected, actual)); 65 | } 66 | } 67 | 68 | private Model makeHashedBlankNodes(Model model) { 69 | Model newModel = new LinkedHashModel(model.size()); 70 | model.forEach(statement -> newModel.add( 71 | (Resource) makeHashedBlankNodes(statement.getSubject()), 72 | (IRI) makeHashedBlankNodes(statement.getPredicate()), 73 | makeHashedBlankNodes(statement.getObject()) 74 | )); 75 | return newModel; 76 | } 77 | 78 | private Value makeHashedBlankNodes(Value value) { 79 | if (value instanceof IRI) { 80 | IRI iri = (IRI) value; 81 | if (HASHED_NAMESPACES.contains(iri.getNamespace())) { 82 | return SimpleValueFactory.getInstance().createBNode(iri.stringValue()); 83 | } 84 | } 85 | return value; 86 | } 87 | 88 | private String diff(Model expected, Model actual) { 89 | Model missing = new LinkedHashModel(); 90 | Model extra = new LinkedHashModel(); 91 | for (Statement statement : expected) { 92 | if (notInWithoutBNode(actual, statement)) { 93 | missing.add(statement); 94 | } 95 | } 96 | for (Statement statement : actual) { 97 | if (notInWithoutBNode(expected, statement)) { 98 | extra.add(statement); 99 | } 100 | } 101 | return "\nMissing:\n" + toNt(missing) + "\nExtra:\n" + toNt(extra); 102 | } 103 | 104 | private boolean notInWithoutBNode(Model model, Statement statement) { 105 | return model.filter( 106 | statement.getSubject() instanceof BNode ? 
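// Blank nodes act as wildcards in this lookup, so differences in blank-node identity do not show up in the diff.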
null : statement.getSubject(), 107 | statement.getPredicate(), 108 | statement.getObject() instanceof BNode ? null : statement.getObject() 109 | ).isEmpty(); 110 | } 111 | 112 | private String toNt(Model model) { 113 | return model.stream().map(statement -> NTriplesUtil.toNTriplesString(statement.getSubject()) + " " + 114 | NTriplesUtil.toNTriplesString(statement.getPredicate()) + " " + 115 | NTriplesUtil.toNTriplesString(statement.getObject()) + " . " 116 | ).sorted().collect(Collectors.joining("\n")); 117 | } 118 | 119 | private static final class ModelRdfOutput implements RdfBuilder.RdfOutput { 120 | private final Model model = new LinkedHashModel(); 121 | 122 | @Override 123 | public void outputStatement(Statement statement) { 124 | model.add(statement); 125 | } 126 | 127 | Model getModel() { 128 | return model; 129 | } 130 | } 131 | } 132 | -------------------------------------------------------------------------------- /src/main/java/org/wikidata/history/preprocessor/WikibaseValueHasher.java: -------------------------------------------------------------------------------- 1 | package org.wikidata.history.preprocessor; 2 | 3 | import org.eclipse.rdf4j.model.IRI; 4 | import org.wikidata.wdtk.datamodel.interfaces.*; 5 | 6 | import java.math.BigDecimal; 7 | import java.nio.charset.StandardCharsets; 8 | import java.security.MessageDigest; 9 | import java.security.NoSuchAlgorithmException; 10 | import java.util.Comparator; 11 | import java.util.List; 12 | 13 | class WikibaseValueHasher { 14 | private static final char[] HEX_ARRAY = "0123456789abcdef".toCharArray(); 15 | 16 | private final MessageDigest digest; 17 | 18 | WikibaseValueHasher() { 19 | try { 20 | digest = MessageDigest.getInstance("SHA-1"); 21 | } catch (NoSuchAlgorithmException e) { 22 | throw new RuntimeException(e); 23 | } 24 | } 25 | 26 | String hash(Reference reference) { 27 | add(reference); 28 | return digestAndReset(); 29 | } 30 | 31 | String hash(IRI subjectIRI, PropertyIdValue property) { 32 | add(subjectIRI.stringValue()); 33 | add(property); 34 | return digestAndReset(); 35 | } 36 | 37 | String hash(Value value) { 38 | add(value); 39 | return digestAndReset(); 40 | } 41 | 42 | private void add(Reference reference) { 43 | List groups = reference.getSnakGroups(); 44 | groups.sort(Comparator.comparing(a -> a.getProperty().getId())); 45 | for (SnakGroup group : groups) { 46 | //TODO: sort snaks 47 | for (Snak snak : group) { 48 | add(snak); 49 | } 50 | } 51 | } 52 | 53 | private void add(Snak snak) { 54 | if (snak instanceof ValueSnak) { 55 | add(((ValueSnak) snak)); 56 | } else if (snak instanceof SomeValueSnak) { 57 | add((SomeValueSnak) snak); 58 | } else if (snak instanceof NoValueSnak) { 59 | add((NoValueSnak) snak); 60 | } else { 61 | throw new IllegalArgumentException("Unexpected snak type: " + snak); 62 | } 63 | } 64 | 65 | private void add(ValueSnak snak) { 66 | add((byte) 0); 67 | add(snak.getPropertyId()); 68 | add(snak.getValue()); 69 | } 70 | 71 | private void add(SomeValueSnak snak) { 72 | add((byte) 1); 73 | add(snak.getPropertyId()); 74 | } 75 | 76 | private void add(NoValueSnak snak) { 77 | add((byte) 2); 78 | add(snak.getPropertyId()); 79 | } 80 | 81 | private void add(Value value) { 82 | if (value instanceof EntityIdValue) { 83 | add(((EntityIdValue) value)); 84 | } else if (value instanceof StringValue) { 85 | add((StringValue) value); 86 | } else if (value instanceof MonolingualTextValue) { 87 | add((MonolingualTextValue) value); 88 | } else if (value instanceof TimeValue) { 89 | add((TimeValue) 
value); 90 | } else if (value instanceof GlobeCoordinatesValue) { 91 | add((GlobeCoordinatesValue) value); 92 | } else if (value instanceof QuantityValue) { 93 | add((QuantityValue) value); 94 | } else { 95 | throw new IllegalArgumentException("Unexpected value type: " + value); 96 | } 97 | } 98 | 99 | private void add(EntityIdValue value) { 100 | add(value.getId()); 101 | } 102 | 103 | private void add(MonolingualTextValue value) { 104 | add(value.getText()); 105 | add(value.getLanguageCode()); 106 | } 107 | 108 | private void add(StringValue value) { 109 | add(value.getString()); 110 | 111 | } 112 | 113 | private void add(TimeValue value) { 114 | add(value.getYear()); 115 | add(value.getMonth()); 116 | add(value.getDay()); 117 | add(value.getHour()); 118 | add(value.getMinute()); 119 | add(value.getSecond()); 120 | add(value.getPrecision()); 121 | add(value.getTimezoneOffset()); 122 | add(value.getPreferredCalendarModel()); 123 | } 124 | 125 | private void add(GlobeCoordinatesValue value) { 126 | add(value.getLatitude()); 127 | add(value.getLongitude()); 128 | add(value.getPrecision()); 129 | add(value.getGlobe()); 130 | } 131 | 132 | 133 | private void add(QuantityValue value) { 134 | add(value.getNumericValue()); 135 | add(value.getUpperBound()); 136 | add(value.getLowerBound()); 137 | add(value.getUnit()); 138 | } 139 | 140 | private void add(String value) { 141 | if (value != null) { 142 | digest.update(value.getBytes(StandardCharsets.UTF_8)); 143 | } 144 | } 145 | 146 | private void add(byte value) { 147 | digest.update(value); 148 | } 149 | 150 | private void add(int value) { 151 | for (byte i = 0; i < Integer.BYTES; i++) { 152 | digest.update((byte) value); 153 | value >>>= Byte.SIZE; 154 | } 155 | } 156 | 157 | private void add(long value) { 158 | for (byte i = 0; i < Long.BYTES; i++) { 159 | digest.update((byte) value); 160 | value >>>= Byte.SIZE; 161 | } 162 | } 163 | 164 | private void add(double value) { 165 | add(Double.doubleToLongBits(value)); 166 | } 167 | 168 | private void add(BigDecimal value) { 169 | if (value != null) { 170 | add(value.toString()); 171 | } 172 | } 173 | 174 | private String digestAndReset() { 175 | String value = bytesToHex(digest.digest()); 176 | digest.reset(); 177 | return value; 178 | } 179 | 180 | private static String bytesToHex(byte[] bytes) { 181 | char[] hexChars = new char[bytes.length * 2]; 182 | for (int j = 0; j < bytes.length; j++) { 183 | int v = bytes[j] & 0xFF; 184 | hexChars[j * 2] = HEX_ARRAY[v >>> 4]; 185 | hexChars[j * 2 + 1] = HEX_ARRAY[v & 0x0F]; 186 | } 187 | return new String(hexChars); 188 | } 189 | } 190 | -------------------------------------------------------------------------------- /src/main/java/org/wikidata/history/Main.java: -------------------------------------------------------------------------------- 1 | package org.wikidata.history; 2 | 3 | import org.apache.commons.cli.*; 4 | import org.eclipse.rdf4j.query.resultio.text.tsv.SPARQLResultsTSVWriterFactory; 5 | import org.slf4j.Logger; 6 | import org.slf4j.LoggerFactory; 7 | import org.wikidata.history.preprocessor.FileHistoryOutput; 8 | import org.wikidata.history.preprocessor.RevisionFileConverter; 9 | import org.wikidata.history.sparql.HistoryRepository; 10 | import org.wikidata.history.sparql.RocksRevisionLoader; 11 | import org.wikidata.history.sparql.RocksTripleLoader; 12 | 13 | import java.io.BufferedWriter; 14 | import java.io.IOException; 15 | import java.nio.file.Files; 16 | import java.nio.file.Path; 17 | import java.nio.file.Paths; 18 | import 
java.util.List; 19 | import java.util.concurrent.ExecutionException; 20 | import java.util.concurrent.ExecutorService; 21 | import java.util.concurrent.Executors; 22 | import java.util.concurrent.Future; 23 | import java.util.stream.Collectors; 24 | 25 | public class Main { 26 | private static final Logger LOGGER = LoggerFactory.getLogger(Main.class); 27 | 28 | public static void main(String[] args) throws ParseException, IOException, InterruptedException { 29 | Options options = new Options(); 30 | options.addOption("p", "preprocess", false, "Preprocess the data from Wikidata history XML dump compressed with bz2"); 31 | options.addOption("l", "load", false, "Build database indexes from the preprocessed data"); 32 | options.addOption("q", "sparql", true, "SPARQL query to execute"); 33 | 34 | options.addOption("dd", "dumps-dir", true, "Directory to preprocess data from."); 35 | options.addOption("pd", "preprocessed-dir", true, "Directory where preprocessed data are."); 36 | options.addOption("id", "index-dir", true, "Directory where index data are."); 37 | options.addOption("t", "triples-only", false, "Load only triples"); 38 | options.addOption("wdt", "wdt-only", false, "Load only wdt: and owl:sameAs relations"); 39 | 40 | CommandLineParser parser = new DefaultParser(); 41 | CommandLine line = parser.parse(options, args); 42 | 43 | Path dumpsDir = Paths.get(line.getOptionValue("dumps-dir", "dumps")); 44 | Path preprocessedDir = Paths.get(line.getOptionValue("preprocessed-dir", "wd-preprocessed")); 45 | Path indexDir = Paths.get(line.getOptionValue("index-dir", "wd-history-index")); 46 | 47 | if (line.hasOption("preprocess")) { 48 | System.setProperty("jdk.xml.entityExpansionLimit", String.valueOf(Integer.MAX_VALUE)); 49 | System.setProperty("jdk.xml.totalEntitySizeLimit", String.valueOf(Integer.MAX_VALUE)); 50 | if (!Files.isDirectory(preprocessedDir)) { 51 | Files.createDirectories(preprocessedDir); 52 | } 53 | 54 | ExecutorService executorService = Executors.newFixedThreadPool( 55 | Runtime.getRuntime().availableProcessors() 56 | ); 57 | try ( 58 | FileHistoryOutput historyOutput = new FileHistoryOutput(preprocessedDir); 59 | BufferedWriter log = Files.newBufferedWriter(preprocessedDir.resolve("logs.txt")) 60 | ) { 61 | RevisionFileConverter revisionFileConverter = new RevisionFileConverter(historyOutput); 62 | List files = Files.walk(dumpsDir) 63 | .filter(file -> file.toString().endsWith(".bz2")) 64 | .collect(Collectors.toList()); 65 | System.out.println("Loading " + files.size() + " files."); 66 | List> futures = files.stream().map(file -> executorService.submit(() -> { 67 | try { 68 | revisionFileConverter.process(file); 69 | log.write(file + "\tok\n"); 70 | } catch (Exception e) { 71 | LOGGER.error(e.getMessage(), e); 72 | try { 73 | log.write(file.toString() + "\terror\t" + e.getMessage() + "\n"); 74 | } catch (IOException e2) { 75 | throw new RuntimeException(e2); 76 | } 77 | } 78 | })).collect(Collectors.toList()); 79 | for (Future future : futures) { 80 | try { 81 | future.get(); 82 | } catch (ExecutionException e) { 83 | LOGGER.error(e.toString(), e); 84 | } 85 | } 86 | } finally { 87 | executorService.shutdown(); 88 | } 89 | } 90 | 91 | if (line.hasOption("load")) { 92 | if (!Files.isDirectory(indexDir)) { 93 | Files.createDirectories(indexDir); 94 | } 95 | 96 | if (!line.hasOption("triples-only")) { 97 | Path revisionsFile = preprocessedDir.resolve("revisions.tsv.gz"); 98 | if (Files.exists(revisionsFile)) { 99 | try (RocksRevisionLoader loader = new 
RocksRevisionLoader(indexDir)) { 100 | loader.load(revisionsFile); 101 | } 102 | } else { 103 | LOGGER.warn("Skipping revisions loading " + revisionsFile + " does not exists"); 104 | } 105 | } 106 | 107 | Path triplesFile = preprocessedDir.resolve("triples.tsv.gz"); 108 | if (Files.exists(triplesFile)) { 109 | try (RocksTripleLoader loader = new RocksTripleLoader(indexDir, options.hasOption("wdt-only"))) { 110 | loader.load(triplesFile); 111 | } 112 | } else { 113 | LOGGER.warn("Skipping revisions loading " + triplesFile + " does not exists"); 114 | } 115 | } 116 | 117 | if (line.hasOption("sparql")) { 118 | try (HistoryRepository historyRepository = new HistoryRepository(indexDir)) { 119 | historyRepository.getConnection().prepareTupleQuery(line.getOptionValue("sparql")) 120 | .evaluate((new SPARQLResultsTSVWriterFactory()).getWriter(System.out)); 121 | } 122 | } 123 | } 124 | } 125 | -------------------------------------------------------------------------------- /src/test/resources/rdf/Q6.nt: -------------------------------------------------------------------------------- 1 | . 2 | "string" . 3 | . 4 | . 5 | . 6 | . 7 | . 8 | . 9 | . 10 | . 11 | "Point(67.25 12.125)"^^ . 12 | _:genid1 . 13 | "\u0431\u0440\u0435\u0434"@ru . 14 | "\u043F\u0440\u0435\u0432\u0435\u0434"@ru . 15 | "+19.768000000000000682121026329696178436279296875"^^ . 16 | "simplestring" . 17 | "-0200-00-00T00:00:00Z"^^ . 18 | . 19 | . 20 | . 21 | . 22 | . 23 | "string" . 24 | . 25 | "12.125"^^ . 26 | "67.25"^^ . 27 | "0.0625"^^ . 28 | . 29 | "+19.768000000000000682121026329696178436279296875"^^ . 30 | "+19.766999999999999459987520822323858737945556640625"^^ . 31 | . 32 | "+19.76899999999999835154085303656756877899169921875"^^ . 33 | . 34 | . 35 | "9"^^ . 36 | "0"^^ . 37 | "-0200-00-00T00:00:00Z"^^ . 38 | . 
39 | -------------------------------------------------------------------------------- /src/test/resources/entities/Q6.json: -------------------------------------------------------------------------------- 1 | { 2 | "id": "Q6", 3 | "type": "item", 4 | "claims": { 5 | "P7": [ 6 | { 7 | "id": "TEST-Qualifiers", 8 | "mainsnak": { 9 | "snaktype": "value", 10 | "property": "P7", 11 | "datatype": "string", 12 | "datavalue": { 13 | "value": "string", 14 | "type": "string" 15 | } 16 | }, 17 | "qualifiers": { 18 | "P2": [ 19 | { 20 | "snaktype": "value", 21 | "property": "P2", 22 | "datatype": "wikibase-entityid", 23 | "datavalue": { 24 | "value": { 25 | "entity-type": "item", 26 | "numeric-id": 42 27 | }, 28 | "type": "wikibase-entityid" 29 | } 30 | }, 31 | { 32 | "snaktype": "value", 33 | "property": "P2", 34 | "datatype": "wikibase-entityid", 35 | "datavalue": { 36 | "value": { 37 | "entity-type": "item", 38 | "numeric-id": 666 39 | }, 40 | "type": "wikibase-entityid" 41 | } 42 | } 43 | ], 44 | "P18": [ 45 | { 46 | "snaktype": "value", 47 | "property": "P18", 48 | "datatype": "commonsMedia", 49 | "datavalue": { 50 | "value": "Universe.svg", 51 | "type": "string" 52 | } 53 | }, 54 | { 55 | "snaktype": "novalue", 56 | "property": "P18" 57 | } 58 | ], 59 | "P4": [ 60 | { 61 | "snaktype": "value", 62 | "property": "P4", 63 | "datatype": "globecoordinate", 64 | "datavalue": { 65 | "value": { 66 | "latitude": 12.125, 67 | "longitude": 67.25, 68 | "precision": 0.0625, 69 | "globe": "http:\/\/www.wikidata.org\/entity\/Q2" 70 | }, 71 | "type": "globecoordinate" 72 | } 73 | } 74 | ], 75 | "P5": [ 76 | { 77 | "snaktype": "value", 78 | "property": "P5", 79 | "datatype": "monolingualtext", 80 | "datavalue": { 81 | "value": { 82 | "text": "\u043f\u0440\u0435\u0432\u0435\u0434", 83 | "language": "ru" 84 | }, 85 | "type": "monolingualtext" 86 | } 87 | }, 88 | { 89 | "snaktype": "somevalue", 90 | "property": "P5" 91 | }, 92 | { 93 | "snaktype": "value", 94 | "property": "P5", 95 | "datatype": "monolingualtext", 96 | "datavalue": { 97 | "value": { 98 | "text": "\u0431\u0440\u0435\u0434", 99 | "language": "ru" 100 | }, 101 | "type": "monolingualtext" 102 | } 103 | } 104 | ], 105 | "P6": [ 106 | { 107 | "snaktype": "value", 108 | "property": "P6", 109 | "datatype": "quantity", 110 | "datavalue": { 111 | "value": { 112 | "amount": "+19.768000000000000682121026329696178436279296875", 113 | "unit": "1", 114 | "upperBound": "+19.76899999999999835154085303656756877899169921875", 115 | "lowerBound": "+19.766999999999999459987520822323858737945556640625" 116 | }, 117 | "type": "quantity" 118 | } 119 | } 120 | ], 121 | "P7": [ 122 | { 123 | "snaktype": "value", 124 | "property": "P7", 125 | "datatype": "string", 126 | "datavalue": { 127 | "value": "simplestring", 128 | "type": "string" 129 | } 130 | } 131 | ], 132 | "P8": [ 133 | { 134 | "snaktype": "value", 135 | "property": "P8", 136 | "datatype": "time", 137 | "datavalue": { 138 | "value": { 139 | "time": "-0200-00-00T00:00:00Z", 140 | "timezone": 0, 141 | "before": 0, 142 | "after": 0, 143 | "precision": 9, 144 | "calendarmodel": "http:\/\/www.wikidata.org\/entity\/Q1985727" 145 | }, 146 | "type": "time" 147 | } 148 | } 149 | ], 150 | "P856": [ 151 | { 152 | "snaktype": "value", 153 | "property": "P856", 154 | "datatype": "url", 155 | "datavalue": { 156 | "value": "http:\/\/url.acme.test\/", 157 | "type": "string" 158 | } 159 | }, 160 | { 161 | "snaktype": "value", 162 | "property": "P856", 163 | "datatype": "url", 164 | "datavalue": { 165 | "value": " 
http:\/\/url.acme2.test\/\n", 166 | "type": "string" 167 | } 168 | } 169 | ] 170 | }, 171 | "qualifiers-order": [ 172 | "P2", 173 | "P18", 174 | "P4", 175 | "P5", 176 | "P6", 177 | "P7", 178 | "P8", 179 | "P856" 180 | ], 181 | "type": "statement", 182 | "rank": "normal" 183 | } 184 | ] 185 | } 186 | } 187 | -------------------------------------------------------------------------------- /pom.xml: -------------------------------------------------------------------------------- 1 | 2 | 5 | 4.0.0 6 | 7 | org.wikidata.history 8 | sparql-endpoint 9 | 0.1-SNAPSHOT 10 | 11 | 12 | 13 | org.apache.commons 14 | commons-compress 15 | 1.21 16 | 17 | 18 | commons-cli 19 | commons-cli 20 | 1.5.0 21 | 22 | 23 | org.wikidata.wdtk 24 | wdtk-dumpfiles 25 | 0.12.1 26 | 27 | 28 | org.eclipse.rdf4j 29 | rdf4j-queryparser-sparql 30 | 3.7.4 31 | 32 | 33 | org.eclipse.rdf4j 34 | rdf4j-queryalgebra-evaluation 35 | 3.7.4 36 | 37 | 38 | org.eclipse.rdf4j 39 | rdf4j-queryresultio-sparqljson 40 | 3.7.4 41 | 42 | 43 | org.eclipse.rdf4j 44 | rdf4j-queryresultio-sparqlxml 45 | 3.7.4 46 | 47 | 48 | org.eclipse.rdf4j 49 | rdf4j-queryresultio-text 50 | 3.7.4 51 | 52 | 53 | org.eclipse.rdf4j 54 | rdf4j-rio-turtle 55 | 3.7.4 56 | 57 | 58 | org.eclipse.rdf4j 59 | rdf4j-rio-ntriples 60 | 3.7.4 61 | 62 | 63 | org.eclipse.rdf4j 64 | rdf4j-rio-rdfxml 65 | 3.7.4 66 | 67 | 68 | io.javalin 69 | javalin 70 | 4.3.0 71 | 72 | 73 | org.slf4j 74 | slf4j-jdk14 75 | 1.7.32 76 | 77 | 78 | commons-logging 79 | commons-logging 80 | 1.2 81 | 82 | 83 | org.junit.jupiter 84 | junit-jupiter 85 | 5.8.2 86 | test 87 | 88 | 89 | org.rocksdb 90 | rocksdbjni 91 | 6.28.2 92 | 93 | 94 | org.eclipse.collections 95 | eclipse-collections 96 | 11.0.0 97 | 98 | 99 | com.fasterxml.jackson.core 100 | jackson-annotations 101 | 2.11.4 102 | 103 | 104 | com.fasterxml.jackson.core 105 | jackson-databind 106 | 2.11.4 107 | 108 | 109 | 110 | 111 | 112 | 113 | org.apache.maven.plugins 114 | maven-compiler-plugin 115 | 3.8.0 116 | 117 | 11 118 | 11 119 | 120 | 121 | 122 | org.apache.maven.plugins 123 | maven-shade-plugin 124 | 3.2.0 125 | 126 | 127 | assemble-all 128 | package 129 | 130 | shade 131 | 132 | 133 | 134 | 136 | org.wikidata.history.Main 137 | 138 | 140 | 141 | 142 | 143 | *:* 144 | 145 | META-INF/*.SF 146 | META-INF/*.RSA 147 | 148 | 149 | 150 | 151 | 152 | 153 | 154 | 155 | 156 | 157 | -------------------------------------------------------------------------------- /src/test/resources/rdf/Q7.nt: -------------------------------------------------------------------------------- 1 | . 2 | "string" . 3 | "string2" . 4 | . 5 | . 6 | . 7 | . 8 | . 9 | . 10 | . 11 | "Point(67.25 12.125)"^^ . 12 | _:genid1 . 13 | "\u0431\u0440\u0435\u0434"@ru . 14 | "\u043F\u0440\u0435\u0432\u0435\u0434"@ru . 15 | "+19.768000000000000682121026329696178436279296875"^^ . 16 | "simplestring" . 17 | "-0200-00-00T00:00:00Z"^^ . 18 | . 19 | . 20 | . 21 | . 22 | . 23 | . 24 | . 25 | . 26 | "string2" . 27 | . 28 | . 29 | . 30 | . 31 | "string" . 32 | . 33 | "12.125"^^ . 34 | "67.25"^^ . 35 | "0.0625"^^ . 36 | . 37 | "+19.768000000000000682121026329696178436279296875"^^ . 38 | "+19.766999999999999459987520822323858737945556640625"^^ . 39 | . 40 | "+19.76899999999999835154085303656756877899169921875"^^ . 41 | . 42 | . 43 | "9"^^ . 44 | "0"^^ . 45 | "-0200-00-00T00:00:00Z"^^ . 46 | . 
47 | -------------------------------------------------------------------------------- /src/test/resources/entities/Q4.json: -------------------------------------------------------------------------------- 1 | { 2 | "id": "Q4", 3 | "type": "item", 4 | "claims": { 5 | "P2": [ 6 | { 7 | "id": "TEST-Statement-2-423614cd831ed4e8da1138c9229cb65cf96f9366", 8 | "mainsnak": { 9 | "snaktype": "value", 10 | "property": "P2", 11 | "datatype": "wikibase-entityid", 12 | "datavalue": { 13 | "value": { 14 | "entity-type": "item", 15 | "numeric-id": 42 16 | }, 17 | "type": "wikibase-entityid" 18 | } 19 | }, 20 | "type": "statement", 21 | "rank": "preferred" 22 | }, 23 | { 24 | "id": "TEST-Statement-2-475ae31b07cff4f0e33531030b1ba58f004fcd4b", 25 | "mainsnak": { 26 | "snaktype": "value", 27 | "property": "P2", 28 | "datatype": "wikibase-entityid", 29 | "datavalue": { 30 | "value": { 31 | "entity-type": "item", 32 | "numeric-id": 666 33 | }, 34 | "type": "wikibase-entityid" 35 | } 36 | }, 37 | "type": "statement", 38 | "rank": "normal" 39 | } 40 | ], 41 | "P18": [ 42 | { 43 | "id": "TEST-Statement-3-b181ddac61642fe80bbf8e4a8eaa1da425cb0ac9", 44 | "mainsnak": { 45 | "snaktype": "value", 46 | "property": "P18", 47 | "datatype": "commonsMedia", 48 | "datavalue": { 49 | "value": "Universe.svg", 50 | "type": "string" 51 | } 52 | }, 53 | "type": "statement", 54 | "rank": "normal" 55 | }, 56 | { 57 | "id": "TEST-Statement-3-12914044e0dbab210aa9d81168bd50471bbde12d", 58 | "mainsnak": { 59 | "snaktype": "novalue", 60 | "property": "P18" 61 | }, 62 | "type": "statement", 63 | "rank": "normal" 64 | } 65 | ], 66 | "P4": [ 67 | { 68 | "id": "TEST-Statement-4-8749fa158a249e1befa6ed077f648c56197a2b2d", 69 | "mainsnak": { 70 | "snaktype": "value", 71 | "property": "P4", 72 | "datatype": "globecoordinate", 73 | "datavalue": { 74 | "value": { 75 | "latitude": 12.125, 76 | "longitude": 67.25, 77 | "precision": 0.0625, 78 | "globe": "http:\/\/www.wikidata.org\/entity\/Q2" 79 | }, 80 | "type": "globecoordinate" 81 | } 82 | }, 83 | "type": "statement", 84 | "rank": "normal" 85 | } 86 | ], 87 | "P5": [ 88 | { 89 | "id": "TEST-Statement-5-93da31338cb80c2eb0f92a5459a186bd59579180", 90 | "mainsnak": { 91 | "snaktype": "value", 92 | "property": "P5", 93 | "datatype": "monolingualtext", 94 | "datavalue": { 95 | "value": { 96 | "text": "\u043f\u0440\u0435\u0432\u0435\u0434", 97 | "language": "ru" 98 | }, 99 | "type": "monolingualtext" 100 | } 101 | }, 102 | "type": "statement", 103 | "rank": "normal" 104 | }, 105 | { 106 | "id": "TEST-Statement-5-8c5d9fe1bfe1fe52e5ab706ae3e5d62f4aaa8d5b", 107 | "mainsnak": { 108 | "snaktype": "somevalue", 109 | "property": "P5" 110 | }, 111 | "type": "statement", 112 | "rank": "normal" 113 | }, 114 | { 115 | "id": "TEST-Statement-5-b27fe5a95fa506ca99acebd9e97c9c5a81e14f99", 116 | "mainsnak": { 117 | "snaktype": "value", 118 | "property": "P5", 119 | "datatype": "monolingualtext", 120 | "datavalue": { 121 | "value": { 122 | "text": "\u0431\u0440\u0435\u0434", 123 | "language": "ru" 124 | }, 125 | "type": "monolingualtext" 126 | } 127 | }, 128 | "type": "statement", 129 | "rank": "deprecated" 130 | } 131 | ], 132 | "P6": [ 133 | { 134 | "id": "TEST-Statement-6-9ae284048af6d9ab0f2815ef104216cb8b22e8bc", 135 | "mainsnak": { 136 | "snaktype": "value", 137 | "property": "P6", 138 | "datatype": "quantity", 139 | "datavalue": { 140 | "value": { 141 | "amount": "+19.768000000000000682121026329696178436279296875", 142 | "unit": "1", 143 | "upperBound": "+19.76899999999999835154085303656756877899169921875", 144 | 
"lowerBound": "+19.766999999999999459987520822323858737945556640625" 145 | }, 146 | "type": "quantity" 147 | } 148 | }, 149 | "type": "statement", 150 | "rank": "normal" 151 | } 152 | ], 153 | "P7": [ 154 | { 155 | "id": "TEST-Statement-7-6063d202e584b79a2e9f89ab92b51e7f22ef9886", 156 | "mainsnak": { 157 | "snaktype": "value", 158 | "property": "P7", 159 | "datatype": "string", 160 | "datavalue": { 161 | "value": "simplestring", 162 | "type": "string" 163 | } 164 | }, 165 | "type": "statement", 166 | "rank": "normal" 167 | } 168 | ], 169 | "P8": [ 170 | { 171 | "id": "TEST-Statement-8-5dd0f6624a7545401bc306a068ac1bbe8148bfac", 172 | "mainsnak": { 173 | "snaktype": "value", 174 | "property": "P8", 175 | "datatype": "time", 176 | "datavalue": { 177 | "value": { 178 | "time": "-0200-00-00T00:00:00Z", 179 | "timezone": 0, 180 | "before": 0, 181 | "after": 0, 182 | "precision": 9, 183 | "calendarmodel": "http:\/\/www.wikidata.org\/entity\/Q1985727" 184 | }, 185 | "type": "time" 186 | } 187 | }, 188 | "type": "statement", 189 | "rank": "normal" 190 | } 191 | ], 192 | "P856": [ 193 | { 194 | "id": "TEST-Statement-9-2669d541dfd2d6cc0105927bff02bbe0eec0e921", 195 | "mainsnak": { 196 | "snaktype": "value", 197 | "property": "P856", 198 | "datatype": "url", 199 | "datavalue": { 200 | "value": "http:\/\/url.acme.test\\badurl?chars=\\привет< >\"", 201 | "type": "string" 202 | } 203 | }, 204 | "type": "statement", 205 | "rank": "normal" 206 | } 207 | ], 208 | "P3896": [ 209 | { 210 | "id": "TEST-Statement-10-geo-shape", 211 | "mainsnak": { 212 | "snaktype": "value", 213 | "property": "P3896", 214 | "datatype": "geo-shape", 215 | "datavalue": { 216 | "value": "Data:Berlin.map", 217 | "type": "string" 218 | } 219 | }, 220 | "type": "statement", 221 | "rank": "normal" 222 | } 223 | ], 224 | "P11": [ 225 | { 226 | "id": "TEST-Statement-11-external-id", 227 | "mainsnak": { 228 | "snaktype": "value", 229 | "property": "P11", 230 | "datatype": "external-id", 231 | "datavalue": { 232 | "value": "test-external-identifier", 233 | "type": "string" 234 | } 235 | }, 236 | "type": "statement", 237 | "rank": "normal" 238 | } 239 | ] 240 | } 241 | } 242 | -------------------------------------------------------------------------------- /src/main/java/org/wikidata/history/sparql/RocksTripleLoader.java: -------------------------------------------------------------------------------- 1 | package org.wikidata.history.sparql; 2 | 3 | import org.eclipse.rdf4j.model.IRI; 4 | import org.eclipse.rdf4j.model.Resource; 5 | import org.eclipse.rdf4j.model.Value; 6 | import org.eclipse.rdf4j.model.impl.SimpleValueFactory; 7 | import org.eclipse.rdf4j.model.vocabulary.OWL; 8 | import org.eclipse.rdf4j.model.vocabulary.RDFS; 9 | import org.eclipse.rdf4j.model.vocabulary.SKOS; 10 | import org.eclipse.rdf4j.rio.helpers.NTriplesUtil; 11 | import org.slf4j.Logger; 12 | import org.slf4j.LoggerFactory; 13 | 14 | import java.io.BufferedReader; 15 | import java.io.IOException; 16 | import java.io.InputStreamReader; 17 | import java.nio.file.Files; 18 | import java.nio.file.Path; 19 | import java.util.Arrays; 20 | import java.util.concurrent.atomic.AtomicLong; 21 | import java.util.zip.GZIPInputStream; 22 | 23 | public final class RocksTripleLoader implements AutoCloseable { 24 | private static final Logger LOGGER = LoggerFactory.getLogger(RocksTripleLoader.class); 25 | private static final IRI SCHEMA_DESCRIPTION = SimpleValueFactory.getInstance().createIRI("http://schema.org/description"); 26 | private static final IRI SCHEMA_ABOUT = 
SimpleValueFactory.getInstance().createIRI("http://schema.org/about"); 27 | 28 | private final RocksStore store; 29 | private final Path countFile; 30 | private final boolean wdtOnly; 31 | private final NumericValueFactory valueFactory; 32 | private final RocksStore.Index spoIndex; 33 | private final RocksStore.Index posIndex; 34 | private final RocksStore.Index ospIndex; 35 | private final RocksStore.Index insertedStatement; 36 | private final RocksStore.Index deletedStatement; 37 | 38 | public RocksTripleLoader(Path path, boolean wdtOnly) { 39 | store = new RocksStore(path, false); 40 | countFile = path.resolve("triple-progress.txt"); 41 | valueFactory = new NumericValueFactory(store.getReadWriteStringStore()); 42 | spoIndex = store.spoStatementIndex(); 43 | posIndex = store.posStatementIndex(); 44 | ospIndex = store.ospStatementIndex(); 45 | insertedStatement = store.insertedStatementIndex(); 46 | deletedStatement = store.deletedStatementIndex(); 47 | this.wdtOnly = wdtOnly; 48 | } 49 | 50 | public void load(Path file) throws IOException { 51 | LOGGER.info("Loading triples"); 52 | if (wdtOnly) { 53 | LOGGER.info("Loading only direct properties"); 54 | } 55 | loadTriples(file); 56 | 57 | LOGGER.info("Compacting store"); 58 | store.compact(); 59 | } 60 | 61 | private BufferedReader gzipReader(Path path) throws IOException { 62 | return new BufferedReader(new InputStreamReader(new GZIPInputStream(Files.newInputStream(path)))); 63 | } 64 | 65 | private void loadTriples(Path path) throws IOException { 66 | long start = 0; 67 | try { 68 | start = Long.parseLong(new String(Files.readAllBytes(countFile)).trim()); 69 | } catch (IOException e) { 70 | LOGGER.error(e.getMessage(), e); 71 | } 72 | 73 | AtomicLong done = new AtomicLong(start); 74 | try (BufferedReader reader = gzipReader(path)) { 75 | // We skip the lines we have to skip 76 | for (long i = 0; i < start; i++) { 77 | reader.readLine(); 78 | } 79 | 80 | reader.lines().parallel().peek(line -> { 81 | long count = done.getAndIncrement(); 82 | if (count % 1_000_000 == 0) { 83 | try { 84 | Files.write(countFile, Long.toString(count).getBytes()); 85 | } catch (IOException e) { 86 | LOGGER.error(e.getMessage(), e); 87 | } 88 | LOGGER.info(count + " triples imported"); 89 | } 90 | }).forEach(line -> { 91 | String[] parts = line.split("\t"); 92 | try { 93 | long[] revisionIds = Arrays.stream(parts[3].split(" ")).mapToLong(Long::parseLong).toArray(); 94 | if (!LongRangeUtils.isSorted(revisionIds)) { 95 | LOGGER.error("the revision ranges are not sorted: " + Arrays.toString(revisionIds)); 96 | } 97 | Resource subject = NTriplesUtil.parseResource(parts[0], valueFactory); 98 | IRI predicate = NTriplesUtil.parseURI(parts[1], valueFactory); 99 | Value object = NTriplesUtil.parseValue(parts[2], valueFactory); 100 | if (wdtOnly && !(OWL.SAMEAS.equals(predicate) || RDFS.LABEL.equals(predicate) || SCHEMA_DESCRIPTION.equals(predicate) || SKOS.ALT_LABEL.equals(predicate) || SCHEMA_ABOUT.equals(predicate) || Vocabulary.WDT_NAMESPACE.equals(predicate.getNamespace()))) { 101 | return; 102 | } 103 | addTriple( 104 | valueFactory.encodeValue(subject), 105 | valueFactory.encodeValue(predicate), 106 | valueFactory.encodeValue(object), 107 | revisionIds 108 | ); 109 | } catch (NotSupportedValueException e) { 110 | // We ignore it for now 111 | } catch (Exception e) { 112 | LOGGER.error(e.getMessage(), e); 113 | } 114 | }); 115 | } 116 | } 117 | 118 | private void addTriple(long subject, long predicate, long object, long[] range) { 119 | if (range == null) { 120 | 
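// Revision ranges are flat arrays of [startRevision, endRevision) pairs; an end of Long.MAX_VALUE means the triple is still present in the latest revision.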
throw new IllegalArgumentException("Triple without revision range"); 121 | } 122 | long[] spoTriple = new long[]{subject, predicate, object}; 123 | long[] posTriple = new long[]{predicate, object, subject}; 124 | long[] ospTriple = new long[]{object, subject, predicate}; 125 | 126 | long[] existingRange = spoIndex.get(spoTriple); 127 | if (existingRange != null) { 128 | range = LongRangeUtils.union(existingRange, range); 129 | } 130 | spoIndex.put(spoTriple, range); 131 | posIndex.put(posTriple, range); 132 | ospIndex.put(ospTriple, range); 133 | 134 | // Range additions 135 | for (int i = 0; i < range.length; i += 2) { 136 | addToStatementListIndex(insertedStatement, range[i], spoTriple); 137 | if (range[i + 1] != Long.MAX_VALUE) { 138 | addToStatementListIndex(deletedStatement, range[i + 1], spoTriple); 139 | } 140 | } 141 | 142 | // Range deletions 143 | if (existingRange != null) { 144 | for (int i = 0; i < existingRange.length; i += 2) { 145 | if (!LongRangeUtils.isRangeStart(existingRange[i], range)) { 146 | removeFromStatementListIndex(insertedStatement, existingRange[i], spoTriple); 147 | } 148 | if (!LongRangeUtils.isRangeEnd(existingRange[i + 1], range) && existingRange[i + 1] != Long.MAX_VALUE) { 149 | removeFromStatementListIndex(deletedStatement, existingRange[i + 1], spoTriple); 150 | } 151 | } 152 | } 153 | } 154 | 155 | private static void addToStatementListIndex(RocksStore.Index index, long key, long[] triple) { 156 | long[] existingTriples = index.get(key); 157 | long[] newTriples = (existingTriples == null) ? triple : TripleArrayUtils.addToSortedArray(existingTriples, triple); 158 | if (newTriples != existingTriples) { 159 | index.put(key, newTriples); 160 | } 161 | } 162 | 163 | private static void removeFromStatementListIndex(RocksStore.Index index, long key, long[] triple) { 164 | long[] existingTriples = index.get(key); 165 | if (existingTriples == null) { 166 | return; 167 | } 168 | long[] newTriples = TripleArrayUtils.removeFromSortedArray(existingTriples, triple); 169 | if (newTriples != existingTriples) { 170 | index.put(key, newTriples); 171 | } 172 | } 173 | 174 | @Override 175 | public void close() { 176 | valueFactory.close(); 177 | store.close(); 178 | } 179 | } 180 | -------------------------------------------------------------------------------- /src/test/resources/entities/Q8.json: -------------------------------------------------------------------------------- 1 | { 2 | "id": "Q8", 3 | "type": "item", 4 | "claims": { 5 | "P8": [ 6 | { 7 | "id": "TEST-Dates-8-1", 8 | "mainsnak": { 9 | "snaktype": "value", 10 | "property": "P8", 11 | "datatype": "time", 12 | "datavalue": { 13 | "value": { 14 | "time": "-0200-00-00T00:00:00Z", 15 | "timezone": 0, 16 | "before": 0, 17 | "after": 0, 18 | "precision": 9, 19 | "calendarmodel": "http:\/\/www.wikidata.org\/entity\/Q1985727" 20 | }, 21 | "type": "time" 22 | } 23 | }, 24 | "type": "statement", 25 | "rank": "normal" 26 | }, 27 | { 28 | "id": "TEST-Dates-8-2", 29 | "mainsnak": { 30 | "snaktype": "value", 31 | "property": "P8", 32 | "datatype": "time", 33 | "datavalue": { 34 | "value": { 35 | "time": "+0200-10-00T00:00:00Z", 36 | "timezone": 60, 37 | "before": 0, 38 | "after": 0, 39 | "precision": 11, 40 | "calendarmodel": "http:\/\/www.wikidata.org\/entity\/Q1985727" 41 | }, 42 | "type": "time" 43 | } 44 | }, 45 | "type": "statement", 46 | "rank": "normal" 47 | }, 48 | { 49 | "id": "TEST-Dates-8-3", 50 | "mainsnak": { 51 | "snaktype": "value", 52 | "property": "P8", 53 | "datatype": "time", 54 | "datavalue": { 55 | 
"value": { 56 | "time": "-200000200-04-31T00:00:00Z", 57 | "timezone": -60, 58 | "before": 0, 59 | "after": 0, 60 | "precision": 11, 61 | "calendarmodel": "http:\/\/www.wikidata.org\/entity\/Q1985727" 62 | }, 63 | "type": "time" 64 | } 65 | }, 66 | "type": "statement", 67 | "rank": "normal" 68 | }, 69 | { 70 | "id": "TEST-Dates-8-4", 71 | "mainsnak": { 72 | "snaktype": "value", 73 | "property": "P8", 74 | "datatype": "time", 75 | "datavalue": { 76 | "value": { 77 | "time": "+0200-02-30T00:00:00Z", 78 | "timezone": 0, 79 | "before": 0, 80 | "after": 0, 81 | "precision": 11, 82 | "calendarmodel": "http:\/\/www.wikidata.org\/entity\/Q1985727" 83 | }, 84 | "type": "time" 85 | } 86 | }, 87 | "type": "statement", 88 | "rank": "normal" 89 | }, 90 | { 91 | "id": "TEST-Dates-8-5", 92 | "mainsnak": { 93 | "snaktype": "value", 94 | "property": "P8", 95 | "datatype": "time", 96 | "datavalue": { 97 | "value": { 98 | "time": "+2014-02-29T00:00:00Z", 99 | "timezone": 0, 100 | "before": 0, 101 | "after": 0, 102 | "precision": 11, 103 | "calendarmodel": "http:\/\/www.wikidata.org\/entity\/Q1985727" 104 | }, 105 | "type": "time" 106 | } 107 | }, 108 | "type": "statement", 109 | "rank": "normal" 110 | }, 111 | { 112 | "id": "TEST-Dates-8-6", 113 | "mainsnak": { 114 | "snaktype": "value", 115 | "property": "P8", 116 | "datatype": "time", 117 | "datavalue": { 118 | "value": { 119 | "time": "+2014-04-31T00:00:00Z", 120 | "timezone": 0, 121 | "before": 0, 122 | "after": 0, 123 | "precision": 9, 124 | "calendarmodel": "http:\/\/www.wikidata.org\/entity\/Q1985727" 125 | }, 126 | "type": "time" 127 | } 128 | }, 129 | "type": "statement", 130 | "rank": "normal" 131 | }, 132 | { 133 | "id": "TEST-Dates-8-7", 134 | "mainsnak": { 135 | "snaktype": "value", 136 | "property": "P8", 137 | "datatype": "time", 138 | "datavalue": { 139 | "value": { 140 | "time": "+2012-02-29T00:00:00Z", 141 | "timezone": 0, 142 | "before": 0, 143 | "after": 0, 144 | "precision": 9, 145 | "calendarmodel": "http:\/\/www.wikidata.org\/entity\/Q1985727" 146 | }, 147 | "type": "time" 148 | } 149 | }, 150 | "type": "statement", 151 | "rank": "normal" 152 | }, 153 | { 154 | "id": "TEST-Dates-8-8", 155 | "mainsnak": { 156 | "snaktype": "value", 157 | "property": "P8", 158 | "datatype": "time", 159 | "datavalue": { 160 | "value": { 161 | "time": "+2012-02-29T00:00:00Z", 162 | "timezone": 0, 163 | "before": 0, 164 | "after": 0, 165 | "precision": 11, 166 | "calendarmodel": "http:\/\/www.wikidata.org\/entity\/Q1985786" 167 | }, 168 | "type": "time" 169 | } 170 | }, 171 | "type": "statement", 172 | "rank": "normal" 173 | }, 174 | { 175 | "id": "TEST-Dates-8-9", 176 | "mainsnak": { 177 | "snaktype": "value", 178 | "property": "P8", 179 | "datatype": "time", 180 | "datavalue": { 181 | "value": { 182 | "time": "+2012-02-31T00:00:00Z", 183 | "timezone": 0, 184 | "before": 0, 185 | "after": 0, 186 | "precision": 11, 187 | "calendarmodel": "http:\/\/acme.test\/calendar" 188 | }, 189 | "type": "time" 190 | } 191 | }, 192 | "type": "statement", 193 | "rank": "normal" 194 | }, 195 | { 196 | "id": "TEST-Dates-8-10", 197 | "mainsnak": { 198 | "snaktype": "value", 199 | "property": "P8", 200 | "datatype": "time", 201 | "datavalue": { 202 | "value": { 203 | "time": "+2000000200-10-00T00:00:00Z", 204 | "timezone": 0, 205 | "before": 0, 206 | "after": 0, 207 | "precision": 5, 208 | "calendarmodel": "http:\/\/www.wikidata.org\/entity\/Q1985727" 209 | }, 210 | "type": "time" 211 | } 212 | }, 213 | "type": "statement", 214 | "rank": "normal" 215 | }, 216 | { 217 | "id": 
"TEST-Dates-8-11", 218 | "mainsnak": { 219 | "snaktype": "value", 220 | "property": "P8", 221 | "datatype": "time", 222 | "datavalue": { 223 | "value": { 224 | "time": "-2010-02-29T00:00:00Z", 225 | "timezone": 0, 226 | "before": 0, 227 | "after": 0, 228 | "precision": 9, 229 | "calendarmodel": "http:\/\/www.wikidata.org\/entity\/Q1985786" 230 | }, 231 | "type": "time" 232 | } 233 | }, 234 | "type": "statement", 235 | "rank": "normal" 236 | }, 237 | { 238 | "id": "TEST-Dates-8-12", 239 | "mainsnak": { 240 | "snaktype": "value", 241 | "property": "P8", 242 | "datatype": "time", 243 | "datavalue": { 244 | "value": { 245 | "time": "-2000002010-02-29T00:00:00Z", 246 | "timezone": 0, 247 | "before": 0, 248 | "after": 0, 249 | "precision": 8, 250 | "calendarmodel": "http:\/\/www.wikidata.org\/entity\/Q1985786" 251 | }, 252 | "type": "time" 253 | } 254 | }, 255 | "type": "statement", 256 | "rank": "normal" 257 | }, 258 | { 259 | "id": "TEST-Dates-8-13", 260 | "mainsnak": { 261 | "snaktype": "value", 262 | "property": "P8", 263 | "datatype": "time", 264 | "datavalue": { 265 | "value": { 266 | "time": "+0000-02-29T00:00:00Z", 267 | "timezone": 0, 268 | "before": 0, 269 | "after": 0, 270 | "precision": 9, 271 | "calendarmodel": "http:\/\/www.wikidata.org\/entity\/Q1985786" 272 | }, 273 | "type": "time" 274 | } 275 | }, 276 | "type": "statement", 277 | "rank": "normal" 278 | } 279 | ] 280 | } 281 | } 282 | -------------------------------------------------------------------------------- /src/test/java/org/wikidata/history/sparql/NumericValueFactoryTest.java: -------------------------------------------------------------------------------- 1 | package org.wikidata.history.sparql; 2 | 3 | import org.eclipse.rdf4j.model.IRI; 4 | import org.eclipse.rdf4j.model.Literal; 5 | import org.eclipse.rdf4j.model.vocabulary.GEO; 6 | import org.eclipse.rdf4j.model.vocabulary.RDF; 7 | import org.eclipse.rdf4j.model.vocabulary.XSD; 8 | import org.junit.jupiter.api.Assertions; 9 | import org.junit.jupiter.api.Test; 10 | 11 | import java.util.HashMap; 12 | import java.util.Map; 13 | import java.util.Optional; 14 | 15 | class NumericValueFactoryTest { 16 | 17 | @Test 18 | void testIRIEncoding() throws NotSupportedValueException { 19 | NumericValueFactory valueFactory = new NumericValueFactory(new TestStringStore()); 20 | testIRIConversion(Vocabulary.WD_NAMESPACE + "Q42", valueFactory); 21 | testIRIConversion(Vocabulary.WDT_NAMESPACE + "P42", valueFactory); 22 | testIRIConversion(Vocabulary.P_NAMESPACE + "P42", valueFactory); 23 | testIRIConversion(Vocabulary.PS_NAMESPACE + "P42", valueFactory); 24 | testIRIConversion(Vocabulary.PQV_NAMESPACE + "P42", valueFactory); 25 | testIRIConversion(Vocabulary.REVISION_NAMESPACE + "123", valueFactory); 26 | Assertions.assertEquals(valueFactory.createIRI(Vocabulary.REVISION_NAMESPACE + "123"), valueFactory.createRevisionIRI(123)); 27 | testIRIConversion(Vocabulary.REVISION_GLOBAL_STATE_NAMESPACE + "123", valueFactory); 28 | Assertions.assertEquals(valueFactory.createIRI(Vocabulary.REVISION_GLOBAL_STATE_NAMESPACE + "123"), valueFactory.createRevisionIRI(123,Vocabulary.SnapshotType.GLOBAL_STATE)); 29 | testIRIConversion(Vocabulary.REVISION_ADDITIONS_NAMESPACE + "123", valueFactory); 30 | Assertions.assertEquals(valueFactory.createIRI(Vocabulary.REVISION_ADDITIONS_NAMESPACE + "123"), valueFactory.createRevisionIRI(123,Vocabulary.SnapshotType.ADDITIONS)); 31 | testIRIConversion(Vocabulary.REVISION_DELETIONS_NAMESPACE + "123", valueFactory); 32 | 
Assertions.assertEquals(valueFactory.createIRI(Vocabulary.REVISION_DELETIONS_NAMESPACE + "123"), valueFactory.createRevisionIRI(123,Vocabulary.SnapshotType.DELETIONS)); 33 | testIRIConversion("http://example.com", valueFactory); 34 | } 35 | 36 | @Test 37 | void testStringEncoding() throws NotSupportedValueException { 38 | NumericValueFactory valueFactory = new NumericValueFactory(new TestStringStore()); 39 | testStringConversion("foofoofoofoo", valueFactory); 40 | testStringConversion("bar", valueFactory); 41 | testLanguageStringConversion("foofoofoofoo", "foofoofoofoo", valueFactory); 42 | testLanguageStringConversion("bar", "foofoofoofoo", valueFactory); 43 | testLanguageStringConversion("foofoofoofoo", "en", valueFactory); 44 | testLanguageStringConversion("bar", "en", valueFactory); 45 | } 46 | 47 | @Test 48 | void testNumberEncoding() throws NotSupportedValueException { 49 | NumericValueFactory valueFactory = new NumericValueFactory(new TestStringStore()); 50 | testIntegerConversion(0, valueFactory); 51 | testIntegerConversion(1, valueFactory); 52 | testIntegerConversion(-1, valueFactory); 53 | testIntegerConversion(Integer.MIN_VALUE, valueFactory); 54 | testIntegerConversion(Integer.MAX_VALUE, valueFactory); 55 | testIntegerConversion(Long.MIN_VALUE, valueFactory); 56 | testIntegerConversion(Long.MAX_VALUE, valueFactory); 57 | } 58 | 59 | @Test 60 | void testDateTimeEncoding() throws NotSupportedValueException { 61 | NumericValueFactory valueFactory = new NumericValueFactory(new TestStringStore()); 62 | testDateTimeConversion("2020-01-01T00:00:00Z", valueFactory); 63 | testDateTimeConversion("2020-12-31T23:59:60Z", valueFactory); 64 | testDateTimeConversion("-2020-01-01T00:00:00Z", valueFactory); 65 | testDateTimeConversion("-10000000000-00-00T00:00:00Z", valueFactory); 66 | } 67 | 68 | @Test 69 | void testTypedLiteralEncoding() throws NotSupportedValueException { 70 | NumericValueFactory valueFactory = new NumericValueFactory(new TestStringStore()); 71 | testTypedLiteralConversion("foofoofoofoo", GEO.WKT_LITERAL, valueFactory); 72 | testTypedLiteralConversion("foofoofoofoo", XSD.DURATION, valueFactory); 73 | 74 | } 75 | 76 | private void testIRIConversion(String iri, NumericValueFactory valueFactory) throws NotSupportedValueException { 77 | Assertions.assertEquals(iri, 78 | valueFactory.createValue( 79 | ((NumericValueFactory.NumericValue) valueFactory.createIRI(iri)).encode() 80 | ).stringValue() 81 | ); 82 | } 83 | 84 | private void testStringConversion(String str, NumericValueFactory valueFactory) throws NotSupportedValueException { 85 | Literal value = (Literal) valueFactory.createValue( 86 | ((NumericValueFactory.NumericValue) valueFactory.createLiteral(str)).encode() 87 | ); 88 | Assertions.assertEquals(XSD.STRING, value.getDatatype()); 89 | Assertions.assertEquals(str, value.stringValue()); 90 | } 91 | 92 | private void testLanguageStringConversion(String str, String languageCode, NumericValueFactory valueFactory) throws NotSupportedValueException { 93 | Literal value = (Literal) valueFactory.createValue( 94 | ((NumericValueFactory.NumericValue) valueFactory.createLiteral(str, languageCode)).encode() 95 | ); 96 | Assertions.assertEquals(RDF.LANGSTRING, value.getDatatype()); 97 | Assertions.assertEquals(str, value.stringValue()); 98 | Assertions.assertEquals(Optional.of(languageCode), value.getLanguage()); 99 | } 100 | 101 | private void testIntegerConversion(long number, NumericValueFactory valueFactory) throws NotSupportedValueException { 102 | Literal valueInteger = 
(Literal) valueFactory.createValue( 103 | ((NumericValueFactory.NumericValue) valueFactory.createLiteral(number)).encode() 104 | ); 105 | Assertions.assertEquals(XSD.INTEGER, valueInteger.getDatatype()); 106 | Assertions.assertEquals(number, valueInteger.longValue()); 107 | 108 | Literal valueDecimal = (Literal) valueFactory.createValue( 109 | ((NumericValueFactory.NumericValue) valueFactory.createLiteral(Long.toString(number), XSD.DECIMAL)).encode() 110 | ); 111 | Assertions.assertEquals(XSD.DECIMAL, valueDecimal.getDatatype()); 112 | Assertions.assertEquals(number, valueDecimal.longValue()); 113 | } 114 | 115 | private void testDateTimeConversion(String time, NumericValueFactory valueFactory) throws NotSupportedValueException { 116 | Literal value = (Literal) valueFactory.createValue( 117 | ((NumericValueFactory.NumericValue) valueFactory.createLiteral(time, XSD.DATETIME)).encode() 118 | ); 119 | Assertions.assertEquals(XSD.DATETIME, value.getDatatype()); 120 | Assertions.assertEquals(time, value.stringValue()); 121 | } 122 | 123 | private void testTypedLiteralConversion(String str, IRI datatype, NumericValueFactory valueFactory) throws NotSupportedValueException { 124 | Literal value = (Literal) valueFactory.createValue( 125 | ((NumericValueFactory.NumericValue) valueFactory.createLiteral(str, datatype)).encode() 126 | ); 127 | Assertions.assertEquals(datatype, value.getDatatype()); 128 | Assertions.assertEquals(str, value.stringValue()); 129 | } 130 | 131 | static class TestStringStore implements NumericValueFactory.StringStore { 132 | 133 | private static final Map ENCODING = new HashMap<>(); 134 | private static final Map DECODING = new HashMap<>(); 135 | 136 | static { 137 | ENCODING.put("foofoofoofoo", 0L); 138 | DECODING.put(0L, "foofoofoofoo"); 139 | ENCODING.put("bar", (long) Integer.MAX_VALUE); 140 | DECODING.put((long) Integer.MAX_VALUE, "bar"); 141 | ENCODING.put("en", (long) Short.MAX_VALUE); 142 | DECODING.put((long) Short.MAX_VALUE, "en"); 143 | ENCODING.put(Long.toString(Long.MAX_VALUE), 1L); 144 | DECODING.put(1L, Long.toString(Long.MAX_VALUE)); 145 | ENCODING.put(Long.toString(Long.MIN_VALUE), 2L); 146 | DECODING.put(2L, Long.toString(Long.MIN_VALUE)); 147 | ENCODING.put("http://example.com", 3L); 148 | DECODING.put(3L, "http://example.com"); 149 | } 150 | 151 | @Override 152 | public String getString(long id) { 153 | return DECODING.get(id); 154 | } 155 | 156 | @Override 157 | public Long putString(String str) { 158 | return ENCODING.getOrDefault(str, null); 159 | } 160 | 161 | @Override 162 | public String getLanguage(short id) { 163 | return DECODING.get((long) id); 164 | } 165 | 166 | @Override 167 | public Short putLanguage(String languageCode) { 168 | return ENCODING.containsKey(languageCode) ? 
ENCODING.get(languageCode).shortValue() : null; 169 | } 170 | 171 | @Override 172 | public void close() { 173 | } 174 | } 175 | } 176 | 177 | -------------------------------------------------------------------------------- /src/main/java/org/wikidata/history/preprocessor/RevisionFileConverter.java: -------------------------------------------------------------------------------- 1 | package org.wikidata.history.preprocessor; 2 | 3 | import com.fasterxml.jackson.databind.DeserializationFeature; 4 | import com.fasterxml.jackson.databind.ObjectReader; 5 | import org.apache.commons.compress.compressors.bzip2.BZip2CompressorInputStream; 6 | import org.eclipse.rdf4j.model.Statement; 7 | import org.eclipse.rdf4j.model.ValueFactory; 8 | import org.eclipse.rdf4j.model.impl.SimpleValueFactory; 9 | import org.eclipse.rdf4j.model.vocabulary.OWL; 10 | import org.slf4j.Logger; 11 | import org.slf4j.LoggerFactory; 12 | import org.wikidata.history.sparql.Vocabulary; 13 | import org.wikidata.wdtk.datamodel.helpers.Datamodel; 14 | import org.wikidata.wdtk.datamodel.helpers.DatamodelMapper; 15 | import org.wikidata.wdtk.datamodel.implementation.EntityDocumentImpl; 16 | import org.wikidata.wdtk.datamodel.interfaces.Sites; 17 | import org.wikidata.wdtk.dumpfiles.*; 18 | 19 | import java.io.BufferedInputStream; 20 | import java.io.IOException; 21 | import java.nio.file.Files; 22 | import java.nio.file.Path; 23 | import java.time.Instant; 24 | import java.util.*; 25 | import java.util.regex.Matcher; 26 | import java.util.regex.Pattern; 27 | 28 | public class RevisionFileConverter { 29 | 30 | private static final Logger LOGGER = LoggerFactory.getLogger(RevisionFileConverter.class); 31 | private static final Pattern ENTITY_PAGE_TITLE_PATTERN = Pattern.compile("^(Item:|Property:|)([PQ]\\d+)$"); 32 | private static final Pattern REDIRECTION_PATTERN = Pattern.compile("^\\{\"entity\":\"(.*)\",\"redirect\":\"(.*)\"}$"); 33 | private static final ValueFactory VALUE_FACTORY = SimpleValueFactory.getInstance(); 34 | private static final long[] EMPTY_LONG_ARRAY = new long[]{}; 35 | 36 | private final HistoryOutput historyOutput; 37 | private final Sites sites; 38 | private final WikidataPropertyInformation propertyInformation; 39 | 40 | 41 | public RevisionFileConverter(HistoryOutput historyOutput) throws IOException { 42 | this.historyOutput = historyOutput; 43 | sites = (new DumpProcessingController("wikidatawiki")).getSitesInformation(); 44 | propertyInformation = new WikidataPropertyInformation(); 45 | } 46 | 47 | public void process(Path file) throws IOException, InterruptedException { 48 | MwDumpFileProcessor processor = new MwRevisionDumpFileProcessor(new RevisionProcessor(historyOutput, sites, propertyInformation)); 49 | MwLocalDumpFile dumpFile = new MwLocalDumpFile(file.toString(), DumpContentType.FULL, null, null); 50 | for (int i = 0; i < 10; i++) { 51 | try { 52 | processor.processDumpFileContents(new BZip2CompressorInputStream(new BufferedInputStream(Files.newInputStream(file))), dumpFile); 53 | return; 54 | } catch (IOException e) { 55 | LOGGER.error(e.getMessage(), e); 56 | Thread.sleep(30000); 57 | if (i == 9) { 58 | throw e; 59 | } 60 | } 61 | } 62 | } 63 | 64 | private static final class RevisionProcessor implements MwRevisionProcessor { 65 | 66 | private final HistoryOutput historyOutput; 67 | private final Sites sites; 68 | private final WikidataPropertyInformation propertyInformation; 69 | private int currentPageId = -1; 70 | private final Map> revisions = new TreeMap<>(); 71 | private final Map 
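// For the page currently being processed: statement -> flat array of [startRevision, endRevision) pairs.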
triplesHistory = new HashMap<>(); 72 | private final ObjectReader entityReader = new DatamodelMapper(Datamodel.SITE_WIKIDATA) 73 | .enable(DeserializationFeature.ACCEPT_EMPTY_ARRAY_AS_NULL_OBJECT) 74 | .readerFor(EntityDocumentImpl.class); 75 | 76 | RevisionProcessor(HistoryOutput historyOutput, Sites sites, WikidataPropertyInformation propertyInformation) { 77 | this.historyOutput = historyOutput; 78 | this.sites = sites; 79 | this.propertyInformation = propertyInformation; 80 | } 81 | 82 | @Override 83 | public void startRevisionProcessing(String siteName, String baseUrl, Map namespaces) { 84 | } 85 | 86 | @Override 87 | public void processRevision(MwRevision mwRevision) { 88 | try { 89 | int pageId = mwRevision.getPageId(); 90 | long revisionId = mwRevision.getRevisionId(); 91 | String text = mwRevision.getText(); 92 | String entityId = getEntityIdFromPageTitle(mwRevision.getPrefixedTitle()); 93 | 94 | if (entityId == null) { 95 | return; //Not a Wikibase entity 96 | } 97 | if (pageId != currentPageId) { 98 | processRevisions(); 99 | currentPageId = pageId; 100 | } 101 | 102 | //Adds to revision history 103 | try { 104 | historyOutput.addRevision(revisionId, mwRevision.getParentRevisionId(), entityId, Instant.parse(mwRevision.getTimeStamp()), mwRevision.getContributor(), mwRevision.getComment()); 105 | } catch (Exception e) { 106 | LOGGER.error(e.getMessage(), e); 107 | } 108 | 109 | //Redirection 110 | Matcher redirectionMatcher = REDIRECTION_PATTERN.matcher(text); 111 | if (redirectionMatcher.matches()) { 112 | revisions.put(revisionId, Collections.singleton(VALUE_FACTORY.createStatement( 113 | VALUE_FACTORY.createIRI(Vocabulary.WD_NAMESPACE, redirectionMatcher.group(1)), 114 | OWL.SAMEAS, 115 | VALUE_FACTORY.createIRI(Vocabulary.WD_NAMESPACE, redirectionMatcher.group(2)) 116 | ))); 117 | } else { 118 | SetRdfOutput output = new SetRdfOutput(); 119 | RdfBuilder converter = new RdfBuilder(output, sites, propertyInformation); 120 | converter.addEntityDocument(entityReader.readValue(text)); 121 | revisions.put(revisionId, output.getStatements()); 122 | } 123 | } catch (Exception e) { 124 | LOGGER.warn("Error while parsing revision " + mwRevision.toString() + ": " + e.getMessage()); 125 | } 126 | } 127 | 128 | private void processRevisions() { 129 | long[] revisionIds = toSortedLongArrays(revisions.keySet()); 130 | 131 | for (int i = 0; i < revisionIds.length; i++) { 132 | long revisionId = revisionIds[i]; 133 | long nextRevisionId = (i + 1 == revisionIds.length) ? 
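// The newest revision of the page leaves the range open-ended (Long.MAX_VALUE = still present).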
Long.MAX_VALUE : revisionIds[i + 1]; 134 | if (nextRevisionId < revisionId) { 135 | LOGGER.error("The revision ids are not properly sorted."); 136 | } 137 | 138 | for (Statement statement : revisions.get(revisionId)) { 139 | long[] statementRevisions = triplesHistory.getOrDefault(statement, EMPTY_LONG_ARRAY); 140 | if (statementRevisions.length > 0 && statementRevisions[statementRevisions.length - 1] == revisionId) { 141 | statementRevisions[statementRevisions.length - 1] = nextRevisionId; 142 | } else { 143 | statementRevisions = Arrays.copyOf(statementRevisions, statementRevisions.length + 2); 144 | statementRevisions[statementRevisions.length - 2] = revisionId; 145 | statementRevisions[statementRevisions.length - 1] = nextRevisionId; 146 | triplesHistory.put(statement, statementRevisions); 147 | } 148 | } 149 | } 150 | 151 | for (Map.Entry entry : triplesHistory.entrySet()) { 152 | try { 153 | if (!isSorted(entry.getValue())) { 154 | LOGGER.error("the revision ranges are not sorted: " + Arrays.toString(revisionIds)); 155 | } 156 | historyOutput.addTriple(entry.getKey().getSubject(), entry.getKey().getPredicate(), entry.getKey().getObject(), entry.getValue()); 157 | } catch (IOException e) { 158 | LOGGER.error(e.getMessage(), e); 159 | } 160 | } 161 | 162 | triplesHistory.clear(); 163 | revisions.clear(); 164 | } 165 | 166 | private long[] toSortedLongArrays(Set s) { 167 | long[] values = new long[s.size()]; 168 | int i = 0; 169 | for (long v : s) { 170 | values[i] = v; 171 | i++; 172 | } 173 | Arrays.sort(values); 174 | return values; 175 | } 176 | 177 | private String getEntityIdFromPageTitle(String title) { 178 | Matcher matcher = ENTITY_PAGE_TITLE_PATTERN.matcher(title); 179 | return matcher.matches() ? matcher.group(2) : null; 180 | } 181 | 182 | @Override 183 | public void finishRevisionProcessing() { 184 | if (!revisions.isEmpty()) { 185 | processRevisions(); 186 | } 187 | } 188 | 189 | private static boolean isSorted(long[] array) { 190 | for (int i = 1; i < array.length; i++) { 191 | if (array[i] <= array[i - 1]) { 192 | return false; 193 | } 194 | } 195 | return true; 196 | } 197 | } 198 | 199 | private static class SetRdfOutput implements RdfBuilder.RdfOutput { 200 | private final Set statements = new HashSet<>(); 201 | 202 | @Override 203 | public void outputStatement(Statement statement) { 204 | statements.add(statement); 205 | } 206 | 207 | Set getStatements() { 208 | return statements; 209 | } 210 | } 211 | } 212 | -------------------------------------------------------------------------------- /src/main/java/org/wikidata/history/sparql/HistoryRepository.java: -------------------------------------------------------------------------------- 1 | package org.wikidata.history.sparql; 2 | 3 | import org.eclipse.rdf4j.common.iteration.CloseableIteration; 4 | import org.eclipse.rdf4j.common.iteration.EmptyIteration; 5 | import org.eclipse.rdf4j.common.iteration.ExceptionConvertingIteration; 6 | import org.eclipse.rdf4j.model.*; 7 | import org.eclipse.rdf4j.query.*; 8 | import org.eclipse.rdf4j.query.algebra.evaluation.QueryPreparer; 9 | import org.eclipse.rdf4j.query.parser.*; 10 | import org.eclipse.rdf4j.query.parser.sparql.SPARQLParser; 11 | import org.eclipse.rdf4j.repository.RepositoryConnection; 12 | import org.eclipse.rdf4j.repository.RepositoryException; 13 | import org.eclipse.rdf4j.repository.RepositoryResult; 14 | import org.eclipse.rdf4j.repository.base.AbstractRepository; 15 | import org.eclipse.rdf4j.repository.base.AbstractRepositoryConnection; 16 | import 
org.eclipse.rdf4j.rio.RDFHandler; 17 | import org.eclipse.rdf4j.rio.RDFHandlerException; 18 | 19 | import java.io.File; 20 | import java.nio.file.Path; 21 | import java.nio.file.Paths; 22 | 23 | import static org.eclipse.rdf4j.query.QueryLanguage.SPARQL; 24 | 25 | public class HistoryRepository extends AbstractRepository implements AutoCloseable { 26 | 27 | private Path indexPath; 28 | private RocksTripleSource tripleSource; 29 | private QueryParser queryParser; 30 | private QueryPreparer queryPreparer; 31 | 32 | public HistoryRepository(Path indexPath) { 33 | this.indexPath = indexPath; 34 | init(); 35 | } 36 | 37 | @Override 38 | protected void initializeInternal() throws RepositoryException { 39 | tripleSource = new RocksTripleSource(indexPath); 40 | queryParser = new SPARQLParser(); 41 | queryPreparer = new SimpleQueryPreparer(tripleSource); 42 | } 43 | 44 | @Override 45 | protected void shutDownInternal() throws RepositoryException { 46 | tripleSource.close(); 47 | } 48 | 49 | @Override 50 | public void setDataDir(File file) { 51 | indexPath = Paths.get(file.toURI()); 52 | } 53 | 54 | @Override 55 | public File getDataDir() { 56 | return indexPath.toFile(); 57 | } 58 | 59 | @Override 60 | public boolean isWritable() throws RepositoryException { 61 | return false; 62 | } 63 | 64 | @Override 65 | public RepositoryConnection getConnection() throws RepositoryException { 66 | return new HistoryConnection(this); 67 | } 68 | 69 | @Override 70 | public ValueFactory getValueFactory() { 71 | return tripleSource.getValueFactory(); 72 | } 73 | 74 | @Override 75 | public void close() { 76 | shutDownInternal(); 77 | } 78 | 79 | private static final class HistoryConnection extends AbstractRepositoryConnection { 80 | 81 | private final HistoryRepository repository; 82 | 83 | private HistoryConnection(HistoryRepository repository) { 84 | super(repository); 85 | this.repository = repository; 86 | } 87 | 88 | @Override 89 | protected void addWithoutCommit(Resource resource, IRI iri, Value value, Resource... resources) throws RepositoryException { 90 | throw new UnsupportedOperationException(); 91 | } 92 | 93 | @Override 94 | protected void removeWithoutCommit(Resource resource, IRI iri, Value value, Resource... 
resources) throws RepositoryException { 95 | throw new UnsupportedOperationException(); 96 | } 97 | 98 | @Override 99 | public Query prepareQuery(QueryLanguage ql, String query, String base) throws RepositoryException, MalformedQueryException { 100 | if (QueryLanguage.SPARQL.equals(ql)) { 101 | String strippedQuery = QueryParserUtil.removeSPARQLQueryProlog(query).toUpperCase(); 102 | if (strippedQuery.startsWith("SELECT")) { 103 | return prepareTupleQuery(ql, query, base); 104 | } else if (strippedQuery.startsWith("ASK")) { 105 | return prepareBooleanQuery(ql, query, base); 106 | } else { 107 | return prepareGraphQuery(ql, query, base); 108 | } 109 | } else { 110 | throw new UnsupportedOperationException("Unsupported query language " + ql); 111 | } 112 | } 113 | 114 | @Override 115 | public TupleQuery prepareTupleQuery(QueryLanguage ql, String query, String base) throws RepositoryException, MalformedQueryException { 116 | ParsedQuery parsedQuery = parseQuery(ql, query, base); 117 | if (parsedQuery instanceof ParsedTupleQuery) { 118 | return repository.queryPreparer.prepare((ParsedTupleQuery) parsedQuery); 119 | } else { 120 | throw new MalformedQueryException("Not supported query: " + parsedQuery.toString()); 121 | } 122 | } 123 | 124 | @Override 125 | public GraphQuery prepareGraphQuery(QueryLanguage ql, String query, String base) throws RepositoryException, MalformedQueryException { 126 | ParsedQuery parsedQuery = parseQuery(ql, query, base); 127 | if (parsedQuery instanceof ParsedGraphQuery) { 128 | return repository.queryPreparer.prepare((ParsedGraphQuery) parsedQuery); 129 | } else { 130 | throw new MalformedQueryException("Not supported query: " + parsedQuery.toString()); 131 | } 132 | } 133 | 134 | @Override 135 | public BooleanQuery prepareBooleanQuery(QueryLanguage ql, String query, String base) throws RepositoryException, MalformedQueryException { 136 | ParsedQuery parsedQuery = parseQuery(ql, query, base); 137 | if (parsedQuery instanceof ParsedBooleanQuery) { 138 | return repository.queryPreparer.prepare((ParsedBooleanQuery) parsedQuery); 139 | } else { 140 | throw new MalformedQueryException("Not supported query: " + parsedQuery.toString()); 141 | } 142 | } 143 | 144 | private ParsedQuery parseQuery(QueryLanguage ql, String query, String base) { 145 | if (SPARQL.equals(ql)) { 146 | try { 147 | return repository.queryParser.parseQuery(query, base); 148 | } catch (MalformedQueryException e) { 149 | throw new MalformedQueryException(e.getMessage() + "\nQuery:\n" + query, e); 150 | } 151 | } else { 152 | throw new UnsupportedQueryLanguageException("Unsupported query language " + ql); 153 | } 154 | } 155 | 156 | @Override 157 | public Update prepareUpdate(QueryLanguage queryLanguage, String s, String s1) throws RepositoryException, MalformedQueryException { 158 | throw new UnsupportedOperationException(); 159 | } 160 | 161 | @Override 162 | public RepositoryResult getContextIDs() throws RepositoryException { 163 | throw new UnsupportedOperationException(); 164 | } 165 | 166 | @Override 167 | public RepositoryResult getStatements(Resource subj, IRI pred, Value obj, boolean includeInferred, Resource... 
contexts) throws RepositoryException { 168 | return new RepositoryResult<>(new ExceptionConvertingIteration<>( 169 | repository.tripleSource.getStatements(subj, pred, obj, contexts) 170 | ) { 171 | @Override 172 | protected RepositoryException convert(Exception e) { 173 | return new RepositoryException(e); 174 | } 175 | }); 176 | } 177 | 178 | @Override 179 | public boolean hasStatement(Resource subj, IRI pred, Value obj, boolean includeInferred, Resource... contexts) throws RepositoryException { 180 | try (CloseableIteration statements = repository.tripleSource.getStatements(subj, pred, obj, contexts)) { 181 | return statements.hasNext(); 182 | } catch (QueryEvaluationException e) { 183 | throw new RepositoryException(e); 184 | } 185 | } 186 | 187 | @Override 188 | public void exportStatements(Resource subj, IRI pred, Value obj, boolean includeInferred, RDFHandler handler, Resource... contexts) throws RepositoryException, RDFHandlerException { 189 | } 190 | 191 | @Override 192 | public long size(Resource... contexts) throws RepositoryException { 193 | throw new UnsupportedOperationException(); 194 | } 195 | 196 | @Override 197 | public boolean isActive() throws RepositoryException { 198 | throw new UnsupportedOperationException(); 199 | } 200 | 201 | @Override 202 | public void begin() throws RepositoryException { 203 | throw new UnsupportedOperationException(); 204 | } 205 | 206 | @Override 207 | public void commit() throws RepositoryException { 208 | throw new UnsupportedOperationException(); 209 | } 210 | 211 | @Override 212 | public void rollback() throws RepositoryException { 213 | throw new UnsupportedOperationException(); 214 | } 215 | 216 | @Override 217 | public RepositoryResult getNamespaces() throws RepositoryException { 218 | return new RepositoryResult<>(new EmptyIteration<>()); 219 | } 220 | 221 | @Override 222 | public String getNamespace(String prefix) throws RepositoryException { 223 | return null; 224 | } 225 | 226 | @Override 227 | public void setNamespace(String prefix, String name) throws RepositoryException { 228 | throw new UnsupportedOperationException(); 229 | } 230 | 231 | @Override 232 | public void removeNamespace(String prefix) throws RepositoryException { 233 | throw new UnsupportedOperationException(); 234 | } 235 | 236 | @Override 237 | public void clearNamespaces() throws RepositoryException { 238 | throw new UnsupportedOperationException(); 239 | } 240 | } 241 | } 242 | -------------------------------------------------------------------------------- /src/main/java/org/wikidata/history/web/SparqlEndpoint.java: -------------------------------------------------------------------------------- 1 | package org.wikidata.history.web; 2 | 3 | import io.javalin.core.util.Header; 4 | import io.javalin.http.BadRequestResponse; 5 | import io.javalin.http.Context; 6 | import io.javalin.http.HttpResponseException; 7 | import io.javalin.http.InternalServerErrorResponse; 8 | import org.apache.http.NameValuePair; 9 | import org.apache.http.client.utils.URLEncodedUtils; 10 | import org.eclipse.rdf4j.common.lang.FileFormat; 11 | import org.eclipse.rdf4j.common.lang.service.FileFormatServiceRegistry; 12 | import org.eclipse.rdf4j.model.Model; 13 | import org.eclipse.rdf4j.model.Resource; 14 | import org.eclipse.rdf4j.model.ValueFactory; 15 | import org.eclipse.rdf4j.model.impl.SimpleValueFactory; 16 | import org.eclipse.rdf4j.model.impl.TreeModel; 17 | import org.eclipse.rdf4j.model.vocabulary.RDF; 18 | import org.eclipse.rdf4j.model.vocabulary.SD; 19 | import 
org.eclipse.rdf4j.query.*; 20 | import org.eclipse.rdf4j.query.algebra.evaluation.QueryPreparer; 21 | import org.eclipse.rdf4j.query.algebra.evaluation.TripleSource; 22 | import org.eclipse.rdf4j.query.parser.*; 23 | import org.eclipse.rdf4j.query.parser.sparql.SPARQLParser; 24 | import org.eclipse.rdf4j.query.resultio.BooleanQueryResultWriterFactory; 25 | import org.eclipse.rdf4j.query.resultio.BooleanQueryResultWriterRegistry; 26 | import org.eclipse.rdf4j.query.resultio.TupleQueryResultWriterFactory; 27 | import org.eclipse.rdf4j.query.resultio.TupleQueryResultWriterRegistry; 28 | import org.eclipse.rdf4j.rio.RDFWriterFactory; 29 | import org.eclipse.rdf4j.rio.RDFWriterRegistry; 30 | import org.eclipse.rdf4j.rio.Rio; 31 | import org.slf4j.Logger; 32 | import org.slf4j.LoggerFactory; 33 | import org.wikidata.history.sparql.SimpleQueryPreparer; 34 | 35 | import java.io.IOException; 36 | import java.io.OutputStream; 37 | import java.io.PipedInputStream; 38 | import java.io.PipedOutputStream; 39 | import java.nio.charset.StandardCharsets; 40 | import java.util.List; 41 | import java.util.concurrent.ExecutorService; 42 | import java.util.concurrent.Executors; 43 | import java.util.function.BiConsumer; 44 | import java.util.stream.Collectors; 45 | 46 | class SparqlEndpoint { 47 | private static final int QUERY_TIMOUT_IN_S = 60 * 5; 48 | private static final Logger LOGGER = LoggerFactory.getLogger(SparqlEndpoint.class); 49 | 50 | private final QueryParser queryParser = new SPARQLParser(); 51 | private final QueryPreparer queryPreparer; 52 | private final ExecutorService executorService = Executors.newCachedThreadPool(); 53 | private final QueryLogger queryLogger; 54 | 55 | SparqlEndpoint(TripleSource tripleSource, QueryLogger queryLogger) { 56 | queryPreparer = new SimpleQueryPreparer(tripleSource); 57 | this.queryLogger = queryLogger; 58 | } 59 | 60 | void get(Context context) { 61 | String query = context.queryParam("query"); 62 | if (query == null) { 63 | executeDescription(context); 64 | } else { 65 | executeQuery(query, context); 66 | } 67 | } 68 | 69 | void post(Context context) { 70 | String contentType = context.contentType(); 71 | if (contentType != null) { 72 | contentType = contentType.split(";")[0].trim(); 73 | } 74 | if ("application/x-www-form-urlencoded".equals(contentType)) { 75 | executeQuery(URLEncodedUtils.parse(context.body(), StandardCharsets.UTF_8).stream() 76 | .filter(t -> t.getName().trim().equals("query")) 77 | .map(NameValuePair::getValue) 78 | .findAny() 79 | .orElseThrow(() -> new BadRequestResponse("The 'query' urlencoded parameter is mandatory")), 80 | context); 81 | } else if ("application/sparql-query".equals(contentType)) { 82 | executeQuery(context.body(), context); 83 | } else { 84 | throw new BadRequestResponse("Unexpected Content-Type: " + contentType); 85 | } 86 | } 87 | 88 | private void executeDescription(Context context) { 89 | outputWithFormat(RDFWriterRegistry.getInstance(), context, (service, outputStream) -> 90 | Rio.write(getServiceDescription(), service.getWriter(outputStream)) 91 | ); 92 | } 93 | 94 | private void executeQuery(String query, Context context) { 95 | ParsedQuery parsedQuery; 96 | try { 97 | parsedQuery = queryParser.parseQuery(query, null); 98 | } catch (MalformedQueryException e) { 99 | throw new BadRequestResponse(e.getMessage()); 100 | } 101 | queryLogger.logQuery(parsedQuery.getSourceString()); 102 | if (parsedQuery instanceof ParsedBooleanQuery) { 103 | evaluateBooleanQuery((ParsedBooleanQuery) parsedQuery, context); 104 | } 
else if (parsedQuery instanceof ParsedGraphQuery) { 105 | evaluateGraphQuery((ParsedGraphQuery) parsedQuery, context); 106 | } else if (parsedQuery instanceof ParsedTupleQuery) { 107 | evaluateTupleQuery((ParsedTupleQuery) parsedQuery, context); 108 | } else { 109 | throw new BadRequestResponse("Unsupported kind of query: " + parsedQuery); 110 | } 111 | 112 | } 113 | 114 | private void evaluateBooleanQuery(ParsedBooleanQuery parsedQuery, Context context) { 115 | outputWithFormat(BooleanQueryResultWriterRegistry.getInstance(), context, (service, outputStream) -> { 116 | try { 117 | BooleanQuery query = queryPreparer.prepare(parsedQuery); 118 | query.setMaxExecutionTime(QUERY_TIMOUT_IN_S); 119 | service.getWriter(outputStream).handleBoolean(query.evaluate()); 120 | } catch (QueryEvaluationException e) { 121 | LOGGER.info(e.getMessage(), e); 122 | throw new InternalServerErrorResponse(e.getMessage()); 123 | } 124 | } 125 | ); 126 | } 127 | 128 | private void evaluateGraphQuery(ParsedGraphQuery parsedQuery, Context context) { 129 | outputWithFormat(RDFWriterRegistry.getInstance(), context, (service, outputStream) -> { 130 | try { 131 | GraphQuery query = queryPreparer.prepare(parsedQuery); 132 | query.setMaxExecutionTime(QUERY_TIMOUT_IN_S); 133 | query.evaluate(service.getWriter(outputStream)); 134 | } catch (QueryEvaluationException e) { 135 | LOGGER.info(e.getMessage(), e); 136 | throw new InternalServerErrorResponse(e.getMessage()); 137 | } 138 | }); 139 | } 140 | 141 | private void evaluateTupleQuery(ParsedTupleQuery parsedQuery, Context context) { 142 | outputWithFormat(TupleQueryResultWriterRegistry.getInstance(), context, (service, outputStream) -> { 143 | try { 144 | TupleQuery query = queryPreparer.prepare(parsedQuery); 145 | query.setMaxExecutionTime(QUERY_TIMOUT_IN_S); 146 | query.evaluate(service.getWriter(outputStream)); 147 | } catch (QueryEvaluationException e) { 148 | LOGGER.info(e.getMessage(), e); 149 | throw new InternalServerErrorResponse(e.getMessage()); 150 | } 151 | }); 152 | } 153 | 154 | private void outputWithFormat(FileFormatServiceRegistry writerRegistry, Context context, BiConsumer addToOutput) { 155 | List accepted = writerRegistry.getKeys().stream().flatMap(k -> k.getMIMETypes().stream()).collect(Collectors.toList()); 156 | String mimeType; 157 | try { 158 | mimeType = ContentNegotiation.negotiateAccept(context.header(Header.ACCEPT), accepted) 159 | .orElseThrow(() -> new NotAcceptableResponse("No acceptable result format found. 
Accepted format are: " + accepted)); 160 | } catch (IllegalArgumentException e) { 161 | throw new BadRequestResponse(e.getMessage()); 162 | } 163 | 164 | FF fileFormat = writerRegistry.getFileFormatForMIMEType(mimeType).orElseThrow(() -> { 165 | LOGGER.error("Not able to retrieve writer for " + mimeType); 166 | return new InternalServerErrorResponse("Not able to retrieve writer for " + mimeType); 167 | }); 168 | S service = writerRegistry.get(fileFormat).orElseThrow(() -> { 169 | LOGGER.error("Unable to write " + fileFormat); 170 | return new InternalServerErrorResponse("Unable to write " + fileFormat); 171 | }); 172 | 173 | try { 174 | PipedOutputStream outputStream = new PipedOutputStream(); 175 | PipedInputStream inputStream = new PipedInputStream(outputStream); 176 | context.contentType(mimeType); 177 | context.result(inputStream); 178 | executorService.submit(() -> { 179 | try { 180 | addToOutput.accept(service, outputStream); 181 | } catch (HttpResponseException e) { 182 | try { 183 | context.status(e.getStatus()); 184 | context.contentType("text/plain"); 185 | outputStream.write(e.getMessage().getBytes()); 186 | } catch (IOException e1) { 187 | LOGGER.error(e.getMessage(), e); 188 | } 189 | } finally { 190 | try { 191 | outputStream.close(); 192 | } catch (IOException e) { 193 | LOGGER.error(e.getMessage(), e); 194 | } 195 | } 196 | }); 197 | } catch (IOException e) { 198 | LOGGER.error(e.getMessage(), e); 199 | throw new InternalServerErrorResponse(); 200 | } 201 | } 202 | 203 | private Model getServiceDescription() { 204 | ValueFactory valueFactory = SimpleValueFactory.getInstance(); 205 | Model model = new TreeModel(); 206 | 207 | Resource service = valueFactory.createBNode(); 208 | model.add(service, RDF.TYPE, SD.SERVICE); 209 | //TODO model.add(service, SD.ENDPOINT, ) 210 | model.add(service, SD.FEATURE_PROPERTY, SD.BASIC_FEDERATED_QUERY); 211 | model.add(service, SD.SUPPORTED_LANGUAGE, SD.SPARQL_10_QUERY); 212 | model.add(service, SD.SUPPORTED_LANGUAGE, SD.SPARQL_11_QUERY); 213 | 214 | for (TupleQueryResultWriterFactory queryResultWriterFactory : TupleQueryResultWriterRegistry.getInstance().getAll()) { 215 | Resource formatIRI = queryResultWriterFactory.getTupleQueryResultFormat().getStandardURI(); 216 | if (formatIRI != null) { 217 | model.add(service, SD.RESULT_FORMAT, formatIRI); 218 | } 219 | } 220 | for (BooleanQueryResultWriterFactory queryResultWriterFactory : BooleanQueryResultWriterRegistry.getInstance().getAll()) { 221 | Resource formatIRI = queryResultWriterFactory.getBooleanQueryResultFormat().getStandardURI(); 222 | if (formatIRI != null) { 223 | model.add(service, SD.RESULT_FORMAT, formatIRI); 224 | } 225 | } 226 | for (RDFWriterFactory formatWriterFactory : RDFWriterRegistry.getInstance().getAll()) { 227 | Resource formatIRI = formatWriterFactory.getRDFFormat().getStandardURI(); 228 | if (formatIRI != null) { 229 | model.add(service, SD.RESULT_FORMAT, formatIRI); 230 | } 231 | } 232 | 233 | return model; 234 | } 235 | } 236 | -------------------------------------------------------------------------------- /src/test/java/org/wikidata/history/sparql/RocksTripleSourceTest.java: -------------------------------------------------------------------------------- 1 | package org.wikidata.history.sparql; 2 | 3 | import org.eclipse.rdf4j.common.iteration.CloseableIteration; 4 | import org.eclipse.rdf4j.model.IRI; 5 | import org.eclipse.rdf4j.model.Resource; 6 | import org.eclipse.rdf4j.model.Statement; 7 | import org.eclipse.rdf4j.model.ValueFactory; 8 | import 
org.eclipse.rdf4j.model.impl.SimpleValueFactory; 9 | import org.junit.jupiter.api.Assertions; 10 | import org.junit.jupiter.api.BeforeEach; 11 | import org.junit.jupiter.api.Test; 12 | 13 | import java.io.IOException; 14 | import java.nio.file.Files; 15 | import java.nio.file.Path; 16 | import java.util.Arrays; 17 | import java.util.List; 18 | 19 | class RocksTripleSourceTest { 20 | 21 | private static final ValueFactory VALUE_FACTORY = SimpleValueFactory.getInstance(); 22 | private static final List STATEMENTS = Arrays.asList( 23 | VALUE_FACTORY.createStatement( 24 | VALUE_FACTORY.createIRI(Vocabulary.WD_NAMESPACE, "Q42"), 25 | VALUE_FACTORY.createIRI(Vocabulary.WDT_NAMESPACE, "P31"), 26 | VALUE_FACTORY.createIRI(Vocabulary.WD_NAMESPACE, "Q5"), 27 | VALUE_FACTORY.createIRI(Vocabulary.REVISION_NAMESPACE, "42") 28 | ), 29 | VALUE_FACTORY.createStatement( 30 | VALUE_FACTORY.createIRI(Vocabulary.WD_NAMESPACE, "Q42"), 31 | VALUE_FACTORY.createIRI(Vocabulary.WDT_NAMESPACE, "P21"), 32 | VALUE_FACTORY.createIRI(Vocabulary.WD_NAMESPACE, "Q6581097"), 33 | VALUE_FACTORY.createIRI(Vocabulary.REVISION_NAMESPACE, "42") 34 | ), 35 | VALUE_FACTORY.createStatement( 36 | VALUE_FACTORY.createIRI(Vocabulary.WD_NAMESPACE, "Q42"), 37 | VALUE_FACTORY.createIRI(Vocabulary.WDT_NAMESPACE, "P735"), 38 | VALUE_FACTORY.createIRI(Vocabulary.WD_NAMESPACE, "Q463035"), 39 | VALUE_FACTORY.createIRI(Vocabulary.REVISION_NAMESPACE, "42") 40 | ), 41 | VALUE_FACTORY.createStatement( 42 | VALUE_FACTORY.createIRI(Vocabulary.WD_NAMESPACE, "Q42"), 43 | VALUE_FACTORY.createIRI(Vocabulary.WDT_NAMESPACE, "P735"), 44 | VALUE_FACTORY.createIRI(Vocabulary.WD_NAMESPACE, "Q19688263"), 45 | VALUE_FACTORY.createIRI(Vocabulary.REVISION_NAMESPACE, "42") 46 | ) 47 | ); 48 | 49 | private final Path tempDir; 50 | 51 | RocksTripleSourceTest() throws IOException { 52 | tempDir = Files.createTempDirectory(null); 53 | Files.deleteIfExists(tempDir); 54 | } 55 | 56 | @BeforeEach 57 | void setUpBeforeClass() throws NotSupportedValueException { 58 | try (RocksStore store = new RocksStore(tempDir, false)) { 59 | NumericValueFactory factory = new NumericValueFactory(store.getReadWriteStringStore()); 60 | for (Statement statement : STATEMENTS) { 61 | long s = factory.encodeValue(statement.getSubject()); 62 | long p = factory.encodeValue(statement.getPredicate()); 63 | long o = factory.encodeValue(statement.getObject()); 64 | long revision = Long.parseLong(((IRI) statement.getContext()).getLocalName()); 65 | long[] value = new long[]{revision, revision + 1}; 66 | store.spoStatementIndex().put(new long[]{s, p, o}, value); 67 | store.posStatementIndex().put(new long[]{p, o, s}, value); 68 | store.ospStatementIndex().put(new long[]{o, s, p}, value); 69 | addToStatementListIndex(store.insertedStatementIndex(), revision, new long[]{s, p, o}); 70 | addToStatementListIndex(store.deletedStatementIndex(), revision + 1, new long[]{s, p, o}); 71 | } 72 | } 73 | } 74 | 75 | private static void addToStatementListIndex(RocksStore.Index index, long key, long[] triple) { 76 | long[] existingTriples = index.get(key); 77 | long[] newTriples = (existingTriples == null) ? 
triple : TripleArrayUtils.addToSortedArray(existingTriples, triple); 78 | if (newTriples != existingTriples) { 79 | index.put(key, newTriples); 80 | } 81 | } 82 | 83 | @Test 84 | void testTriplePattern() { 85 | try (RocksTripleSource tripleSource = new RocksTripleSource(tempDir)) { 86 | assertLength(tripleSource.getStatements(null, null, null), 8); 87 | assertLength(tripleSource.getStatements(VALUE_FACTORY.createIRI(Vocabulary.WD_NAMESPACE, "Q42"), null, null), 8); 88 | assertLength(tripleSource.getStatements(VALUE_FACTORY.createIRI(Vocabulary.WD_NAMESPACE, "Q42"), VALUE_FACTORY.createIRI(Vocabulary.WDT_NAMESPACE, "P735"), null), 4); 89 | assertLength(tripleSource.getStatements(VALUE_FACTORY.createIRI(Vocabulary.WD_NAMESPACE, "Q42"), VALUE_FACTORY.createIRI(Vocabulary.WDT_NAMESPACE, "P735"), VALUE_FACTORY.createIRI(Vocabulary.WD_NAMESPACE, "Q463035")), 2); 90 | assertLength(tripleSource.getStatements(VALUE_FACTORY.createIRI(Vocabulary.WD_NAMESPACE, "Q42"), null, VALUE_FACTORY.createIRI(Vocabulary.WD_NAMESPACE, "Q463035")), 2); 91 | assertLength(tripleSource.getStatements(null, VALUE_FACTORY.createIRI(Vocabulary.WDT_NAMESPACE, "P735"), null), 4); 92 | assertLength(tripleSource.getStatements(null, VALUE_FACTORY.createIRI(Vocabulary.WDT_NAMESPACE, "P735"), VALUE_FACTORY.createIRI(Vocabulary.WD_NAMESPACE, "Q463035")), 2); 93 | assertLength(tripleSource.getStatements(null, null, VALUE_FACTORY.createIRI(Vocabulary.WD_NAMESPACE, "Q463035")), 2); 94 | 95 | Resource[] insertionRevision = new Resource[]{VALUE_FACTORY.createIRI(Vocabulary.REVISION_ADDITIONS_NAMESPACE, "42")}; 96 | assertLength(tripleSource.getStatements(null, null, null, insertionRevision), 4); 97 | assertLength(tripleSource.getStatements(VALUE_FACTORY.createIRI(Vocabulary.WD_NAMESPACE, "Q42"), null, null, insertionRevision), 4); 98 | assertLength(tripleSource.getStatements(VALUE_FACTORY.createIRI(Vocabulary.WD_NAMESPACE, "Q42"), VALUE_FACTORY.createIRI(Vocabulary.WDT_NAMESPACE, "P735"), null, insertionRevision), 2); 99 | assertLength(tripleSource.getStatements(VALUE_FACTORY.createIRI(Vocabulary.WD_NAMESPACE, "Q42"), VALUE_FACTORY.createIRI(Vocabulary.WDT_NAMESPACE, "P735"), VALUE_FACTORY.createIRI(Vocabulary.WD_NAMESPACE, "Q463035"), insertionRevision), 1); 100 | assertLength(tripleSource.getStatements(VALUE_FACTORY.createIRI(Vocabulary.WD_NAMESPACE, "Q42"), null, VALUE_FACTORY.createIRI(Vocabulary.WD_NAMESPACE, "Q463035"), insertionRevision), 1); 101 | assertLength(tripleSource.getStatements(null, VALUE_FACTORY.createIRI(Vocabulary.WDT_NAMESPACE, "P735"), null, insertionRevision), 2); 102 | assertLength(tripleSource.getStatements(null, VALUE_FACTORY.createIRI(Vocabulary.WDT_NAMESPACE, "P735"), VALUE_FACTORY.createIRI(Vocabulary.WD_NAMESPACE, "Q463035"), insertionRevision), 1); 103 | assertLength(tripleSource.getStatements(null, null, VALUE_FACTORY.createIRI(Vocabulary.WD_NAMESPACE, "Q463035"), insertionRevision), 1); 104 | 105 | Resource[] globalState1Revision = new Resource[]{VALUE_FACTORY.createIRI(Vocabulary.REVISION_GLOBAL_STATE_NAMESPACE, "42")}; 106 | assertLength(tripleSource.getStatements(null, null, null, globalState1Revision), 4); 107 | assertLength(tripleSource.getStatements(VALUE_FACTORY.createIRI(Vocabulary.WD_NAMESPACE, "Q42"), null, null, globalState1Revision), 4); 108 | assertLength(tripleSource.getStatements(VALUE_FACTORY.createIRI(Vocabulary.WD_NAMESPACE, "Q42"), VALUE_FACTORY.createIRI(Vocabulary.WDT_NAMESPACE, "P735"), null, globalState1Revision), 2); 109 | 
assertLength(tripleSource.getStatements(VALUE_FACTORY.createIRI(Vocabulary.WD_NAMESPACE, "Q42"), VALUE_FACTORY.createIRI(Vocabulary.WDT_NAMESPACE, "P735"), VALUE_FACTORY.createIRI(Vocabulary.WD_NAMESPACE, "Q463035"), globalState1Revision), 1); 110 | assertLength(tripleSource.getStatements(VALUE_FACTORY.createIRI(Vocabulary.WD_NAMESPACE, "Q42"), null, VALUE_FACTORY.createIRI(Vocabulary.WD_NAMESPACE, "Q463035"), globalState1Revision), 1); 111 | assertLength(tripleSource.getStatements(null, VALUE_FACTORY.createIRI(Vocabulary.WDT_NAMESPACE, "P735"), null, globalState1Revision), 2); 112 | assertLength(tripleSource.getStatements(null, VALUE_FACTORY.createIRI(Vocabulary.WDT_NAMESPACE, "P735"), VALUE_FACTORY.createIRI(Vocabulary.WD_NAMESPACE, "Q463035"), globalState1Revision), 1); 113 | assertLength(tripleSource.getStatements(null, null, VALUE_FACTORY.createIRI(Vocabulary.WD_NAMESPACE, "Q463035"), globalState1Revision), 1); 114 | 115 | Resource[] globalState2Revision = new Resource[]{VALUE_FACTORY.createIRI(Vocabulary.REVISION_GLOBAL_STATE_NAMESPACE, "43")}; 116 | assertLength(tripleSource.getStatements(null, null, null, globalState2Revision), 0); 117 | assertLength(tripleSource.getStatements(VALUE_FACTORY.createIRI(Vocabulary.WD_NAMESPACE, "Q42"), null, null, globalState2Revision), 0); 118 | assertLength(tripleSource.getStatements(VALUE_FACTORY.createIRI(Vocabulary.WD_NAMESPACE, "Q42"), VALUE_FACTORY.createIRI(Vocabulary.WDT_NAMESPACE, "P735"), null, globalState2Revision), 0); 119 | assertLength(tripleSource.getStatements(VALUE_FACTORY.createIRI(Vocabulary.WD_NAMESPACE, "Q42"), VALUE_FACTORY.createIRI(Vocabulary.WDT_NAMESPACE, "P735"), VALUE_FACTORY.createIRI(Vocabulary.WD_NAMESPACE, "Q463035"), globalState2Revision), 0); 120 | assertLength(tripleSource.getStatements(VALUE_FACTORY.createIRI(Vocabulary.WD_NAMESPACE, "Q42"), null, VALUE_FACTORY.createIRI(Vocabulary.WD_NAMESPACE, "Q463035"), globalState2Revision), 0); 121 | assertLength(tripleSource.getStatements(null, VALUE_FACTORY.createIRI(Vocabulary.WDT_NAMESPACE, "P735"), null, globalState2Revision), 0); 122 | assertLength(tripleSource.getStatements(null, VALUE_FACTORY.createIRI(Vocabulary.WDT_NAMESPACE, "P735"), VALUE_FACTORY.createIRI(Vocabulary.WD_NAMESPACE, "Q463035"), globalState2Revision), 0); 123 | assertLength(tripleSource.getStatements(null, null, VALUE_FACTORY.createIRI(Vocabulary.WD_NAMESPACE, "Q463035"), globalState2Revision), 0); 124 | 125 | Resource[] deletionRevision = new Resource[]{VALUE_FACTORY.createIRI(Vocabulary.REVISION_DELETIONS_NAMESPACE, "43")}; 126 | assertLength(tripleSource.getStatements(null, null, null, deletionRevision), 4); 127 | assertLength(tripleSource.getStatements(VALUE_FACTORY.createIRI(Vocabulary.WD_NAMESPACE, "Q42"), null, null, deletionRevision), 4); 128 | assertLength(tripleSource.getStatements(VALUE_FACTORY.createIRI(Vocabulary.WD_NAMESPACE, "Q42"), VALUE_FACTORY.createIRI(Vocabulary.WDT_NAMESPACE, "P735"), null, deletionRevision), 2); 129 | assertLength(tripleSource.getStatements(VALUE_FACTORY.createIRI(Vocabulary.WD_NAMESPACE, "Q42"), VALUE_FACTORY.createIRI(Vocabulary.WDT_NAMESPACE, "P735"), VALUE_FACTORY.createIRI(Vocabulary.WD_NAMESPACE, "Q463035"), deletionRevision), 1); 130 | assertLength(tripleSource.getStatements(VALUE_FACTORY.createIRI(Vocabulary.WD_NAMESPACE, "Q42"), null, VALUE_FACTORY.createIRI(Vocabulary.WD_NAMESPACE, "Q463035"), deletionRevision), 1); 131 | assertLength(tripleSource.getStatements(null, VALUE_FACTORY.createIRI(Vocabulary.WDT_NAMESPACE, "P735"), null, deletionRevision), 
2); 132 | assertLength(tripleSource.getStatements(null, VALUE_FACTORY.createIRI(Vocabulary.WDT_NAMESPACE, "P735"), VALUE_FACTORY.createIRI(Vocabulary.WD_NAMESPACE, "Q463035"), deletionRevision), 1); 133 | assertLength(tripleSource.getStatements(null, null, VALUE_FACTORY.createIRI(Vocabulary.WD_NAMESPACE, "Q463035"), deletionRevision), 1); 134 | } 135 | } 136 | 137 | private static void assertLength(CloseableIteration iteration, int length) throws E { 138 | try (CloseableIteration iter = iteration) { 139 | int count = 0; 140 | while (iter.hasNext()) { 141 | count++; 142 | iter.next(); 143 | } 144 | Assertions.assertEquals(length, count); 145 | } 146 | } 147 | } 148 | -------------------------------------------------------------------------------- /src/test/resources/entities/Q7.json: -------------------------------------------------------------------------------- 1 | { 2 | "id": "Q7", 3 | "type": "item", 4 | "claims": { 5 | "P7": [ 6 | { 7 | "id": "TEST-References", 8 | "mainsnak": { 9 | "snaktype": "value", 10 | "property": "P7", 11 | "datatype": "string", 12 | "datavalue": { 13 | "value": "string", 14 | "type": "string" 15 | } 16 | }, 17 | "type": "statement", 18 | "rank": "normal", 19 | "references": [ 20 | { 21 | "snaks": { 22 | "P2": [ 23 | { 24 | "snaktype": "value", 25 | "property": "P2", 26 | "datatype": "wikibase-entityid", 27 | "datavalue": { 28 | "value": { 29 | "entity-type": "item", 30 | "numeric-id": 42 31 | }, 32 | "type": "wikibase-entityid" 33 | } 34 | }, 35 | { 36 | "snaktype": "value", 37 | "property": "P2", 38 | "datatype": "wikibase-entityid", 39 | "datavalue": { 40 | "value": { 41 | "entity-type": "item", 42 | "numeric-id": 666 43 | }, 44 | "type": "wikibase-entityid" 45 | } 46 | } 47 | ], 48 | "P18": [ 49 | { 50 | "snaktype": "value", 51 | "property": "P18", 52 | "datatype": "commonsMedia", 53 | "datavalue": { 54 | "value": "Universe.svg", 55 | "type": "string" 56 | } 57 | }, 58 | { 59 | "snaktype": "novalue", 60 | "property": "P18" 61 | } 62 | ], 63 | "P4": [ 64 | { 65 | "snaktype": "value", 66 | "property": "P4", 67 | "datatype": "globecoordinate", 68 | "datavalue": { 69 | "value": { 70 | "latitude": 12.125, 71 | "longitude": 67.25, 72 | "precision": 0.0625, 73 | "globe": "http:\/\/www.wikidata.org\/entity\/Q2" 74 | }, 75 | "type": "globecoordinate" 76 | } 77 | } 78 | ], 79 | "P5": [ 80 | { 81 | "snaktype": "value", 82 | "property": "P5", 83 | "datatype": "monolingualtext", 84 | "datavalue": { 85 | "value": { 86 | "text": "\u043f\u0440\u0435\u0432\u0435\u0434", 87 | "language": "ru" 88 | }, 89 | "type": "monolingualtext" 90 | } 91 | }, 92 | { 93 | "snaktype": "somevalue", 94 | "property": "P5" 95 | }, 96 | { 97 | "snaktype": "value", 98 | "property": "P5", 99 | "datatype": "monolingualtext", 100 | "datavalue": { 101 | "value": { 102 | "text": "\u0431\u0440\u0435\u0434", 103 | "language": "ru" 104 | }, 105 | "type": "monolingualtext" 106 | } 107 | } 108 | ], 109 | "P6": [ 110 | { 111 | "snaktype": "value", 112 | "property": "P6", 113 | "datatype": "quantity", 114 | "datavalue": { 115 | "value": { 116 | "amount": "+19.768000000000000682121026329696178436279296875", 117 | "unit": "1", 118 | "upperBound": "+19.76899999999999835154085303656756877899169921875", 119 | "lowerBound": "+19.766999999999999459987520822323858737945556640625" 120 | }, 121 | "type": "quantity" 122 | } 123 | } 124 | ], 125 | "P7": [ 126 | { 127 | "snaktype": "value", 128 | "property": "P7", 129 | "datatype": "string", 130 | "datavalue": { 131 | "value": "simplestring", 132 | "type": "string" 133 | 
} 134 | } 135 | ], 136 | "P8": [ 137 | { 138 | "snaktype": "value", 139 | "property": "P8", 140 | "datatype": "time", 141 | "datavalue": { 142 | "value": { 143 | "time": "-0200-00-00T00:00:00Z", 144 | "timezone": 0, 145 | "before": 0, 146 | "after": 0, 147 | "precision": 9, 148 | "calendarmodel": "http:\/\/www.wikidata.org\/entity\/Q1985727" 149 | }, 150 | "type": "time" 151 | } 152 | } 153 | ], 154 | "P856": [ 155 | { 156 | "snaktype": "value", 157 | "property": "P856", 158 | "datatype": "url", 159 | "datavalue": { 160 | "value": "http:\/\/url.acme.test\/", 161 | "type": "string" 162 | } 163 | } 164 | ] 165 | }, 166 | "snaks-order": [ 167 | "P2", 168 | "P18", 169 | "P4", 170 | "P5", 171 | "P6", 172 | "P7", 173 | "P8", 174 | "P856" 175 | ] 176 | } 177 | ] 178 | }, 179 | { 180 | "id": "TEST-References-2", 181 | "mainsnak": { 182 | "snaktype": "value", 183 | "property": "P7", 184 | "datatype": "string", 185 | "datavalue": { 186 | "value": "string2", 187 | "type": "string" 188 | } 189 | }, 190 | "type": "statement", 191 | "rank": "normal", 192 | "references": [ 193 | { 194 | "snaks": { 195 | "P2": [ 196 | { 197 | "snaktype": "value", 198 | "property": "P2", 199 | "datatype": "wikibase-entityid", 200 | "datavalue": { 201 | "value": { 202 | "entity-type": "item", 203 | "numeric-id": 42 204 | }, 205 | "type": "wikibase-entityid" 206 | } 207 | }, 208 | { 209 | "snaktype": "value", 210 | "property": "P2", 211 | "datatype": "wikibase-entityid", 212 | "datavalue": { 213 | "value": { 214 | "entity-type": "item", 215 | "numeric-id": 666 216 | }, 217 | "type": "wikibase-entityid" 218 | } 219 | } 220 | ], 221 | "P18": [ 222 | { 223 | "snaktype": "value", 224 | "property": "P18", 225 | "datatype": "commonsMedia", 226 | "datavalue": { 227 | "value": "Universe.svg", 228 | "type": "string" 229 | } 230 | }, 231 | { 232 | "snaktype": "novalue", 233 | "property": "P18" 234 | } 235 | ], 236 | "P4": [ 237 | { 238 | "snaktype": "value", 239 | "property": "P4", 240 | "datatype": "globecoordinate", 241 | "datavalue": { 242 | "value": { 243 | "latitude": 12.125, 244 | "longitude": 67.25, 245 | "precision": 0.0625, 246 | "globe": "http:\/\/www.wikidata.org\/entity\/Q2" 247 | }, 248 | "type": "globecoordinate" 249 | } 250 | } 251 | ], 252 | "P5": [ 253 | { 254 | "snaktype": "value", 255 | "property": "P5", 256 | "datatype": "monolingualtext", 257 | "datavalue": { 258 | "value": { 259 | "text": "\u043f\u0440\u0435\u0432\u0435\u0434", 260 | "language": "ru" 261 | }, 262 | "type": "monolingualtext" 263 | } 264 | }, 265 | { 266 | "snaktype": "somevalue", 267 | "property": "P5" 268 | }, 269 | { 270 | "snaktype": "value", 271 | "property": "P5", 272 | "datatype": "monolingualtext", 273 | "datavalue": { 274 | "value": { 275 | "text": "\u0431\u0440\u0435\u0434", 276 | "language": "ru" 277 | }, 278 | "type": "monolingualtext" 279 | } 280 | } 281 | ], 282 | "P6": [ 283 | { 284 | "snaktype": "value", 285 | "property": "P6", 286 | "datatype": "quantity", 287 | "datavalue": { 288 | "value": { 289 | "amount": "+19.768000000000000682121026329696178436279296875", 290 | "unit": "1", 291 | "upperBound": "+19.76899999999999835154085303656756877899169921875", 292 | "lowerBound": "+19.766999999999999459987520822323858737945556640625" 293 | }, 294 | "type": "quantity" 295 | } 296 | } 297 | ], 298 | "P7": [ 299 | { 300 | "snaktype": "value", 301 | "property": "P7", 302 | "datatype": "string", 303 | "datavalue": { 304 | "value": "simplestring", 305 | "type": "string" 306 | } 307 | } 308 | ], 309 | "P8": [ 310 | { 311 | "snaktype": "value", 
312 | "property": "P8", 313 | "datatype": "time", 314 | "datavalue": { 315 | "value": { 316 | "time": "-0200-00-00T00:00:00Z", 317 | "timezone": 0, 318 | "before": 0, 319 | "after": 0, 320 | "precision": 9, 321 | "calendarmodel": "http:\/\/www.wikidata.org\/entity\/Q1985727" 322 | }, 323 | "type": "time" 324 | } 325 | } 326 | ], 327 | "P856": [ 328 | { 329 | "snaktype": "value", 330 | "property": "P856", 331 | "datatype": "url", 332 | "datavalue": { 333 | "value": "http:\/\/url.acme.test\/", 334 | "type": "string" 335 | } 336 | } 337 | ] 338 | }, 339 | "snaks-order": [ 340 | "P2", 341 | "P18", 342 | "P4", 343 | "P5", 344 | "P6", 345 | "P7", 346 | "P8", 347 | "P856" 348 | ] 349 | } 350 | ] 351 | } 352 | ] 353 | } 354 | } 355 | -------------------------------------------------------------------------------- /src/test/resources/rdf/Q4.nt: -------------------------------------------------------------------------------- 1 | . 2 | . 3 | . 4 | "test-external-identifier" . 5 | . 6 | . 7 | "Point(67.25 12.125)"^^ . 8 | _:genid1 . 9 | "\u043F\u0440\u0435\u0432\u0435\u0434"@ru . 10 | "+19.768000000000000682121026329696178436279296875"^^ . 11 | "simplestring" . 12 | "-0200-00-00T00:00:00Z"^^ . 13 | . 14 | . 15 | . 16 | . 17 | . 18 | . 19 | . 20 | . 21 | . 22 | . 23 | . 24 | . 25 | . 26 | . 27 | . 28 | . 29 | . 30 | . 31 | . 32 | . 33 | . 34 | . 35 | "test-external-identifier" . 36 | . 37 | . 38 | . 39 | . 40 | . 41 | . 42 | . 43 | . 44 | . 45 | . 46 | . 47 | . 48 | . 49 | . 50 | . 51 | . 52 | . 53 | . 54 | "Point(67.25 12.125)"^^ . 55 | . 56 | . 57 | . 58 | . 59 | _:genid1 . 60 | . 61 | . 62 | . 63 | "\u043F\u0440\u0435\u0432\u0435\u0434"@ru . 64 | . 65 | . 66 | "\u0431\u0440\u0435\u0434"@ru . 67 | . 68 | . 69 | . 70 | "+19.768000000000000682121026329696178436279296875"^^ . 71 | . 72 | . 73 | . 74 | . 75 | "simplestring" . 76 | . 77 | . 78 | . 79 | "-0200-00-00T00:00:00Z"^^ . 80 | . 81 | . 82 | . 83 | . 84 | . 85 | . 86 | "12.125"^^ . 87 | "67.25"^^ . 88 | "0.0625"^^ . 89 | . 90 | "+19.768000000000000682121026329696178436279296875"^^ . 91 | "+19.766999999999999459987520822323858737945556640625"^^ . 92 | . 93 | "+19.76899999999999835154085303656756877899169921875"^^ . 94 | . 95 | . 96 | "9"^^ . 97 | "0"^^ . 98 | "-0200-00-00T00:00:00Z"^^ . 99 | . 100 | --------------------------------------------------------------------------------