├── .gitignore ├── LICENSE ├── LICENSES_THIRD_PARTY ├── README.md ├── TODO.md ├── pom.xml └── src ├── main ├── java │ └── com │ │ └── merck │ │ └── rdf2x │ │ ├── beans │ │ ├── IndexMap.java │ │ ├── Instance.java │ │ ├── Predicate.java │ │ ├── RelationPredicate.java │ │ ├── RelationRow.java │ │ └── TypeID.java │ │ ├── config │ │ └── StorageLevelConverter.java │ │ ├── flavors │ │ ├── Bio2RDFFlavor.java │ │ ├── DefaultFlavor.java │ │ ├── Flavor.java │ │ ├── FlavorFactory.java │ │ └── WikidataFlavor.java │ │ ├── jobs │ │ ├── CommandConfig.java │ │ ├── JobFactory.java │ │ ├── convert │ │ │ ├── ConvertConfig.java │ │ │ └── ConvertJob.java │ │ └── stats │ │ │ ├── StatsConfig.java │ │ │ └── StatsJob.java │ │ ├── main │ │ ├── Main.java │ │ └── ReadmeUtils.java │ │ ├── persistence │ │ ├── InstanceRelationWriter.java │ │ ├── InstanceRelationWriterConfig.java │ │ ├── MetadataWriter.java │ │ ├── config │ │ │ ├── DbConfig.java │ │ │ ├── ElasticSearchConfig.java │ │ │ ├── FileConfig.java │ │ │ └── OutputConfig.java │ │ ├── output │ │ │ ├── CSVPersistor.java │ │ │ ├── DataFrameMapPersistor.java │ │ │ ├── DbPersistor.java │ │ │ ├── DbPersistorPostgres.java │ │ │ ├── DbPersistorSQLServer.java │ │ │ ├── ElasticSearchPersistor.java │ │ │ ├── JSONPersistor.java │ │ │ ├── PersistException.java │ │ │ ├── Persistor.java │ │ │ ├── PersistorFactory.java │ │ │ └── PreviewPersistor.java │ │ └── schema │ │ │ ├── EntityColumn.java │ │ │ ├── EntityProperty.java │ │ │ ├── EntitySchema.java │ │ │ ├── EntityTable.java │ │ │ ├── RelationEntityFilter.java │ │ │ ├── RelationPredicateFilter.java │ │ │ ├── RelationSchema.java │ │ │ └── RelationTable.java │ │ ├── processing │ │ ├── aggregating │ │ │ ├── InstanceAggregator.java │ │ │ └── InstanceAggregatorConfig.java │ │ ├── filtering │ │ │ ├── InstanceFilter.java │ │ │ ├── InstanceFilterConfig.java │ │ │ ├── QuadFilter.java │ │ │ └── QuadFilterConfig.java │ │ ├── formatting │ │ │ ├── FormatUtil.java │ │ │ ├── SchemaFormatter.java │ │ │ └── 
SchemaFormatterConfig.java │ │ ├── indexing │ │ │ ├── GlobalInstanceIndexer.java │ │ │ └── InstanceIndexer.java │ │ ├── partitioning │ │ │ ├── InstancePartitioner.java │ │ │ └── InstancePartitionerConfig.java │ │ ├── relations │ │ │ └── RelationExtractor.java │ │ └── schema │ │ │ ├── EntitySchemaCollector.java │ │ │ ├── EntitySchemaCollectorConfig.java │ │ │ ├── RelationConfig.java │ │ │ ├── RelationSchemaCollector.java │ │ │ └── RelationSchemaStrategy.java │ │ ├── rdf │ │ ├── LiteralType.java │ │ ├── QuadUtils.java │ │ ├── parsing │ │ │ ├── ElephasQuadParser.java │ │ │ ├── ParseErrorHandling.java │ │ │ ├── QuadParser.java │ │ │ └── QuadParserConfig.java │ │ └── schema │ │ │ ├── ClassGraph.java │ │ │ ├── RdfSchema.java │ │ │ ├── RdfSchemaCollector.java │ │ │ └── RdfSchemaCollectorConfig.java │ │ ├── spark │ │ └── SparkContextProvider.java │ │ └── stats │ │ └── QuadCounter.java └── resources │ ├── examples │ ├── czechia.ttl │ ├── example1.ttl │ ├── exampleMultipleTypes.ttl │ ├── fruit.nq │ ├── locations.ttl │ ├── multivalued.ttl │ └── wikidata.nt │ ├── log4j.properties │ └── test │ └── datasets │ ├── aggregatorTest.nq │ ├── convertJobTest.nq │ ├── filtering │ ├── input.nq │ ├── resource0.nq │ ├── resource1.nq │ ├── resource2.nq │ ├── resource3.nq │ ├── type0.nq │ ├── type1.nq │ ├── type2.nq │ └── type3.nq │ ├── parserTest.nq │ ├── parserTest.ttl │ ├── rdfSchemaCollectorTest.nt │ └── wikidataFlavorTest.nt └── test └── java └── com └── merck └── rdf2x ├── jobs ├── JobFactoryTest.java └── convert │ └── ConvertJobTest.java ├── persistence ├── InstanceRelationWriterTest.java ├── MetadataWriterTest.java └── PersistorTest.java ├── processing ├── aggregating │ └── InstanceAggregatorTest.java ├── filtering │ ├── InstanceFilterTest.java │ └── QuadFilterTest.java ├── formatting │ └── SchemaFormatterTest.java ├── relations │ └── RelationExtractorTest.java └── schema │ ├── EntitySchemaCollectorTest.java │ └── RelationSchemaCollectorTest.java ├── rdf ├── parsing │ └── 
ElephasQuadParserTest.java └── schema │ └── RdfSchemaCollectorTest.java └── test ├── TestSparkContextProvider.java └── TestUtils.java /.gitignore: -------------------------------------------------------------------------------- 1 | *.class 2 | 3 | # Target dir 4 | target/ 5 | 6 | # Dev dir 7 | dev/ 8 | 9 | # Mobile Tools for Java (J2ME) 10 | .mtj.tmp/ 11 | 12 | # Package Files # 13 | *.jar 14 | *.war 15 | *.ear 16 | 17 | # virtual machine crash logs, see http://www.java.com/en/download/help/error_hotspot.xml 18 | hs_err_pid* 19 | 20 | # Idea project 21 | .idea/ 22 | *.iml 23 | 24 | # Jupyter notebook 25 | .ipynb_checkpoints/ -------------------------------------------------------------------------------- /LICENSES_THIRD_PARTY: -------------------------------------------------------------------------------- 1 | RDF2X uses third-party libraries which may be distributed under different 2 | licenses. We have attempted to list all of these third party libraries and 3 | their licenses below (however the most up-to-date information can be found 4 | via Maven, see pom.xml) 5 | 6 | You must agree to the terms of these licenses, in addition to the RDF2X 7 | source code license, in order to use this software. 
8 | 9 | -------------------------------------------------- 10 | Third party Java libraries listed by License type 11 | [Format: Name (Maven Project) - URL] 12 | -------------------------------------------------- 13 | 14 | BSD License (http://www.opensource.org/licenses/BSD-3-Clause) 15 | * PostgreSQL driver (org.postgresql:postgresql:*) - http://postgresql.org 16 | 17 | Apache Software License, Version 2.0 (http://opensource.org/licenses/apache2.0) 18 | * Apache Jena (org.apache.jena:*) - https://jena.apache.org/ 19 | * JCommander (com.beust:jcommander:*) - http://jcommander.org/ 20 | * Spark CSV (com.databricks:spark-csv_2.10:*) - https://github.com/databricks/spark-csv 21 | * Elasticsearch Spark (org.elasticsearch:elasticsearch-spark_2.10:*) - https://www.elastic.co/ 22 | * Apache Commons (org.apache.commons:*) - https://commons.apache.org/ 23 | * Spark Testing Base (com.holdenkarau:spark-testing-base_2.10:*) - https://github.com/holdenk/spark-testing-base 24 | * Apache Spark (org.apache.spark:*) - http://spark.apache.org/ 25 | 26 | Eclipse Distribution License - v 1.0 (https://eclipse.org/org/documents/edl-v10.php) 27 | * RDF4J: Runtime (org.eclipse.rdf4j:*) - http://rdf4j.org/ 28 | 29 | Eclipse Public License - v 1.0 (https://www.eclipse.org/legal/epl-v10.html) 30 | * JGraphT Core (org.jgrapht:jgrapht-core:*) - http://jgrapht.org/ 31 | * JUnit (junit:*) - http://junit.org/junit4/ 32 | 33 | The MIT License - (https://opensource.org/licenses/MIT) 34 | * Mockito (org.mockito:*) - http://site.mockito.org/ 35 | * Project Lombok (org.projectlombok:*) - https://projectlombok.org/ 36 | -------------------------------------------------------------------------------- /TODO.md: -------------------------------------------------------------------------------- 1 | # TODO 2 | 3 | ## Priority 4 | 5 | - Enable array columns (for JSON) 6 | - Show jCommander error message at the end of output 7 | 8 | ## Parsing 9 | 10 | - Save all parse errors 11 | 12 | ## Formatting 13 | 14 | - 
Fix formatting clash with language suffix being present already in uri e.g. /name-en = "abc" VS name = "def"@en 15 | - Use domain names and other substrings for resolving name conflicts instead of numeric suffixes 16 | 17 | ## Instances 18 | - Support for blank nodes 19 | - Saving errors separately 20 | - Merging sameAs resources, types, properties 21 | - Detect 1:n relationships 22 | - entities.ignoredType - types that don't create a table (but the instances are preserved if they also have an other type) 23 | 24 | ## Testing 25 | 26 | - Formatting names from labels 27 | - Skipping columns already stored in superclass 28 | - Writing indexes 29 | - Convert job with filtered type 30 | 31 | ## Persistence 32 | 33 | - Save instance types in EAV table 34 | - Check if table is already present and ignore creating indexes with SaveMode=Ignore to avoid log errors 35 | 36 | ## Performance 37 | 38 | - Persist relations to single table and then move the data using SQL 39 | - Encode instance predicate as long, use fastutil library Long2ObjectMap 40 | 41 | ## Schema 42 | 43 | - Separate schema extraction and data load 44 | - all references in schema represented by URIs, not indexes 45 | - save entity and relation schema to xml file (owl? what about properties with multiple languages / types?) 46 | - xml file can be edited by hand (e.g. each relation table) 47 | - load instances using the schema 48 | 49 | ## README 50 | 51 | - Sparse values example 52 | -------------------------------------------------------------------------------- /src/main/java/com/merck/rdf2x/beans/IndexMap.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2017 Merck Sharp & Dohme Corp. a subsidiary of Merck & Co., 3 | * Inc., Kenilworth, NJ, USA. 4 | * 5 | * Licensed under the Apache License, Version 2.0 (the "License"); 6 | * you may not use this file except in compliance with the License. 
7 | * You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package com.merck.rdf2x.beans; 19 | 20 | import java.io.Serializable; 21 | import java.util.*; 22 | import java.util.concurrent.atomic.AtomicInteger; 23 | import java.util.function.Function; 24 | import java.util.stream.Collectors; 25 | 26 | /** 27 | * IndexMap defines classes that represent values of any type as integers, starting from 0. 28 | * 29 | * @param The value to be represented as integer 30 | */ 31 | public class IndexMap implements Serializable { 32 | /** 33 | * value array - mapping indexes to values, starting from 0 34 | */ 35 | private final ArrayList values; 36 | /** 37 | * value map - mapping values to indexes, starting from 0 38 | */ 39 | private Map indexMap; 40 | 41 | /** 42 | * @param values list of values to represent, will be used directly 43 | */ 44 | public IndexMap(ArrayList values) { 45 | this.values = values; 46 | this.indexMap = createValueMap(values); 47 | } 48 | 49 | /** 50 | * @param values list of values to represent, deep copy will be created 51 | */ 52 | public IndexMap(List values) { 53 | this(new ArrayList<>(values)); 54 | } 55 | 56 | private Map createValueMap(Collection values) { 57 | AtomicInteger index = new AtomicInteger(); 58 | return values.stream() 59 | .collect(Collectors.toMap(Function.identity(), v -> index.getAndIncrement())); 60 | } 61 | 62 | 63 | /** 64 | * Get integer representation of a value 65 | * 66 | * @param value the value to be represented 67 | * @return integer representation (index) of the value 68 | */ 69 | public Integer getIndex(V value) { 70 
| if (!indexMap.containsKey(value)) { 71 | // alert when element is not present right away to avoid null pointer errors 72 | throw new NullPointerException("Map does not contain value, were all values properly initialized?"); 73 | } 74 | return indexMap.get(value); 75 | } 76 | 77 | /** 78 | * Get value for a specific index 79 | * 80 | * @param index value index (the integer representation of the value) 81 | * @return value for the specified index 82 | */ 83 | public V getValue(int index) { 84 | return values.get(index); 85 | } 86 | 87 | /** 88 | * Get size of the map 89 | * 90 | * @return size of the map 91 | */ 92 | public int size() { 93 | return values.size(); 94 | } 95 | 96 | /** 97 | * Get list of all represented values 98 | * 99 | * @return set of all represented values 100 | */ 101 | public ArrayList getValues() { 102 | return values; 103 | } 104 | 105 | /** 106 | * Get set of all represented values 107 | * 108 | * @return set of all represented values 109 | */ 110 | public Set getValueSet() { 111 | return indexMap.keySet(); 112 | } 113 | 114 | /** 115 | * Get collection of all used indexes 116 | * 117 | * @return collection of all used indexes 118 | */ 119 | public Collection getIndex() { 120 | return indexMap.values(); 121 | } 122 | } 123 | -------------------------------------------------------------------------------- /src/main/java/com/merck/rdf2x/beans/Instance.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2017 Merck Sharp & Dohme Corp. a subsidiary of Merck & Co., 3 | * Inc., Kenilworth, NJ, USA. 4 | * 5 | * Licensed under the Apache License, Version 2.0 (the "License"); 6 | * you may not use this file except in compliance with the License. 
7 | * You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package com.merck.rdf2x.beans; 19 | 20 | import lombok.Data; 21 | import org.apache.jena.sparql.core.Quad; 22 | 23 | import java.io.Serializable; 24 | import java.util.Collection; 25 | import java.util.HashMap; 26 | import java.util.HashSet; 27 | import java.util.Set; 28 | 29 | /** 30 | * Instance stores all properties of a RDF resource. Or in other words, a {@link Instance} represents content from all {@link Quad}s with the same instance URI. 31 | *
32 | * An Instance is defined by:
33 | * <ul>
34 | * <li>URI of the resource</li>
35 | * <li>ID (optional)</li>
36 | * <li>set of type URIs</li>
37 | * <li>set of {@link RelationPredicate} defining the Instance's relations</li>
38 | * <li>map of literal values ({@link Predicate} -> valueObject). The valueObject can be either a single value (String, Integer, ...) or a set of these</li>
39 | * </ul>
40 | */ 41 | @Data 42 | public class Instance implements Serializable { 43 | 44 | /** 45 | * URI of the resource. If instance is a blank node, its identifier is used instead. 46 | */ 47 | private String uri; 48 | /** 49 | * ID (optional) 50 | */ 51 | private Long id; 52 | /** 53 | * set of type URIs 54 | */ 55 | private final Set types; 56 | /** 57 | * set of {@link RelationPredicate} defining the Instance's relations 58 | */ 59 | private final Set relations; 60 | /** 61 | * map of literal values ({@link Predicate} -> Object) 62 | */ 63 | private final HashMap literalValues; 64 | 65 | /** 66 | * Default constructor. Start with empty types, relations and literal values. 67 | */ 68 | public Instance() { 69 | literalValues = new HashMap<>(); 70 | types = new HashSet<>(); 71 | relations = new HashSet<>(); 72 | } 73 | 74 | public void setType(Integer type) { 75 | this.types.clear(); 76 | addType(type); 77 | } 78 | 79 | public void addType(Integer type) { 80 | this.types.add(type); 81 | } 82 | 83 | public void addTypes(Collection types) { 84 | this.types.addAll(types); 85 | } 86 | 87 | public Set getRelations() { 88 | return relations; 89 | } 90 | 91 | public void setRelation(RelationPredicate relation) { 92 | this.relations.clear(); 93 | addRelation(relation); 94 | } 95 | 96 | public void addRelation(RelationPredicate relation) { 97 | this.relations.add(relation); 98 | } 99 | 100 | public void addRelations(Collection relations) { 101 | this.relations.addAll(relations); 102 | } 103 | 104 | public void putLiteralValue(Predicate predicate, Object value) { 105 | literalValues.put(predicate, value); 106 | } 107 | 108 | public Set getLiteralPredicates() { 109 | return literalValues.keySet(); 110 | } 111 | 112 | public Object getLiteralValue(Predicate key) { 113 | return literalValues.get(key); 114 | } 115 | 116 | public boolean hasType() { 117 | return !types.isEmpty(); 118 | } 119 | } 120 | -------------------------------------------------------------------------------- 
/src/main/java/com/merck/rdf2x/beans/Predicate.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2017 Merck Sharp & Dohme Corp. a subsidiary of Merck & Co., 3 | * Inc., Kenilworth, NJ, USA. 4 | * 5 | * Licensed under the Apache License, Version 2.0 (the "License"); 6 | * you may not use this file except in compliance with the License. 7 | * You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package com.merck.rdf2x.beans; 19 | 20 | import com.merck.rdf2x.rdf.LiteralType; 21 | import lombok.Data; 22 | import lombok.NonNull; 23 | 24 | import java.io.Serializable; 25 | 26 | /** 27 | * Predicate is used as a key in maps of {@link Instance} values. 28 | *
<p>
29 | * It is defined by a URI and a {@link LiteralType}. 30 | */ 31 | @Data 32 | public class Predicate implements Serializable { 33 | /** 34 | * Predicate index representing the Predicate URI String 35 | */ 36 | @NonNull 37 | private final Integer predicateIndex; 38 | /** 39 | * Type of the literal value that this Predicate will reference 40 | */ 41 | private final int literalType; 42 | /** 43 | * Language of the literal value that this Predicate will reference (optional) 44 | */ 45 | private final String language; 46 | 47 | /** 48 | * Initialize a predicate 49 | * 50 | * @param predicateIndex Predicate index representing the Predicate URI String 51 | * @param literalType Type of the literal value that this Predicate will reference 52 | * @param language Language of the literal value that this Predicate will reference (optional) 53 | */ 54 | public Predicate(@NonNull Integer predicateIndex, int literalType, String language) { 55 | this.predicateIndex = predicateIndex; 56 | this.literalType = literalType; 57 | this.language = language; 58 | } 59 | 60 | /** 61 | * Initialize a predicate with null language 62 | * 63 | * @param predicateIndex Predicate index representing the Predicate URI String 64 | * @param literalType Type of the literal value that this Predicate will reference 65 | */ 66 | public Predicate(Integer predicateIndex, int literalType) { 67 | this(predicateIndex, literalType, null); 68 | } 69 | 70 | @Override 71 | public String toString() { 72 | return "Predicate" + predicateIndex + '(' + LiteralType.toString(literalType) + ')' + (language == null ? "" : "@" + language); 73 | } 74 | } 75 | -------------------------------------------------------------------------------- /src/main/java/com/merck/rdf2x/beans/RelationPredicate.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2017 Merck Sharp & Dohme Corp. a subsidiary of Merck & Co., 3 | * Inc., Kenilworth, NJ, USA. 
4 | * 5 | * Licensed under the Apache License, Version 2.0 (the "License"); 6 | * you may not use this file except in compliance with the License. 7 | * You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package com.merck.rdf2x.beans; 19 | 20 | import lombok.Data; 21 | 22 | import java.io.Serializable; 23 | 24 | /** 25 | * RelationPredicate defines a relation to a specific object. 26 | * It is defined by the predicate URI (type of relation) and the object URI (target of relation). 27 | */ 28 | @Data 29 | public class RelationPredicate implements Serializable { 30 | /** 31 | * index of the predicate representing this relation 32 | */ 33 | private final Integer predicateIndex; 34 | /** 35 | * URI of the target resource of this relation 36 | */ 37 | private final String objectURI; 38 | 39 | } 40 | -------------------------------------------------------------------------------- /src/main/java/com/merck/rdf2x/beans/RelationRow.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2017 Merck Sharp & Dohme Corp. a subsidiary of Merck & Co., 3 | * Inc., Kenilworth, NJ, USA. 4 | * 5 | * Licensed under the Apache License, Version 2.0 (the "License"); 6 | * you may not use this file except in compliance with the License. 
7 | * You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package com.merck.rdf2x.beans; 19 | 20 | import lombok.Data; 21 | 22 | import java.io.Serializable; 23 | 24 | /** 25 | * RelationRow defines a single row in a relation table. 26 | *
<p>
27 | * It is defined by a predicate URI, a from {@link TypeID} (source) and a to {@link TypeID} (target). 28 | */ 29 | @Data 30 | public class RelationRow implements Serializable { 31 | /** 32 | * predicate index (represents type of the relationship) 33 | */ 34 | private final Integer predicateIndex; 35 | /** 36 | * source {@link TypeID} 37 | */ 38 | private final TypeID from; 39 | /** 40 | * target {@link TypeID} 41 | */ 42 | private final TypeID to; 43 | 44 | @Override 45 | public String toString() { 46 | return "RelationRow{(Predicate" + predicateIndex + ") " + 47 | from + " => " + to + 48 | '}'; 49 | } 50 | } 51 | -------------------------------------------------------------------------------- /src/main/java/com/merck/rdf2x/beans/TypeID.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2017 Merck Sharp & Dohme Corp. a subsidiary of Merck & Co., 3 | * Inc., Kenilworth, NJ, USA. 4 | * 5 | * Licensed under the Apache License, Version 2.0 (the "License"); 6 | * you may not use this file except in compliance with the License. 7 | * You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package com.merck.rdf2x.beans; 19 | 20 | import lombok.Data; 21 | 22 | import java.io.Serializable; 23 | 24 | /** 25 | * TypeID defines a reference to a single {@link Instance} in an entity table. 26 | *
<p>
27 | * It is defined by the Instance's type URI and its ID. 28 | */ 29 | @Data 30 | public class TypeID implements Serializable { 31 | /** 32 | * index of the Instance's type 33 | */ 34 | private final Integer typeIndex; 35 | /** 36 | * ID of the Instance 37 | */ 38 | private final Long id; 39 | 40 | @Override 41 | public String toString() { 42 | return "Type" + typeIndex + "(ID" + id + ")"; 43 | } 44 | 45 | } 46 | -------------------------------------------------------------------------------- /src/main/java/com/merck/rdf2x/config/StorageLevelConverter.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2017 Merck Sharp & Dohme Corp. a subsidiary of Merck & Co., 3 | * Inc., Kenilworth, NJ, USA. 4 | * 5 | * Licensed under the Apache License, Version 2.0 (the "License"); 6 | * you may not use this file except in compliance with the License. 7 | * You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package com.merck.rdf2x.config; 19 | 20 | import com.beust.jcommander.IStringConverter; 21 | import org.apache.spark.storage.StorageLevel; 22 | 23 | /** 24 | * StorageLevelConverter converts a String config value to a {@link StorageLevel} value. 
25 | */ 26 | public class StorageLevelConverter implements IStringConverter { 27 | @Override 28 | public StorageLevel convert(String value) { 29 | return StorageLevel.fromString(value); 30 | } 31 | } -------------------------------------------------------------------------------- /src/main/java/com/merck/rdf2x/flavors/Bio2RDFFlavor.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2017 Merck Sharp & Dohme Corp. a subsidiary of Merck & Co., 3 | * Inc., Kenilworth, NJ, USA. 4 | * 5 | * Licensed under the Apache License, Version 2.0 (the "License"); 6 | * you may not use this file except in compliance with the License. 7 | * You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 
16 | */ 17 | 18 | package com.merck.rdf2x.flavors; 19 | 20 | import com.merck.rdf2x.processing.formatting.FormatUtil; 21 | 22 | /** 23 | * Bio2RDFFlavor adds some changes that might be desirable when converting Bio2RDF datasets 24 | */ 25 | public class Bio2RDFFlavor implements Flavor { 26 | /** 27 | * Format table or column name from URI and label 28 | * 29 | * @param URI URI to format into name 30 | * @param label rdfs:label or null if not available 31 | * @param maxLength maximum length of formatted name 32 | * @return formatted name or null if custom formatting function is not available 33 | */ 34 | @Override 35 | public String formatName(String URI, String label, Integer maxLength) { 36 | // URIs for resources will be formatted as prefix_resource 37 | // for example http://bio2rdf.org/drugbank_vocabulary:Resource -> drugbank_resource 38 | final String RESOURCE_SUFFIX = "_vocabulary:Resource"; 39 | if (URI.endsWith(RESOURCE_SUFFIX)) { 40 | return FormatUtil.getCleanURISuffix(URI.replace(RESOURCE_SUFFIX, "_resource"), "/", maxLength); 41 | } 42 | return null; 43 | } 44 | } 45 | -------------------------------------------------------------------------------- /src/main/java/com/merck/rdf2x/flavors/DefaultFlavor.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2017 Merck Sharp & Dohme Corp. a subsidiary of Merck & Co., 3 | * Inc., Kenilworth, NJ, USA. 4 | * 5 | * Licensed under the Apache License, Version 2.0 (the "License"); 6 | * you may not use this file except in compliance with the License. 7 | * You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package com.merck.rdf2x.flavors; 19 | 20 | /** 21 | * DefaultFlavor is a flavor that does not modify the behavior 22 | */ 23 | public final class DefaultFlavor implements Flavor { 24 | 25 | } 26 | -------------------------------------------------------------------------------- /src/main/java/com/merck/rdf2x/flavors/Flavor.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2017 Merck Sharp & Dohme Corp. a subsidiary of Merck & Co., 3 | * Inc., Kenilworth, NJ, USA. 4 | * 5 | * Licensed under the Apache License, Version 2.0 (the "License"); 6 | * you may not use this file except in compliance with the License. 7 | * You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package com.merck.rdf2x.flavors; 19 | 20 | import com.merck.rdf2x.jobs.convert.ConvertConfig; 21 | import com.merck.rdf2x.jobs.convert.ConvertJob; 22 | import com.merck.rdf2x.jobs.stats.StatsConfig; 23 | import com.merck.rdf2x.jobs.stats.StatsJob; 24 | import org.apache.jena.sparql.core.Quad; 25 | import org.apache.spark.api.java.JavaRDD; 26 | 27 | import java.io.Serializable; 28 | 29 | /** 30 | * Modifiers are classes used to introduce specific settings and methods for different data sources. 
31 | */ 32 | public interface Flavor extends Serializable { 33 | 34 | /** 35 | * Set default values for {@link StatsJob} 36 | * 37 | * @param config config to update 38 | */ 39 | default void setDefaultValues(StatsConfig config) { 40 | 41 | } 42 | 43 | /** 44 | * Set default values for {@link ConvertJob} 45 | * 46 | * @param config config to update 47 | */ 48 | default void setDefaultValues(ConvertConfig config) { 49 | 50 | } 51 | 52 | /** 53 | * Modify RDD of quads in any needed way (filtering, flatMapping, ...) 54 | * 55 | * @param quads RDD of quads to modify 56 | * @return modified RDD of quads, returns original RDD in default 57 | */ 58 | default JavaRDD modifyQuads(JavaRDD quads) { 59 | return quads; 60 | } 61 | 62 | /** 63 | * Format table or column name from URI and label 64 | * 65 | * @param URI URI to format into name 66 | * @param label rdfs:label or null if not available 67 | * @param maxLength maximum length of formatted name 68 | * @return formatted name or null if custom formatting function is not available 69 | */ 70 | default String formatName(String URI, String label, Integer maxLength) { 71 | return null; 72 | } 73 | } 74 | -------------------------------------------------------------------------------- /src/main/java/com/merck/rdf2x/flavors/FlavorFactory.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2017 Merck Sharp & Dohme Corp. a subsidiary of Merck & Co., 3 | * Inc., Kenilworth, NJ, USA. 4 | * 5 | * Licensed under the Apache License, Version 2.0 (the "License"); 6 | * you may not use this file except in compliance with the License. 7 | * You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package com.merck.rdf2x.flavors; 19 | 20 | /** 21 | * FlavorFactory creates new instances of flavors from their String names. 22 | */ 23 | public class FlavorFactory { 24 | 25 | /** 26 | * creates new Flavor instance from its String name or {@link DefaultFlavor} if input was null 27 | * 28 | * @param flavor name of the flavor without the "Flavor" suffix, e.g. 'Wikidata' 29 | * @return new Flavor instance or {@link DefaultFlavor} if input was null 30 | */ 31 | public static Flavor fromString(String flavor) { 32 | if (flavor == null) { 33 | return new DefaultFlavor(); 34 | } 35 | 36 | try { 37 | return (Flavor) Class.forName("com.merck.rdf2x.flavors." + flavor + "Flavor").newInstance(); 38 | } catch (InstantiationException | IllegalAccessException | ClassNotFoundException e) { 39 | throw new IllegalArgumentException("Unrecognized flavor (case-sensitive): " + flavor, e); 40 | } 41 | } 42 | } 43 | -------------------------------------------------------------------------------- /src/main/java/com/merck/rdf2x/flavors/WikidataFlavor.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2017 Merck Sharp & Dohme Corp. a subsidiary of Merck & Co., 3 | * Inc., Kenilworth, NJ, USA. 4 | * 5 | * Licensed under the Apache License, Version 2.0 (the "License"); 6 | * you may not use this file except in compliance with the License. 7 | * You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 
16 | */ 17 | 18 | package com.merck.rdf2x.flavors; 19 | 20 | import com.google.common.collect.Sets; 21 | import com.merck.rdf2x.jobs.convert.ConvertConfig; 22 | import com.merck.rdf2x.jobs.convert.ConvertJob; 23 | import com.merck.rdf2x.processing.schema.RelationSchemaStrategy; 24 | import org.apache.jena.graph.NodeFactory; 25 | import org.apache.jena.sparql.core.Quad; 26 | import org.apache.spark.api.java.JavaRDD; 27 | import org.eclipse.rdf4j.model.vocabulary.RDFS; 28 | 29 | import java.util.Arrays; 30 | import java.util.Collections; 31 | 32 | /** 33 | * WikidataFlavor provides default settings and methods to improve conversion of the Wikidata RDF dump 34 | */ 35 | public class WikidataFlavor implements Flavor { 36 | 37 | private static final String WIKIDATA_PREFIX = "http://www.wikidata.org/"; 38 | private static final String ENTITY_PREFIX = WIKIDATA_PREFIX + "entity/"; 39 | private static final String PROPERTY_ENTITY_PREFIX = ENTITY_PREFIX + "P"; 40 | private static final String PROPERTY_DIRECT_PREFIX = "http://www.wikidata.org/prop/direct/P"; 41 | private static final String PROPERTY_STATEMENT_PREFIX = "http://www.wikidata.org/prop/statement/P"; 42 | private static final String PROPERTY_QUALIFIER_PREFIX = "http://www.wikidata.org/prop/qualifier/P"; 43 | 44 | /** 45 | * Set default values for {@link ConvertJob} 46 | * 47 | * @param config config to update 48 | */ 49 | @Override 50 | public void setDefaultValues(ConvertConfig config) { 51 | config.getRdfSchemaCollectorConfig() 52 | .setSubclassPredicates(Arrays.asList( 53 | "http://www.wikidata.org/prop/direct/P279" // subclass of 54 | )) 55 | .setTypePredicates(Arrays.asList( 56 | "http://www.wikidata.org/prop/direct/P31", // instance of 57 | "http://www.wikidata.org/prop/direct/P279" // subclass of - consider subclasses to also be instances of the class (to include instances such as Piano, which does not have 'instance of' information) 58 | )); 59 | config.getSchemaFormatterConfig() 60 | .setUseLabels(true) 61 
| .setMaxTableNameLength(50); 62 | 63 | config.getRelationConfig() 64 | .setSchemaStrategy(RelationSchemaStrategy.Predicates); 65 | } 66 | 67 | /** 68 | * Modify RDD of quads in any needed way (filtering, flatMapping, ...) 69 | * 70 | * @param quads RDD of quads to modify 71 | * @return modified RDD of quads, returns original RDD in default 72 | */ 73 | @Override 74 | public JavaRDD modifyQuads(JavaRDD quads) { 75 | final String labelURI = RDFS.LABEL.toString(); 76 | return quads.flatMap(quad -> { 77 | if (quad.getSubject().isURI()) { 78 | String subjectURI = quad.getSubject().getURI(); 79 | // for each quad specifying property label, create label quads for each URI variant of this property 80 | // done because Wikidata only provides entity labels, for example http://www.wikidata.org/entity/P279 and not http://www.wikidata.org/prop/direct/P279 81 | if (subjectURI.contains(PROPERTY_ENTITY_PREFIX) && quad.getPredicate().getURI().equals(labelURI)) { 82 | return Sets.newHashSet( 83 | quad, 84 | new Quad(quad.getGraph(), 85 | NodeFactory.createURI(subjectURI.replace(PROPERTY_ENTITY_PREFIX, PROPERTY_DIRECT_PREFIX)), 86 | quad.getPredicate(), 87 | quad.getObject()), 88 | new Quad(quad.getGraph(), 89 | NodeFactory.createURI(subjectURI.replace(PROPERTY_ENTITY_PREFIX, PROPERTY_STATEMENT_PREFIX)), 90 | quad.getPredicate(), 91 | quad.getObject()), 92 | new Quad(quad.getGraph(), 93 | NodeFactory.createURI(subjectURI.replace(PROPERTY_ENTITY_PREFIX, PROPERTY_QUALIFIER_PREFIX)), 94 | quad.getPredicate(), 95 | quad.getObject()) 96 | ); 97 | } 98 | } 99 | 100 | return Collections.singleton(quad); 101 | }); 102 | } 103 | 104 | 105 | } 106 | -------------------------------------------------------------------------------- /src/main/java/com/merck/rdf2x/jobs/CommandConfig.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2017 Merck Sharp & Dohme Corp. a subsidiary of Merck & Co., 3 | * Inc., Kenilworth, NJ, USA. 
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package com.merck.rdf2x.jobs;

import com.beust.jcommander.Parameter;
import lombok.Data;

/**
 * CommandConfig stores the top-level command line parameters shared by all commands.
 */
@Data
public class CommandConfig {
    // marked help=true so jCommander does not fail on missing required params when --help is present
    @Parameter(names = "--help", description = "Show usage page", help = true)
    private boolean help = false;
}
--------------------------------------------------------------------------------
/src/main/java/com/merck/rdf2x/jobs/JobFactory.java:
--------------------------------------------------------------------------------
/*
 * Copyright 2017 Merck Sharp & Dohme Corp. a subsidiary of Merck & Co.,
 * Inc., Kenilworth, NJ, USA.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
16 | */ 17 | 18 | package com.merck.rdf2x.jobs; 19 | 20 | import com.beust.jcommander.JCommander; 21 | import com.beust.jcommander.ParameterException; 22 | import com.merck.rdf2x.flavors.Flavor; 23 | import com.merck.rdf2x.flavors.FlavorFactory; 24 | import com.merck.rdf2x.jobs.convert.ConvertConfig; 25 | import com.merck.rdf2x.jobs.convert.ConvertJob; 26 | import com.merck.rdf2x.jobs.stats.StatsConfig; 27 | import com.merck.rdf2x.jobs.stats.StatsJob; 28 | import com.merck.rdf2x.spark.SparkContextProvider; 29 | import lombok.extern.slf4j.Slf4j; 30 | import org.apache.spark.api.java.JavaSparkContext; 31 | 32 | import javax.naming.ConfigurationException; 33 | 34 | /** 35 | * JobFactory creates {@link Runnable} jobs from command-line arguments. 36 | *

37 | * Uses jCommander to load config property values. 38 | */ 39 | @Slf4j 40 | public class JobFactory { 41 | /** 42 | * create a {@link Runnable} job from command-line arguments, print usage and return null on errors. 43 | * 44 | * @param args command-line arguments 45 | * @return a new {@link Runnable} job, null if errors occurred 46 | * @throws ConfigurationException thrown in case config is not valid 47 | */ 48 | public static Runnable getJob(String args[]) throws ConfigurationException { 49 | return getJob(args, null); 50 | } 51 | 52 | /** 53 | * create a {@link Runnable} job from command-line arguments, print usage and return null on errors. 54 | * 55 | * @param args command-line arguments 56 | * @param flavor flavor providing default values for config properties 57 | * @return a new {@link Runnable} job, null if errors occurred 58 | * @throws ConfigurationException thrown in case config is not valid 59 | */ 60 | public static Runnable getJob(String args[], Flavor flavor) throws ConfigurationException { 61 | 62 | // create a jCommander instance 63 | CommandConfig config = new CommandConfig(); 64 | JCommander jc = new JCommander(config); 65 | jc.setProgramName("rdf2x"); 66 | 67 | // register the 'convert' command 68 | ConvertConfig convertConfig = new ConvertConfig(); 69 | jc.addCommand("convert", convertConfig); 70 | // register the 'stats' command 71 | StatsConfig statsConfig = new StatsConfig(); 72 | jc.addCommand("stats", statsConfig); 73 | 74 | if (flavor != null) { 75 | flavor.setDefaultValues(convertConfig); 76 | flavor.setDefaultValues(statsConfig); 77 | } 78 | 79 | try { 80 | jc.parse(args); 81 | } catch (ParameterException e) { 82 | jc.usage(); 83 | System.err.println(e.getMessage()); 84 | return null; 85 | } 86 | 87 | String command = jc.getParsedCommand(); 88 | // print usage if no command is provided 89 | if (config.isHelp() || command == null) { 90 | jc.usage(); 91 | return null; 92 | } 93 | 94 | try { 95 | // return the corresponding convert 
job 96 | switch (command) { 97 | case "convert": 98 | // if flavor is null, create a correct one and run again 99 | // necessary because default values have to be specified before parsing 100 | if (flavor == null) { 101 | flavor = FlavorFactory.fromString(convertConfig.getFlavor()); 102 | return getJob(args, flavor); 103 | } 104 | if (convertConfig.isHelp()) { 105 | jc.usage(); 106 | return null; 107 | } 108 | convertConfig.validate(); 109 | JavaSparkContext scConvert = SparkContextProvider.provide(); 110 | return new ConvertJob(convertConfig, scConvert, flavor); 111 | case "stats": 112 | if (statsConfig.isHelp()) { 113 | jc.usage(); 114 | return null; 115 | } 116 | JavaSparkContext scStats = SparkContextProvider.provide(); 117 | return new StatsJob(statsConfig, scStats); 118 | } 119 | } catch (ConfigurationException e) { 120 | jc.usage(); 121 | throw e; 122 | } 123 | return null; 124 | } 125 | 126 | } 127 | -------------------------------------------------------------------------------- /src/main/java/com/merck/rdf2x/jobs/stats/StatsConfig.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2017 Merck Sharp & Dohme Corp. a subsidiary of Merck & Co., 3 | * Inc., Kenilworth, NJ, USA. 4 | * 5 | * Licensed under the Apache License, Version 2.0 (the "License"); 6 | * you may not use this file except in compliance with the License. 7 | * You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 
16 | */ 17 | 18 | package com.merck.rdf2x.jobs.stats; 19 | 20 | import com.beust.jcommander.Parameter; 21 | import com.beust.jcommander.Parameters; 22 | import com.beust.jcommander.ParametersDelegate; 23 | import com.merck.rdf2x.processing.aggregating.InstanceAggregatorConfig; 24 | import com.merck.rdf2x.rdf.parsing.QuadParserConfig; 25 | import lombok.AllArgsConstructor; 26 | import lombok.Data; 27 | import lombok.NoArgsConstructor; 28 | import lombok.experimental.Accessors; 29 | 30 | import java.util.List; 31 | 32 | /** 33 | * StatsConfig stores all parameters used for a {@link StatsJob}. 34 | * It is annotated by jCommander Parameters used to load values from the command line. 35 | *

36 | */ 37 | @Data 38 | @Accessors(chain = true) 39 | @NoArgsConstructor 40 | @AllArgsConstructor 41 | @Parameters(commandDescription = "Compute various stats on RDF datasets") 42 | public class StatsConfig { 43 | 44 | @Parameter(names = "--input.file", description = "Path to input file or folder", required = true) 45 | private String inputFile; 46 | 47 | @Parameter(names = "--stat", description = "Stat to compute (multiple stats can be added using '--stat A --stat B')", required = true) 48 | private List stats; 49 | 50 | @Parameter(names = "--help", description = "Show usage page", help = true) 51 | private boolean help = false; 52 | 53 | @ParametersDelegate() 54 | private QuadParserConfig parserConfig = new QuadParserConfig(); 55 | 56 | @ParametersDelegate() 57 | private InstanceAggregatorConfig aggregatorConfig = new InstanceAggregatorConfig(); 58 | 59 | public enum Stat { 60 | /** 61 | * Count number of occurrences of each subject URI 62 | */ 63 | SUBJECT_URI_COUNT, 64 | 65 | /** 66 | * Count number of occurrences of each predicate URI 67 | */ 68 | PREDICATE_URI_COUNT, 69 | 70 | /** 71 | * Count number of occurrences of each object URI 72 | */ 73 | OBJECT_URI_COUNT 74 | } 75 | } 76 | -------------------------------------------------------------------------------- /src/main/java/com/merck/rdf2x/jobs/stats/StatsJob.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2017 Merck Sharp & Dohme Corp. a subsidiary of Merck & Co., 3 | * Inc., Kenilworth, NJ, USA. 4 | * 5 | * Licensed under the Apache License, Version 2.0 (the "License"); 6 | * you may not use this file except in compliance with the License. 
7 | * You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package com.merck.rdf2x.jobs.stats; 19 | 20 | import com.merck.rdf2x.rdf.parsing.ElephasQuadParser; 21 | import com.merck.rdf2x.rdf.parsing.QuadParser; 22 | import com.merck.rdf2x.stats.QuadCounter; 23 | import lombok.extern.slf4j.Slf4j; 24 | import org.apache.jena.sparql.core.Quad; 25 | import org.apache.spark.api.java.JavaPairRDD; 26 | import org.apache.spark.api.java.JavaRDD; 27 | import org.apache.spark.api.java.JavaSparkContext; 28 | import scala.Tuple2; 29 | 30 | import javax.naming.ConfigurationException; 31 | import java.util.List; 32 | 33 | import static com.merck.rdf2x.jobs.stats.StatsConfig.Stat.*; 34 | 35 | /** 36 | * StatsJob computes various stats on RDF datasets. 37 | *

38 | * The instructions are passed in a {@link StatsConfig} object. 39 | */ 40 | @Slf4j 41 | public class StatsJob implements Runnable { 42 | /** 43 | * job config with all instructions 44 | */ 45 | private final StatsConfig config; 46 | /** 47 | * Spark context to be used 48 | */ 49 | private final JavaSparkContext sc; 50 | 51 | /** 52 | * Subject URI count 53 | */ 54 | private Long subjectURICount; 55 | 56 | /** 57 | * Predicate URI count 58 | */ 59 | private Long predicateURICount; 60 | 61 | /** 62 | * Object URI count 63 | */ 64 | private Long objectURICount; 65 | 66 | public StatsJob(StatsConfig config, JavaSparkContext sc) throws ConfigurationException { 67 | this.config = config; 68 | this.sc = sc; 69 | } 70 | 71 | 72 | /** 73 | * Run the job 74 | */ 75 | @Override 76 | public void run() { 77 | 78 | // create all required processors 79 | QuadParser parser = new ElephasQuadParser( 80 | config.getParserConfig(), 81 | sc 82 | ); 83 | 84 | String inputFile = config.getInputFile(); 85 | log.info("Preparing input file: {}", inputFile); 86 | JavaRDD quads = parser.parseQuads(inputFile); 87 | log.info("Done preparing RDD of quads with {} partitions", quads.getNumPartitions()); 88 | 89 | List stats = config.getStats(); 90 | 91 | if (stats.contains(SUBJECT_URI_COUNT)) { 92 | log.info("----------------------------"); 93 | log.info("Subject URI Stats:"); 94 | JavaPairRDD counts = QuadCounter.countBySubjectURI(quads).mapToPair(Tuple2::swap); 95 | counts.sortByKey(false).take(100).forEach(uriCount -> { 96 | log.info(uriCount.toString()); 97 | }); 98 | subjectURICount = counts.count(); 99 | stats.remove(SUBJECT_URI_COUNT); 100 | } 101 | 102 | if(stats.contains(PREDICATE_URI_COUNT)) { 103 | log.info("----------------------------"); 104 | log.info("Predicate URI stats:"); 105 | JavaPairRDD counts = QuadCounter.countByPredicateURI(quads).mapToPair(Tuple2::swap); 106 | counts.sortByKey(false).take(100).forEach(uriCount -> { 107 | log.info(uriCount.toString()); 108 | }); 109 | 
predicateURICount = counts.count(); 110 | stats.remove(PREDICATE_URI_COUNT); 111 | } 112 | 113 | if(stats.contains(OBJECT_URI_COUNT)) { 114 | log.info("----------------------------"); 115 | log.info("Object URI stats:"); 116 | JavaPairRDD counts = QuadCounter.getObjectURI(quads).mapToPair(Tuple2::swap); 117 | counts.sortByKey(false).take(100).forEach( uriCount -> { 118 | log.info(uriCount.toString()); 119 | }); 120 | objectURICount = counts.count(); 121 | stats.remove(OBJECT_URI_COUNT); 122 | } 123 | 124 | printStats(); 125 | 126 | } 127 | 128 | /** 129 | * Prints stats for distinct counts of {@link Quad} properties i.e. subject, predicate and object 130 | */ 131 | private void printStats() { 132 | log.info("---------------------------------"); 133 | if(subjectURICount != null) { 134 | log.info("Total Distinct Subject URIs: {}", subjectURICount); 135 | } 136 | if(predicateURICount != null) { 137 | log.info("Total Distinct Predicate URIs: {}", predicateURICount); 138 | } 139 | if(objectURICount != null) { 140 | log.info("Total Distinct Object URIs: {}", objectURICount); 141 | } 142 | log.info("---------------------------------"); 143 | } 144 | 145 | } 146 | -------------------------------------------------------------------------------- /src/main/java/com/merck/rdf2x/main/Main.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2017 Merck Sharp & Dohme Corp. a subsidiary of Merck & Co., 3 | * Inc., Kenilworth, NJ, USA. 4 | * 5 | * Licensed under the Apache License, Version 2.0 (the "License"); 6 | * you may not use this file except in compliance with the License. 7 | * You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package com.merck.rdf2x.main; 19 | 20 | import com.merck.rdf2x.jobs.JobFactory; 21 | 22 | import javax.naming.ConfigurationException; 23 | 24 | /** 25 | * Main class used to run jobs based on command-line arguments. 26 | */ 27 | public class Main { 28 | /** 29 | * Get job based on command-line arguments and run it. 30 | * 31 | * @param args command-line arguments 32 | * @throws ConfigurationException in case config is not valid 33 | */ 34 | public static void main(String args[]) throws ConfigurationException { 35 | Runnable job = JobFactory.getJob(args); 36 | 37 | if (job == null) { 38 | return; 39 | } 40 | 41 | job.run(); 42 | } 43 | } 44 | -------------------------------------------------------------------------------- /src/main/java/com/merck/rdf2x/main/ReadmeUtils.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2017 Merck Sharp & Dohme Corp. a subsidiary of Merck & Co., 3 | * Inc., Kenilworth, NJ, USA. 4 | * 5 | * Licensed under the Apache License, Version 2.0 (the "License"); 6 | * you may not use this file except in compliance with the License. 7 | * You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 
16 | */ 17 | 18 | package com.merck.rdf2x.main; 19 | 20 | import com.beust.jcommander.Parameter; 21 | import com.beust.jcommander.ParametersDelegate; 22 | import com.merck.rdf2x.jobs.convert.ConvertConfig; 23 | 24 | import java.lang.reflect.Field; 25 | import java.util.stream.Collectors; 26 | import java.util.stream.Stream; 27 | 28 | /** 29 | * ReadmeUtils are used to generate README.md file sections, such as the config tables. 30 | */ 31 | public class ReadmeUtils { 32 | 33 | public static void main(String args[]) throws InstantiationException, IllegalAccessException { 34 | printConfig(ConvertConfig.class); 35 | } 36 | 37 | private static void printConfig(Class configClass) throws IllegalAccessException, InstantiationException { 38 | Field[] fields = configClass.getDeclaredFields(); 39 | System.out.println(); 40 | System.out.println("### " + configClass.getSimpleName()); 41 | System.out.println(); 42 | Object defaultConfig = configClass.newInstance(); 43 | 44 | System.out.println("|Name|Default|Description|"); 45 | System.out.println("|---|---|---|"); 46 | try { 47 | for (Field field : fields) { 48 | field.setAccessible(true); 49 | StringBuilder sb = new StringBuilder(); 50 | sb.append("|"); 51 | Parameter param = field.getDeclaredAnnotation(Parameter.class); 52 | 53 | if (param != null) { 54 | String names = Stream.of(param.names()) 55 | .collect(Collectors.joining(", ")); 56 | // name 57 | sb.append(names).append("|"); 58 | 59 | // default 60 | sb.append(param.required() ? 
"**required**" : field.get(defaultConfig) + " ").append("|"); 61 | 62 | // description 63 | sb.append(param.description()).append("|"); 64 | 65 | System.out.println(sb.toString()); 66 | } 67 | 68 | ParametersDelegate delegate = field.getDeclaredAnnotation(ParametersDelegate.class); 69 | 70 | if (delegate != null) { 71 | printConfig(field.getType()); 72 | } 73 | } 74 | } catch (IllegalAccessException e) { 75 | e.printStackTrace(); 76 | } 77 | } 78 | } 79 | -------------------------------------------------------------------------------- /src/main/java/com/merck/rdf2x/persistence/InstanceRelationWriterConfig.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2017 Merck Sharp & Dohme Corp. a subsidiary of Merck & Co., 3 | * Inc., Kenilworth, NJ, USA. 4 | * 5 | * Licensed under the Apache License, Version 2.0 (the "License"); 6 | * you may not use this file except in compliance with the License. 7 | * You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | package com.merck.rdf2x.persistence; 18 | 19 | import com.beust.jcommander.Parameter; 20 | import lombok.Data; 21 | import lombok.experimental.Accessors; 22 | 23 | import java.io.Serializable; 24 | 25 | /** 26 | * InstanceRelationWriterConfig stores parameters for the {@link InstanceRelationWriter}. 
 */
@Data
@Accessors(chain = true)
public class InstanceRelationWriterConfig implements Serializable {

    // prepended to every entity table name in the output
    @Parameter(names = "--formatting.entityTablePrefix", description = "String to prepend to entity table names")
    private String entityTablePrefix = "";

    // prepended to every relation table name in the output
    @Parameter(names = "--formatting.relationTablePrefix", description = "String to prepend to relation table names")
    private String relationTablePrefix = "";

    // when true, relation tables carry a third column with the predicate (relationship type)
    @Parameter(names = "--relations.storePredicate", arity = 1, description = "Store predicate (relationship type) as a third column of entity relation tables.")
    private boolean storePredicate = true;

}
--------------------------------------------------------------------------------
/src/main/java/com/merck/rdf2x/persistence/config/DbConfig.java:
--------------------------------------------------------------------------------
/*
 * Copyright 2017 Merck Sharp & Dohme Corp. a subsidiary of Merck & Co.,
 * Inc., Kenilworth, NJ, USA.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
16 | */ 17 | 18 | package com.merck.rdf2x.persistence.config; 19 | 20 | import com.beust.jcommander.Parameter; 21 | import lombok.Data; 22 | import lombok.ToString; 23 | import lombok.experimental.Accessors; 24 | 25 | import javax.naming.ConfigurationException; 26 | import java.sql.DriverManager; 27 | import java.sql.SQLException; 28 | import java.util.Properties; 29 | 30 | /** 31 | * DbConfig stores database connection parameters 32 | */ 33 | @Data 34 | @Accessors(chain = true) 35 | @ToString(exclude = "password") 36 | public class DbConfig { 37 | @Parameter(names = "--db.url", description = "Database JDBC string") 38 | private String url; 39 | 40 | @Parameter(names = "--db.user", description = "Database user") 41 | private String user; 42 | 43 | @Parameter(names = "--db.password", description = "Database password") 44 | private String password; 45 | 46 | @Parameter(names = "--db.schema", description = "Database schema name") 47 | private String schema = null; 48 | 49 | @Parameter(names = "--db.batchSize", description = "Insert batch size (number of batched insert statements or number of lines in a CSV batch)") 50 | private Integer batchSize = 5000; 51 | 52 | @Parameter(names = "--db.bulkLoad", arity = 1, description = "Use CSV bulk load if possible (PostgreSQL COPY)") 53 | private boolean bulkLoad = true; 54 | 55 | /** 56 | * Validate the config, throw {@link ConfigurationException} on error 57 | * 58 | * @throws ConfigurationException found error 59 | */ 60 | public void validate() throws ConfigurationException { 61 | if (url == null) { 62 | throw new ConfigurationException("Specify JDBC url with --db.url"); 63 | } 64 | if (user == null) { 65 | throw new ConfigurationException("Specify DB user with --db.user"); 66 | } 67 | getDriverClassName(); 68 | } 69 | 70 | /** 71 | * Prepare Properties object for JDBC Connector 72 | * 73 | * @return prepared Properties object for JDBC Connector 74 | */ 75 | public Properties getProperties() { 76 | Properties dbProperties = 
new Properties(); 77 | 78 | try { 79 | // specify driver class name, required by Spark to register it on all executors 80 | dbProperties.put("driver", getDriverClassName()); 81 | } catch (ConfigurationException ignored) { 82 | // already checked during validation 83 | } 84 | dbProperties.put("tcpKeepAlive", "true"); 85 | dbProperties.put("connectTimeout", "0"); 86 | dbProperties.put("socketTimeout", "0"); 87 | dbProperties.setProperty("user", user); 88 | dbProperties.setProperty("password", password); 89 | dbProperties.setProperty("batchsize", batchSize.toString()); 90 | if (schema != null) { 91 | dbProperties.put("searchpath", schema); 92 | dbProperties.put("currentSchema", schema); 93 | } 94 | return dbProperties; 95 | } 96 | 97 | public String getDriverClassName() throws ConfigurationException { 98 | try { 99 | return DriverManager.getDriver(url).getClass().getName(); 100 | } catch (SQLException e) { 101 | throw new ConfigurationException("Driver not found for JDBC url '" + url + "', probably due to missing dependencies"); 102 | } 103 | } 104 | } 105 | -------------------------------------------------------------------------------- /src/main/java/com/merck/rdf2x/persistence/config/ElasticSearchConfig.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2017 Merck Sharp & Dohme Corp. a subsidiary of Merck & Co., 3 | * Inc., Kenilworth, NJ, USA. 4 | * 5 | * Licensed under the Apache License, Version 2.0 (the "License"); 6 | * you may not use this file except in compliance with the License. 7 | * You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package com.merck.rdf2x.persistence.config; 19 | 20 | import com.beust.jcommander.Parameter; 21 | import lombok.Data; 22 | import lombok.experimental.Accessors; 23 | 24 | import javax.naming.ConfigurationException; 25 | import java.util.HashMap; 26 | import java.util.Map; 27 | 28 | /** 29 | * EsConfig stores properties for saving to ElasticSearch 30 | */ 31 | @Data 32 | @Accessors(chain = true) 33 | public class ElasticSearchConfig { 34 | @Parameter(names = "--es.index", description = "ElasticSearch Index to save the output to", required = false) 35 | private String index; 36 | 37 | @Parameter(names = "--es.createIndex", arity = 1, description = "Whether to create index in case it does not exist, overrides es.index.auto.create property", required = false) 38 | private Boolean createIndex = true; 39 | 40 | /** 41 | * Validate the config, throw {@link ConfigurationException} on error 42 | * 43 | * @throws ConfigurationException found error 44 | */ 45 | public void validate() throws ConfigurationException { 46 | if (index == null) { 47 | throw new ConfigurationException("Specify the ElasticSearch output index with --es.index"); 48 | } 49 | } 50 | 51 | public Map getProperties(String name) { 52 | Map properties = new HashMap<>(); 53 | 54 | properties.put("es.resource", index + "/" + name); 55 | if (createIndex != null) { 56 | properties.put("es.index.auto.create", createIndex.toString()); 57 | } 58 | 59 | return properties; 60 | } 61 | } 62 | -------------------------------------------------------------------------------- /src/main/java/com/merck/rdf2x/persistence/config/FileConfig.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2017 Merck Sharp & Dohme Corp. a subsidiary of Merck & Co., 3 | * Inc., Kenilworth, NJ, USA. 
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package com.merck.rdf2x.persistence.config;

import com.beust.jcommander.Parameter;
import lombok.AllArgsConstructor;
import lombok.Data;
import lombok.NoArgsConstructor;
import lombok.experimental.Accessors;

import javax.naming.ConfigurationException;
import java.io.File;

/**
 * FileConfig stores output path for outputting files to disk
 */
@Data
@Accessors(chain = true)
@NoArgsConstructor
@AllArgsConstructor
public class FileConfig {
    @Parameter(names = "--output.folder", description = "Folder to output the files to", required = false)
    private String outputFolder;

    /**
     * Validate the config, throw {@link ConfigurationException} on error
     *
     * @throws ConfigurationException found error
     */
    public void validate() throws ConfigurationException {
        if (outputFolder == null) {
            throw new ConfigurationException("Specify the output folder with --output.folder");
        }
        // a non-existent path is accepted (it can be created later), but an existing file cannot serve as a folder
        File f = new File(outputFolder);
        if (f.exists() && !f.isDirectory()) {
            throw new ConfigurationException("The output path is not a folder: " + outputFolder);
        }
    }
}
--------------------------------------------------------------------------------
/src/main/java/com/merck/rdf2x/persistence/output/CSVPersistor.java:
-------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2017 Merck Sharp & Dohme Corp. a subsidiary of Merck & Co., 3 | * Inc., Kenilworth, NJ, USA. 4 | * 5 | * Licensed under the Apache License, Version 2.0 (the "License"); 6 | * you may not use this file except in compliance with the License. 7 | * You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package com.merck.rdf2x.persistence.output; 19 | 20 | import com.merck.rdf2x.persistence.config.FileConfig; 21 | import lombok.RequiredArgsConstructor; 22 | import lombok.extern.slf4j.Slf4j; 23 | import org.apache.spark.sql.DataFrame; 24 | import org.apache.spark.sql.SaveMode; 25 | 26 | import java.nio.file.Paths; 27 | 28 | /** 29 | * CSVPersistor persists dataframes to CSV files 30 | */ 31 | @Slf4j 32 | @RequiredArgsConstructor 33 | public class CSVPersistor implements Persistor { 34 | 35 | /** 36 | * config storing output location 37 | */ 38 | private final FileConfig config; 39 | 40 | /** 41 | * save mode defines what to do when file exists 42 | */ 43 | private final SaveMode saveMode; 44 | 45 | /** 46 | * Write a {@link DataFrame} to the specified output 47 | * 48 | * @param name name of output table 49 | * @param df dataframe containing the data 50 | */ 51 | @Override 52 | public void writeDataFrame(String name, DataFrame df) { 53 | String outputFolder = config.getOutputFolder(); 54 | String outputPath = Paths.get(outputFolder, name).toString(); 55 | log.info("Writing CSV files to folder {}", outputPath); 56 | df.write().mode(saveMode) 57 | 
.format("com.databricks.spark.csv") 58 | .option("header", "true") 59 | .save(outputPath); 60 | } 61 | } 62 | -------------------------------------------------------------------------------- /src/main/java/com/merck/rdf2x/persistence/output/DataFrameMapPersistor.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2017 Merck Sharp & Dohme Corp. a subsidiary of Merck & Co., 3 | * Inc., Kenilworth, NJ, USA. 4 | * 5 | * Licensed under the Apache License, Version 2.0 (the "License"); 6 | * you may not use this file except in compliance with the License. 7 | * You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 
16 | */ 17 | 18 | package com.merck.rdf2x.persistence.output; 19 | 20 | import org.apache.spark.sql.DataFrame; 21 | 22 | import java.util.HashMap; 23 | import java.util.Map; 24 | 25 | /** 26 | * DataFrameMapPersistor persists dataframes directly to a map 27 | */ 28 | public class DataFrameMapPersistor implements Persistor { 29 | 30 | /** 31 | * Map of DataFrames to use with DataFrameMap output mode 32 | */ 33 | private final Map resultMap; 34 | 35 | public DataFrameMapPersistor() { 36 | this(new HashMap<>()); 37 | } 38 | 39 | public DataFrameMapPersistor(Map resultMap) { 40 | this.resultMap = resultMap; 41 | } 42 | 43 | /** 44 | * Write a {@link DataFrame} to the specified output 45 | * 46 | * @param name name of output table 47 | * @param df dataframe containing the data 48 | */ 49 | @Override 50 | public void writeDataFrame(String name, DataFrame df) { 51 | resultMap.put(name, df); 52 | } 53 | 54 | public Map getResultMap() { 55 | return resultMap; 56 | } 57 | } 58 | -------------------------------------------------------------------------------- /src/main/java/com/merck/rdf2x/persistence/output/DbPersistorSQLServer.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2017 Merck Sharp & Dohme Corp. a subsidiary of Merck & Co., 3 | * Inc., Kenilworth, NJ, USA. 4 | * 5 | * Licensed under the Apache License, Version 2.0 (the "License"); 6 | * you may not use this file except in compliance with the License. 7 | * You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 
16 | */ 17 | 18 | package com.merck.rdf2x.persistence.output; 19 | 20 | import com.merck.rdf2x.persistence.config.DbConfig; 21 | import lombok.extern.slf4j.Slf4j; 22 | import org.apache.spark.sql.DataFrame; 23 | import org.apache.spark.sql.SaveMode; 24 | import org.apache.spark.sql.types.DataTypes; 25 | import org.apache.spark.sql.types.StructField; 26 | 27 | @Slf4j 28 | public class DbPersistorSQLServer extends DbPersistor { 29 | 30 | private static final String TMP_SUFFIX = "__RDF2X_TMP_SUFFIX"; 31 | 32 | public DbPersistorSQLServer(DbConfig config, SaveMode saveMode) { 33 | super(config, saveMode); 34 | } 35 | 36 | @Override 37 | public void writeDataFrame(String name, DataFrame df) { 38 | for (StructField field : df.schema().fields()) { 39 | String column = field.name(); 40 | // convert booleans to integers to avoid error in Spark 1.6.2 41 | // "Cannot specify a column width on data type bit." 42 | if (field.dataType() == DataTypes.BooleanType) { 43 | df = df.withColumn(column + TMP_SUFFIX, df.col(column).cast(DataTypes.IntegerType)) 44 | .drop(column) 45 | .withColumnRenamed(column + TMP_SUFFIX, column); 46 | } 47 | } 48 | super.writeDataFrame(name, df); 49 | } 50 | 51 | } 52 | -------------------------------------------------------------------------------- /src/main/java/com/merck/rdf2x/persistence/output/ElasticSearchPersistor.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2017 Merck Sharp & Dohme Corp. a subsidiary of Merck & Co., 3 | * Inc., Kenilworth, NJ, USA. 4 | * 5 | * Licensed under the Apache License, Version 2.0 (the "License"); 6 | * you may not use this file except in compliance with the License. 
7 | * You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package com.merck.rdf2x.persistence.output; 19 | 20 | import com.merck.rdf2x.persistence.config.ElasticSearchConfig; 21 | import lombok.RequiredArgsConstructor; 22 | import lombok.extern.slf4j.Slf4j; 23 | import org.apache.spark.sql.DataFrame; 24 | import org.elasticsearch.spark.sql.api.java.JavaEsSparkSQL; 25 | 26 | import java.util.Map; 27 | 28 | /** 29 | * ElasticSearchPersistor persists dataframes to Elasticsearch 30 | */ 31 | @Slf4j 32 | @RequiredArgsConstructor 33 | public class ElasticSearchPersistor implements Persistor { 34 | 35 | private final ElasticSearchConfig config; 36 | 37 | /** 38 | * Write a {@link DataFrame} to the specified output 39 | * 40 | * @param name name of output table 41 | * @param df dataframe containing the data 42 | */ 43 | @Override 44 | public void writeDataFrame(String name, DataFrame df) { 45 | Map props = config.getProperties(name); 46 | log.info("Writing to ElasticSearch: {}", props); 47 | JavaEsSparkSQL.saveToEs(df, props); 48 | } 49 | } 50 | -------------------------------------------------------------------------------- /src/main/java/com/merck/rdf2x/persistence/output/JSONPersistor.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2017 Merck Sharp & Dohme Corp. a subsidiary of Merck & Co., 3 | * Inc., Kenilworth, NJ, USA. 4 | * 5 | * Licensed under the Apache License, Version 2.0 (the "License"); 6 | * you may not use this file except in compliance with the License. 
7 | * You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package com.merck.rdf2x.persistence.output; 19 | 20 | import com.merck.rdf2x.persistence.config.FileConfig; 21 | import lombok.RequiredArgsConstructor; 22 | import lombok.extern.slf4j.Slf4j; 23 | import org.apache.spark.sql.DataFrame; 24 | import org.apache.spark.sql.SaveMode; 25 | 26 | import java.nio.file.Paths; 27 | 28 | /** 29 | * JSONPersistor persists dataframes as JSON files 30 | */ 31 | @Slf4j 32 | @RequiredArgsConstructor 33 | public class JSONPersistor implements Persistor { 34 | 35 | private final FileConfig config; 36 | 37 | private final SaveMode saveMode; 38 | 39 | /** 40 | * Write a {@link DataFrame} to the specified output 41 | * 42 | * @param name name of output table 43 | * @param df dataframe containing the data 44 | */ 45 | @Override 46 | public void writeDataFrame(String name, DataFrame df) { 47 | String outputFolder = config.getOutputFolder(); 48 | String outputPath = Paths.get(outputFolder, name).toString(); 49 | log.info("Writing JSON files to folder {}", outputPath); 50 | df.write().mode(saveMode).json(outputPath); 51 | } 52 | } 53 | -------------------------------------------------------------------------------- /src/main/java/com/merck/rdf2x/persistence/output/PersistException.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2017 Merck Sharp & Dohme Corp. a subsidiary of Merck & Co., 3 | * Inc., Kenilworth, NJ, USA. 
/**
 * Exception thrown when persisting dataframes and creating indexes.
 * <p>
 * Checked, because persistence failures are expected recoverable conditions
 * the caller decides how to report.
 */
public class PersistException extends Exception {
    public PersistException() {
        super();
    }

    /**
     * @param message description of the persistence failure
     */
    // added: the message-only constructor was missing, forcing callers to
    // attach a dummy cause just to report a plain error message
    public PersistException(String message) {
        super(message);
    }

    /**
     * @param cause underlying failure
     */
    public PersistException(Throwable cause) {
        super(cause);
    }

    /**
     * @param message description of the persistence failure
     * @param cause   underlying failure
     */
    public PersistException(String message, Throwable cause) {
        super(message, cause);
    }
}
16 | */ 17 | 18 | package com.merck.rdf2x.persistence.output; 19 | 20 | import org.apache.spark.sql.DataFrame; 21 | import scala.Tuple2; 22 | import scala.Tuple4; 23 | 24 | import java.util.Collection; 25 | import java.util.Collections; 26 | 27 | /** 28 | * OutputWriter writes {@link DataFrame}s in various output targets. 29 | */ 30 | public interface Persistor { 31 | 32 | /** 33 | * Write a {@link DataFrame} to the specified output 34 | * 35 | * @param name name of output table 36 | * @param df dataframe containing the data 37 | */ 38 | void writeDataFrame(String name, DataFrame df); 39 | 40 | /** 41 | * Create index on specified columns (if applicable for given output) 42 | * 43 | * @param tableColumnPairs collection of table, column pairs to create index for 44 | */ 45 | default void createIndexes(Collection> tableColumnPairs) { 46 | } 47 | 48 | /** 49 | * Create primary keys on specified columns (if applicable for given output) 50 | * 51 | * @param tableColumnPairs collection of table, column pairs to create primary keys for 52 | */ 53 | default void createPrimaryKeys(Collection> tableColumnPairs) { 54 | } 55 | 56 | /** 57 | * Create foreign keys on specified columns (if applicable for given output) 58 | * 59 | * @param tableColumnPairs collection of (fromTableName, fromTableColumn, toTableName, toTableColumn) tuples 60 | */ 61 | default void createForeignKeys(Collection> tableColumnPairs) { 62 | } 63 | 64 | /** 65 | * Get reserved names that should not be used (case insensitive) 66 | * 67 | * @return reserved names that should not be used (case insensitive) 68 | */ 69 | default Collection getReservedNames() { 70 | return Collections.emptySet(); 71 | } 72 | } 73 | -------------------------------------------------------------------------------- /src/main/java/com/merck/rdf2x/persistence/output/PersistorFactory.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2017 Merck Sharp & Dohme Corp. 
a subsidiary of Merck & Co., 3 | * Inc., Kenilworth, NJ, USA. 4 | * 5 | * Licensed under the Apache License, Version 2.0 (the "License"); 6 | * you may not use this file except in compliance with the License. 7 | * You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package com.merck.rdf2x.persistence.output; 19 | 20 | import com.merck.rdf2x.persistence.config.OutputConfig; 21 | import lombok.extern.slf4j.Slf4j; 22 | import org.apache.commons.lang3.NotImplementedException; 23 | 24 | import javax.naming.ConfigurationException; 25 | 26 | /** 27 | * PersistorFactory creates {@link Persistor}s based on specified {@link OutputConfig} 28 | */ 29 | @Slf4j 30 | public class PersistorFactory { 31 | 32 | public static Persistor createPersistor(OutputConfig config) { 33 | OutputConfig.OutputTarget target = config.getTarget(); 34 | switch (target) { 35 | case DB: 36 | try { 37 | String driverClassName = config.getDbConfig().getDriverClassName(); 38 | switch (driverClassName) { 39 | case "org.postgresql.Driver": 40 | return new DbPersistorPostgres(config.getDbConfig(), config.getSaveMode()); 41 | case "com.microsoft.sqlserver.jdbc.SQLServerDriver": 42 | return new DbPersistorSQLServer(config.getDbConfig(), config.getSaveMode()); 43 | } 44 | } catch (ConfigurationException ignored) { 45 | } 46 | return new DbPersistor(config.getDbConfig(), config.getSaveMode()); 47 | case CSV: 48 | return new CSVPersistor(config.getFileConfig(), config.getSaveMode()); 49 | case JSON: 50 | return new JSONPersistor(config.getFileConfig(), config.getSaveMode()); 51 | case ES: 52 | return new 
ElasticSearchPersistor(config.getEsConfig()); 53 | case Preview: 54 | return new PreviewPersistor(); 55 | case DataFrameMap: 56 | return new DataFrameMapPersistor(config.getResultMap()); 57 | default: 58 | throw new NotImplementedException("Output not supported: " + config); 59 | } 60 | } 61 | } 62 | -------------------------------------------------------------------------------- /src/main/java/com/merck/rdf2x/persistence/output/PreviewPersistor.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2017 Merck Sharp & Dohme Corp. a subsidiary of Merck & Co., 3 | * Inc., Kenilworth, NJ, USA. 4 | * 5 | * Licensed under the Apache License, Version 2.0 (the "License"); 6 | * you may not use this file except in compliance with the License. 7 | * You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 
16 | */ 17 | 18 | package com.merck.rdf2x.persistence.output; 19 | 20 | import lombok.extern.slf4j.Slf4j; 21 | import org.apache.spark.sql.DataFrame; 22 | 23 | /** 24 | * PreviewPersistor does not persist the dataframe, just shows the first rows 25 | */ 26 | @Slf4j 27 | public class PreviewPersistor implements Persistor { 28 | /** 29 | * Write a {@link DataFrame} to the specified output 30 | * 31 | * @param name name of output table 32 | * @param df dataframe containing the data 33 | */ 34 | @Override 35 | public void writeDataFrame(String name, DataFrame df) { 36 | log.info(name); 37 | log.info(df.showString(10, true)); 38 | } 39 | } 40 | -------------------------------------------------------------------------------- /src/main/java/com/merck/rdf2x/persistence/schema/EntityColumn.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2017 Merck Sharp & Dohme Corp. a subsidiary of Merck & Co., 3 | * Inc., Kenilworth, NJ, USA. 4 | * 5 | * Licensed under the Apache License, Version 2.0 (the "License"); 6 | * you may not use this file except in compliance with the License. 7 | * You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package com.merck.rdf2x.persistence.schema; 19 | 20 | import com.merck.rdf2x.beans.Predicate; 21 | import lombok.Data; 22 | import lombok.NonNull; 23 | 24 | import java.io.Serializable; 25 | 26 | /** 27 | * EntityColumn defines structure of a column in an {@link EntityTable}. 28 | *

29 | * It is defined by a {@link Predicate} and a column name. 30 | */ 31 | @Data 32 | public class EntityColumn implements Serializable { 33 | 34 | /** 35 | * the full name of this column 36 | */ 37 | private final String name; 38 | /** 39 | * name of superclass column this column is substituted by, in the form of 'table.column'. 40 | * Null if column is stored directly in its table. 41 | */ 42 | private String storedInSuperclassColumn = null; 43 | /** 44 | * the property stored by this column 45 | */ 46 | private final EntityProperty property; 47 | 48 | /** 49 | * @param name the full name of this column 50 | * @param property the property stored by this column 51 | */ 52 | public EntityColumn(@NonNull String name, EntityProperty property) { 53 | this.name = name; 54 | this.property = property; 55 | } 56 | 57 | 58 | @Override 59 | public String toString() { 60 | return name + "(" + 61 | property + 62 | (storedInSuperclassColumn == null ? "" : ", Stored in " + storedInSuperclassColumn) + 63 | ")"; 64 | } 65 | 66 | } 67 | -------------------------------------------------------------------------------- /src/main/java/com/merck/rdf2x/persistence/schema/EntityProperty.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2017 Merck Sharp & Dohme Corp. a subsidiary of Merck & Co., 3 | * Inc., Kenilworth, NJ, USA. 4 | * 5 | * Licensed under the Apache License, Version 2.0 (the "License"); 6 | * you may not use this file except in compliance with the License. 7 | * You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 
16 | */ 17 | 18 | package com.merck.rdf2x.persistence.schema; 19 | 20 | import com.merck.rdf2x.beans.Predicate; 21 | import lombok.Data; 22 | import lombok.NonNull; 23 | 24 | /** 25 | * EntityProperty stores a literal property of a specific entity. Can be saved in a column or the EAV table. 26 | */ 27 | @Data 28 | public class EntityProperty { 29 | /** 30 | * the predicate stored by this property 31 | */ 32 | @NonNull 33 | private final Predicate predicate; 34 | /** 35 | * whether the property has multiple values 36 | */ 37 | private final boolean multivalued; 38 | /** 39 | * ratio of non-null values in this property 40 | */ 41 | private final Double nonNullFraction; 42 | 43 | 44 | @Override 45 | public String toString() { 46 | return predicate + 47 | (multivalued ? ",Multivalued" : "") + 48 | (nonNullFraction == null ? "" : "," + (int) (nonNullFraction * 100) + "% non-null"); 49 | } 50 | } 51 | -------------------------------------------------------------------------------- /src/main/java/com/merck/rdf2x/persistence/schema/EntitySchema.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2017 Merck Sharp & Dohme Corp. a subsidiary of Merck & Co., 3 | * Inc., Kenilworth, NJ, USA. 4 | * 5 | * Licensed under the Apache License, Version 2.0 (the "License"); 6 | * you may not use this file except in compliance with the License. 7 | * You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 
16 | */ 17 | 18 | package com.merck.rdf2x.persistence.schema; 19 | 20 | import lombok.Data; 21 | 22 | import java.io.Serializable; 23 | import java.util.List; 24 | import java.util.Map; 25 | 26 | /** 27 | * EntitySchema defines a sorted set of {@link EntityTable}s to be persisted in a file or database. 28 | */ 29 | @Data 30 | public class EntitySchema implements Serializable { 31 | 32 | /** 33 | * List of tables in this schema. 34 | */ 35 | private final List tables; 36 | 37 | /** 38 | * Map of type URI -> unique type name 39 | */ 40 | private final Map tableNames; 41 | 42 | /** 43 | * Map of predicate URI -> unique property name 44 | */ 45 | private final Map propertyNames; 46 | } 47 | -------------------------------------------------------------------------------- /src/main/java/com/merck/rdf2x/persistence/schema/EntityTable.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2017 Merck Sharp & Dohme Corp. a subsidiary of Merck & Co., 3 | * Inc., Kenilworth, NJ, USA. 4 | * 5 | * Licensed under the Apache License, Version 2.0 (the "License"); 6 | * you may not use this file except in compliance with the License. 7 | * You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 
16 | */ 17 | 18 | package com.merck.rdf2x.persistence.schema; 19 | 20 | import com.merck.rdf2x.beans.Instance; 21 | import lombok.Data; 22 | 23 | import java.io.Serializable; 24 | import java.util.List; 25 | import java.util.Set; 26 | 27 | /** 28 | * EntityTable represents the structure of a single entity table of {@link Instance}s to be persisted in a database or file. 29 | *

30 | * It is defined by a name, type URI and a sorted set of {@link EntityColumn}s. 31 | *

32 | * It does NOT define how to store an ID or an URI of a {@link Instance}, this is handled later by persistors. 33 | */ 34 | @Data 35 | public class EntityTable implements Serializable { 36 | /** 37 | * name of the table 38 | */ 39 | private final String name; 40 | /** 41 | * URI of the type stored by this table 42 | */ 43 | private final String typeURI; 44 | /** 45 | * Number of rows stored by this table 46 | */ 47 | private final Long numRows; 48 | /** 49 | * list of table columns 50 | */ 51 | private final List columns; 52 | 53 | /** 54 | * properties to be stored in Entity-Attribute-Value table 55 | */ 56 | private final Set attributes; 57 | 58 | } 59 | -------------------------------------------------------------------------------- /src/main/java/com/merck/rdf2x/persistence/schema/RelationEntityFilter.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2017 Merck Sharp & Dohme Corp. a subsidiary of Merck & Co., 3 | * Inc., Kenilworth, NJ, USA. 4 | * 5 | * Licensed under the Apache License, Version 2.0 (the "License"); 6 | * you may not use this file except in compliance with the License. 7 | * You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package com.merck.rdf2x.persistence.schema; 19 | 20 | import lombok.Data; 21 | import lombok.RequiredArgsConstructor; 22 | 23 | /** 24 | * RelationEntityFilter defines a group of relations between two entity types. 
/**
 * RelationEntityFilter defines a group of relations between two entity types,
 * identified by the (index, table name) of the source and target type.
 */
@Data
@RequiredArgsConstructor
public class RelationEntityFilter {

    /**
     * index of the source type
     */
    private final Integer fromTypeIndex;

    /**
     * table name of the source type
     */
    private final String fromTypeName;

    /**
     * index of the target type
     */
    private final Integer toTypeIndex;

    /**
     * table name of the target type
     */
    private final String toTypeName;

}
/**
 * RelationPredicateFilter defines relations of a single predicate:
 * a relation table carrying this filter stores only relations with the given predicate URI.
 */
@Data
@RequiredArgsConstructor
public class RelationPredicateFilter {

    /**
     * URI of the relation predicate stored in this table
     */
    private final String predicateURI;

}
35 | */ 36 | private final TreeSet tables; 37 | 38 | /** 39 | * @param tables Set of tables in this schema, will be copied and sorted by table name 40 | */ 41 | public RelationSchema(Collection tables) { 42 | this.tables = new TreeSet<>(tables); 43 | } 44 | 45 | /** 46 | * Create single table schema 47 | * 48 | * @param singleRelationTableName name of the single table 49 | */ 50 | public RelationSchema(String singleRelationTableName) { 51 | this.tables = new TreeSet<>(); 52 | this.tables.add(new RelationTable(singleRelationTableName, RelationSchemaStrategy.SingleTable)); 53 | } 54 | 55 | } 56 | -------------------------------------------------------------------------------- /src/main/java/com/merck/rdf2x/persistence/schema/RelationTable.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2017 Merck Sharp & Dohme Corp. a subsidiary of Merck & Co., 3 | * Inc., Kenilworth, NJ, USA. 4 | * 5 | * Licensed under the Apache License, Version 2.0 (the "License"); 6 | * you may not use this file except in compliance with the License. 7 | * You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package com.merck.rdf2x.persistence.schema; 19 | 20 | import com.merck.rdf2x.processing.schema.RelationSchemaStrategy; 21 | import lombok.Data; 22 | import lombok.experimental.Accessors; 23 | 24 | import java.io.Serializable; 25 | 26 | /** 27 | * RelationTable represents the structure of a single relation table to be persisted in a database or file. 
28 | */ 29 | @Data 30 | @Accessors(chain = true) 31 | public class RelationTable implements Comparable, Serializable { 32 | 33 | /** 34 | * name of the table 35 | */ 36 | private final String name; 37 | 38 | /** 39 | * Used strategy of storing relations 40 | */ 41 | private final RelationSchemaStrategy strategy; 42 | 43 | /** 44 | * Filter that defines that this table stores relations of a given predicate, null if all predicates are stored 45 | */ 46 | private RelationPredicateFilter predicateFilter = null; 47 | 48 | /** 49 | * Filter that defines that this table stores relations between two entities, null if relations of all entities are stored 50 | */ 51 | private RelationEntityFilter entityFilter = null; 52 | 53 | public boolean isSingleTable() { 54 | return this.strategy == RelationSchemaStrategy.SingleTable; 55 | } 56 | 57 | @Override 58 | public int compareTo(RelationTable o) { 59 | return name.compareTo(o.name); 60 | } 61 | 62 | } 63 | -------------------------------------------------------------------------------- /src/main/java/com/merck/rdf2x/processing/aggregating/InstanceAggregatorConfig.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2017 Merck Sharp & Dohme Corp. a subsidiary of Merck & Co., 3 | * Inc., Kenilworth, NJ, USA. 4 | * 5 | * Licensed under the Apache License, Version 2.0 (the "License"); 6 | * you may not use this file except in compliance with the License. 7 | * You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 
16 | */ 17 | package com.merck.rdf2x.processing.aggregating; 18 | 19 | import com.beust.jcommander.Parameter; 20 | import com.merck.rdf2x.persistence.InstanceRelationWriter; 21 | import lombok.Data; 22 | import lombok.experimental.Accessors; 23 | 24 | import java.io.Serializable; 25 | 26 | /** 27 | * InstanceAggregatorConfig stores parameters for the {@link InstanceAggregator}. 28 | */ 29 | @Data 30 | @Accessors(chain = true) 31 | public class InstanceAggregatorConfig implements Serializable { 32 | 33 | @Parameter(names = "--instances.defaultLanguage", description = "Consider all values in this language as if no language is specified. Language suffix will not be added to columns.") 34 | private String defaultLanguage = null; 35 | 36 | @Parameter(names = "--instances.addSuperTypes", arity = 1, description = "Automatically add all supertypes to each instance, instance will be persisted in all parent type tables.") 37 | private boolean addSuperTypes = true; 38 | 39 | } 40 | -------------------------------------------------------------------------------- /src/main/java/com/merck/rdf2x/processing/filtering/InstanceFilter.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2017 Merck Sharp & Dohme Corp. a subsidiary of Merck & Co., 3 | * Inc., Kenilworth, NJ, USA. 4 | * 5 | * Licensed under the Apache License, Version 2.0 (the "License"); 6 | * you may not use this file except in compliance with the License. 7 | * You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License.
16 | */ 17 | 18 | package com.merck.rdf2x.processing.filtering; 19 | 20 | import com.google.common.collect.Sets; 21 | import com.merck.rdf2x.beans.IndexMap; 22 | import com.merck.rdf2x.beans.Instance; 23 | import lombok.RequiredArgsConstructor; 24 | import lombok.extern.slf4j.Slf4j; 25 | import org.apache.spark.api.java.JavaRDD; 26 | 27 | import java.util.Collections; 28 | import java.util.Set; 29 | import java.util.stream.Collectors; 30 | 31 | /** 32 | * InstanceFilter filters a RDD of {@link Instance}s based on specified config. 33 | */ 34 | @Slf4j 35 | @RequiredArgsConstructor 36 | public class InstanceFilter { 37 | 38 | private final InstanceFilterConfig config; 39 | 40 | /** 41 | * filter RDD of {@link Instance}s based on the specified config 42 | * 43 | * @param instances RDD of instances to filter 44 | * @param typeIndex index mapping type URIs to integers 45 | * @return filtered RDD of instances 46 | */ 47 | public JavaRDD filter(JavaRDD instances, IndexMap typeIndex) { 48 | if (config.getTypes().isEmpty()) { 49 | return instances; 50 | } 51 | // get indexes of accepted type URIs 52 | Set acceptedTypes = config.getTypes().stream() 53 | .map(typeIndex::getIndex) 54 | .collect(Collectors.toSet()); 55 | 56 | instances = instances.filter(instance -> !Collections.disjoint(instance.getTypes(), acceptedTypes)); 57 | 58 | if (config.isIgnoreOtherTypes()) { 59 | // remove other than accepted types from each instance 60 | instances = instances.map(instance -> { 61 | Set intersect = Sets.intersection(instance.getTypes(), acceptedTypes).immutableCopy(); 62 | instance.getTypes().clear(); 63 | instance.getTypes().addAll(intersect); 64 | return instance; 65 | }); 66 | } 67 | 68 | return instances; 69 | } 70 | } 71 | -------------------------------------------------------------------------------- /src/main/java/com/merck/rdf2x/processing/filtering/InstanceFilterConfig.java: -------------------------------------------------------------------------------- 1 | /* 2 | * 
Copyright 2017 Merck Sharp & Dohme Corp. a subsidiary of Merck & Co., 3 | * Inc., Kenilworth, NJ, USA. 4 | * 5 | * Licensed under the Apache License, Version 2.0 (the "License"); 6 | * you may not use this file except in compliance with the License. 7 | * You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | package com.merck.rdf2x.processing.filtering; 18 | 19 | import com.beust.jcommander.Parameter; 20 | import lombok.Data; 21 | import lombok.experimental.Accessors; 22 | 23 | import java.io.Serializable; 24 | import java.util.ArrayList; 25 | import java.util.List; 26 | 27 | /** 28 | * InstanceFilterConfig stores all information for the {@link InstanceFilter}. 29 | */ 30 | @Data 31 | @Accessors(chain = true) 32 | public class InstanceFilterConfig implements Serializable { 33 | 34 | @Parameter(names = "--filter.type", description = "Accept only resources of specified type. More type URIs can be specified by repeating this parameter.") 35 | private List types = new ArrayList<>(); 36 | 37 | @Parameter(names = "--filter.ignoreOtherTypes", description = "Whether to ignore instance types that were not selected. If true, only the tables for the specified types are created. 
If false, all of the additional types and supertypes of selected instances are considered as well.") 38 | private boolean ignoreOtherTypes = true; 39 | 40 | } 41 | -------------------------------------------------------------------------------- /src/main/java/com/merck/rdf2x/processing/filtering/QuadFilter.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2017 Merck Sharp & Dohme Corp. a subsidiary of Merck & Co., 3 | * Inc., Kenilworth, NJ, USA. 4 | * 5 | * Licensed under the Apache License, Version 2.0 (the "License"); 6 | * you may not use this file except in compliance with the License. 7 | * You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package com.merck.rdf2x.processing.filtering; 19 | 20 | import com.merck.rdf2x.rdf.QuadUtils; 21 | import lombok.RequiredArgsConstructor; 22 | import lombok.extern.slf4j.Slf4j; 23 | import org.apache.jena.sparql.core.Quad; 24 | import org.apache.spark.api.java.JavaRDD; 25 | 26 | import java.util.HashSet; 27 | import java.util.List; 28 | import java.util.Set; 29 | 30 | /** 31 | * QuadFilter filters a RDD of {@link Quad}s based on a specified config. 
32 | */ 33 | @Slf4j 34 | @RequiredArgsConstructor 35 | public class QuadFilter { 36 | 37 | private final QuadFilterConfig config; 38 | 39 | /** 40 | * filter RDD of {@link Quad}s based on the specified config 41 | * 42 | * @param quads RDD of quads to filter 43 | * @return filtered RDD of quads 44 | */ 45 | public JavaRDD filter(JavaRDD quads) { 46 | Set subjectBlacklist = new HashSet<>(config.getResourceBlacklist()); 47 | if (config.getResources().isEmpty()) { 48 | return QuadUtils.filterQuadsByForbiddenSubjects(quads, subjectBlacklist); 49 | } 50 | log.info("Filtering quads"); 51 | Set subjects = new HashSet<>(config.getResources()); 52 | boolean directed = config.isDirected(); 53 | for (int d = 0; d < config.getRelatedDepth(); d++) { 54 | log.info("Depth {}, collecting neighbors of {} resources", d, subjects.size()); 55 | List neighbors = QuadUtils.getNeighborResources(quads, subjects, directed).collect(); 56 | subjects.addAll(neighbors); 57 | subjects.removeAll(subjectBlacklist); 58 | } 59 | log.info("Filtering on an in-memory set of {} subjects", subjects.size()); 60 | quads = QuadUtils.filterQuadsByAllowedSubjects(quads, subjects); 61 | return quads; 62 | } 63 | } 64 | -------------------------------------------------------------------------------- /src/main/java/com/merck/rdf2x/processing/filtering/QuadFilterConfig.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2017 Merck Sharp & Dohme Corp. a subsidiary of Merck & Co., 3 | * Inc., Kenilworth, NJ, USA. 4 | * 5 | * Licensed under the Apache License, Version 2.0 (the "License"); 6 | * you may not use this file except in compliance with the License. 
7 | * You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | package com.merck.rdf2x.processing.filtering; 18 | 19 | import com.beust.jcommander.Parameter; 20 | import lombok.Data; 21 | import lombok.experimental.Accessors; 22 | 23 | import javax.naming.ConfigurationException; 24 | import java.io.Serializable; 25 | import java.util.ArrayList; 26 | import java.util.Arrays; 27 | import java.util.List; 28 | 29 | /** 30 | * QuadFilterConfig stores all information for the {@link QuadFilter}. 31 | */ 32 | @Data 33 | @Accessors(chain = true) 34 | public class QuadFilterConfig implements Serializable { 35 | 36 | @Parameter(names = "--filter.resource", description = "Accept resources of specified URI. More resource URIs can be specified by repeating this parameter.") 37 | private List resources = new ArrayList<>(); 38 | 39 | @Parameter(names = "--filter.resourceBlacklist", description = "Ignore resources of specified URI. More resource URIs can be specified by repeating this parameter.") 40 | private List resourceBlacklist = new ArrayList<>(); 41 | 42 | @Parameter(names = "--filter.relatedDepth", description = "Accept also resources related to the original set in relatedDepth directed steps. Uses an in-memory set of subject URIs, therefore can only be used for small results (e.g. 
less than 1 million resources selected).") 43 | private Integer relatedDepth = 0; 44 | 45 | @Parameter(names = "--filter.directed", arity = 1, description = "Whether to traverse only in the subject->object directions of relations when retrieving related resources.") 46 | private boolean directed = true; 47 | 48 | /** 49 | * Validate the config, throw {@link ConfigurationException} on error 50 | * 51 | * @throws ConfigurationException found error 52 | */ 53 | public void validate() throws ConfigurationException { 54 | if (relatedDepth > 0 && resources.isEmpty()) { 55 | throw new ConfigurationException("RelatedDepth > 0 has no effect when no filter resources are specified."); 56 | } 57 | } 58 | 59 | public QuadFilterConfig addResources(String... resources) { 60 | this.resources.addAll(Arrays.asList(resources)); 61 | return this; 62 | } 63 | 64 | public QuadFilterConfig addResourceBlacklist(String... resourceBlacklist) { 65 | this.resourceBlacklist.addAll(Arrays.asList(resourceBlacklist)); 66 | return this; 67 | } 68 | } 69 | -------------------------------------------------------------------------------- /src/main/java/com/merck/rdf2x/processing/formatting/SchemaFormatterConfig.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2017 Merck Sharp & Dohme Corp. a subsidiary of Merck & Co., 3 | * Inc., Kenilworth, NJ, USA. 4 | * 5 | * Licensed under the Apache License, Version 2.0 (the "License"); 6 | * you may not use this file except in compliance with the License. 7 | * You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 
16 | */ 17 | 18 | package com.merck.rdf2x.processing.formatting; 19 | 20 | import com.beust.jcommander.Parameter; 21 | import com.merck.rdf2x.flavors.Flavor; 22 | import lombok.Data; 23 | import lombok.experimental.Accessors; 24 | 25 | import java.io.Serializable; 26 | import java.util.HashSet; 27 | import java.util.Set; 28 | 29 | /** 30 | * SchemaFormatterConfig stores all instructions for the SchemaFormatter. 31 | */ 32 | 33 | @Data 34 | @Accessors(chain = true) 35 | public class SchemaFormatterConfig implements Serializable { 36 | @Parameter(names = "--formatting.maxTableNameLength", description = "Maximum length of entity table names") 37 | private Integer maxTableNameLength = 25; 38 | 39 | @Parameter(names = "--formatting.maxColumnNameLength", description = "Maximum length of column names") 40 | private Integer maxColumnNameLength = 50; 41 | 42 | @Parameter(names = "--formatting.uriSuffixPattern", description = "When collecting name from URI, use the segment after the last occurrence of this regex") 43 | private String uriSuffixPattern = "[/:#=]"; 44 | 45 | @Parameter(names = "--formatting.useLabels", arity = 1, description = "Try to use rdfs:label for formatting names. Will use URIs if label is not present.") 46 | private boolean useLabels = false; 47 | 48 | private Set reservedNames = new HashSet<>(); 49 | /** 50 | * Flavor containing custom conversion methods 51 | */ 52 | private Flavor flavor; 53 | } 54 | -------------------------------------------------------------------------------- /src/main/java/com/merck/rdf2x/processing/indexing/GlobalInstanceIndexer.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2017 Merck Sharp & Dohme Corp. a subsidiary of Merck & Co., 3 | * Inc., Kenilworth, NJ, USA. 4 | * 5 | * Licensed under the Apache License, Version 2.0 (the "License"); 6 | * you may not use this file except in compliance with the License. 
7 | * You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package com.merck.rdf2x.processing.indexing; 19 | 20 | import com.merck.rdf2x.beans.Instance; 21 | import org.apache.spark.api.java.JavaRDD; 22 | 23 | import java.io.Serializable; 24 | 25 | /** 26 | * GlobalInstanceIndexer adds a globally unique ID to a RDD of {@link Instance}s. 27 | */ 28 | public class GlobalInstanceIndexer implements InstanceIndexer, Serializable { 29 | 30 | /** 31 | * Add a globally unique index to each {@link Instance} in a RDD, starting from 1. 32 | * 33 | * @param instances RDD of {@link Instance}s 34 | * @return the modified RDD of {@link Instance}s with IDs 35 | */ 36 | public JavaRDD addIDs(JavaRDD instances) { 37 | return instances.zipWithUniqueId().map(instanceWithId -> { 38 | Instance instance = instanceWithId._1(); 39 | instance.setId(instanceWithId._2() + 1); 40 | return instance; 41 | }); 42 | } 43 | } 44 | -------------------------------------------------------------------------------- /src/main/java/com/merck/rdf2x/processing/indexing/InstanceIndexer.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2017 Merck Sharp & Dohme Corp. a subsidiary of Merck & Co., 3 | * Inc., Kenilworth, NJ, USA. 4 | * 5 | * Licensed under the Apache License, Version 2.0 (the "License"); 6 | * you may not use this file except in compliance with the License.
7 | * You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package com.merck.rdf2x.processing.indexing; 19 | 20 | import com.merck.rdf2x.beans.Instance; 21 | import org.apache.spark.api.java.JavaRDD; 22 | 23 | /** 24 | * InstanceIndexer defines an interface for adding IDs to a RDD of {@link Instance}s. 25 | */ 26 | public interface InstanceIndexer { 27 | 28 | /** 29 | * Add an ID to each {@link Instance} in a RDD. 30 | * 31 | * @param instances RDD of {@link Instance}s 32 | * @return the modified RDD of {@link Instance}s with IDs 33 | */ 34 | JavaRDD addIDs(JavaRDD instances); 35 | } 36 | -------------------------------------------------------------------------------- /src/main/java/com/merck/rdf2x/processing/partitioning/InstancePartitioner.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2017 Merck Sharp & Dohme Corp. a subsidiary of Merck & Co., 3 | * Inc., Kenilworth, NJ, USA. 4 | * 5 | * Licensed under the Apache License, Version 2.0 (the "License"); 6 | * you may not use this file except in compliance with the License. 7 | * You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License.
16 | */ 17 | 18 | package com.merck.rdf2x.processing.partitioning; 19 | 20 | import com.merck.rdf2x.beans.Instance; 21 | import lombok.RequiredArgsConstructor; 22 | import lombok.extern.slf4j.Slf4j; 23 | import org.apache.spark.HashPartitioner; 24 | import org.apache.spark.api.java.JavaPairRDD; 25 | import org.apache.spark.api.java.JavaRDD; 26 | import scala.Tuple2; 27 | 28 | import java.io.Serializable; 29 | import java.util.Map; 30 | 31 | /** 32 | * InstancePartitioner partitions instances by a specified partitioning (e.g. by type) 33 | */ 34 | @Slf4j 35 | @RequiredArgsConstructor 36 | public class InstancePartitioner implements Serializable { 37 | /** 38 | * Config storing partitioning type 39 | */ 40 | private final InstancePartitionerConfig config; 41 | 42 | /** 43 | * Partition instances by the specified partitioning (e.g. by instance type) 44 | * 45 | * @param instances RDD of instances to partition 46 | * @return partitioned RDD if requested, original RDD if no partitioning is specified 47 | */ 48 | public JavaRDD partition(JavaRDD instances) { 49 | if (!config.isRepartitionByType()) { 50 | return instances; 51 | } 52 | log.info("Getting counts by type hash"); 53 | Map typeCounts = getApproximateTypeHashCounts(instances); 54 | int numPartitions = instances.getNumPartitions(); 55 | long totalInstances = instances.count(); 56 | long instancesPerPartition = totalInstances / numPartitions + 1; 57 | 58 | JavaPairRDD instanceWithPartitions = instances.mapToPair(instance -> { 59 | int typeHash = getTypeHash(instance); 60 | int splitIncrement = getSplitIncrement(instance.getId(), typeCounts.get(typeHash), instancesPerPartition); 61 | return new Tuple2<>(typeHash + splitIncrement, instance); 62 | }); 63 | 64 | log.info("Partitioning instances by type"); 65 | return instanceWithPartitions 66 | .partitionBy(new HashPartitioner(numPartitions)) 67 | .values(); 68 | } 69 | 70 | private Map getApproximateTypeHashCounts(JavaRDD instances) { 71 | return 
instances.map(this::getTypeHash).countByValue(); 72 | } 73 | 74 | private int getTypeHash(Instance instance) { 75 | return instance.getTypes().hashCode() * 31; 76 | } 77 | 78 | private int getSplitIncrement(Long seed, long numInstances, long instancesPerPartition) { 79 | int numSplits = (int) (numInstances / instancesPerPartition) + 1; 80 | return (int) (seed % numSplits); 81 | } 82 | 83 | } 84 | -------------------------------------------------------------------------------- /src/main/java/com/merck/rdf2x/processing/partitioning/InstancePartitionerConfig.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2017 Merck Sharp & Dohme Corp. a subsidiary of Merck & Co., 3 | * Inc., Kenilworth, NJ, USA. 4 | * 5 | * Licensed under the Apache License, Version 2.0 (the "License"); 6 | * you may not use this file except in compliance with the License. 7 | * You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package com.merck.rdf2x.processing.partitioning; 19 | 20 | import com.beust.jcommander.Parameter; 21 | import com.merck.rdf2x.processing.schema.EntitySchemaCollector; 22 | import lombok.Data; 23 | import lombok.experimental.Accessors; 24 | 25 | import java.io.Serializable; 26 | 27 | /** 28 | * InstancePartitionerConfig stores instructions for the {@link InstancePartitioner}.
29 | */ 30 | @Data 31 | @Accessors(chain = true) 32 | public class InstancePartitionerConfig implements Serializable { 33 | 34 | @Parameter(names = "--instances.repartitionByType", arity = 1, description = "Whether to repartition instances by type. Profitable in local mode when, causes an expensive shuffle in cluster mode.", hidden = true) 35 | private boolean repartitionByType = false; 36 | } 37 | -------------------------------------------------------------------------------- /src/main/java/com/merck/rdf2x/processing/schema/EntitySchemaCollectorConfig.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2017 Merck Sharp & Dohme Corp. a subsidiary of Merck & Co., 3 | * Inc., Kenilworth, NJ, USA. 4 | * 5 | * Licensed under the Apache License, Version 2.0 (the "License"); 6 | * you may not use this file except in compliance with the License. 7 | * You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package com.merck.rdf2x.processing.schema; 19 | 20 | import com.beust.jcommander.Parameter; 21 | import lombok.Data; 22 | import lombok.experimental.Accessors; 23 | 24 | import java.io.Serializable; 25 | 26 | /** 27 | * EntitySchemaCollectorConfig stores instructions for the {@link EntitySchemaCollector}. 
28 | */ 29 | @Data 30 | @Accessors(chain = true) 31 | public class EntitySchemaCollectorConfig implements Serializable { 32 | 33 | @Parameter(names = "--entities.maxNumColumns", description = "Maximum number of columns for one table.") 34 | private Integer maxNumColumns = null; 35 | 36 | @Parameter(names = "--entities.minColumnNonNullFraction", description = "Properties require at least minColumnNonNullFraction non-null values to be stored as columns. The rest is stored in the Entity-Attribute-Value table (e.g. 0.4 = properties with less than 40% values present will be stored only in the EAV table, 0 = store all as columns, 1 = store all only in EAV table).") 37 | private Double minColumnNonNullFraction = 0.0; 38 | 39 | @Parameter(names = "--entities.redundantEAV", arity = 1, description = "Store all properties in the EAV table, including values that are already stored in columns.") 40 | private boolean redundantEAV = false; 41 | 42 | @Parameter(names = "--entities.redundantSubclassColumns", arity = 1, description = "Store all columns in subclass tables, even if they are also present in a superclass table. If false (default behavior), columns present in superclasses are removed, their superclass location is marked in the column meta table.") 43 | private boolean redundantSubclassColumns = false; 44 | 45 | @Parameter(names = "--entities.minNumRows", description = "Minimum number of rows required for an entity table. Tables with less rows will not be included.") 46 | private Integer minNumRows = 1; 47 | 48 | @Parameter(names = "--entities.sortColumnsAlphabetically", arity = 1, description = "Sort columns alphabetically. 
Otherwise by non-null ratio, most frequent first.") 49 | private boolean sortColumnsAlphabetically = false; 50 | 51 | @Parameter(names = "--entities.forceTypeSuffix", arity = 1, description = "Whether to always add a type suffix to columns, even if only one datatype is present.") 52 | private boolean forceTypeSuffix = false; 53 | 54 | } 55 | -------------------------------------------------------------------------------- /src/main/java/com/merck/rdf2x/processing/schema/RelationConfig.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2017 Merck Sharp & Dohme Corp. a subsidiary of Merck & Co., 3 | * Inc., Kenilworth, NJ, USA. 4 | * 5 | * Licensed under the Apache License, Version 2.0 (the "License"); 6 | * you may not use this file except in compliance with the License. 7 | * You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package com.merck.rdf2x.processing.schema; 19 | 20 | import com.beust.jcommander.Parameter; 21 | import com.merck.rdf2x.processing.relations.RelationExtractor; 22 | import lombok.Data; 23 | import lombok.experimental.Accessors; 24 | 25 | import java.io.Serializable; 26 | 27 | import static com.merck.rdf2x.processing.schema.RelationSchemaStrategy.Types; 28 | 29 | /** 30 | * RelationConfig stores instructions for the {@link RelationSchemaCollector} and {@link RelationExtractor}. 
31 | */ 32 | @Data 33 | @Accessors(chain = true) 34 | public class RelationConfig implements Serializable { 35 | 36 | @Parameter(names = "--relations.schema", description = "How to create relation tables (SingleTable, Types, Predicates, TypePredicates, None)") 37 | private RelationSchemaStrategy schemaStrategy = Types; 38 | 39 | @Parameter(names = "--relations.rootTypesOnly", arity = 1, description = "When creating relation tables between two instances of multiple types, create the relation table only for the root type pair. If false, relation tables are created for all combinations of types.") 40 | private boolean rootTypesOnly = true; 41 | /** 42 | * Whether entity names are forbidden to be used for predicate relation table names (only necessary if no prefix is added to table names) 43 | */ 44 | private boolean entityNamesForbidden = true; 45 | } 46 | -------------------------------------------------------------------------------- /src/main/java/com/merck/rdf2x/processing/schema/RelationSchemaStrategy.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2017 Merck Sharp & Dohme Corp. a subsidiary of Merck & Co., 3 | * Inc., Kenilworth, NJ, USA. 4 | * 5 | * Licensed under the Apache License, Version 2.0 (the "License"); 6 | * you may not use this file except in compliance with the License. 7 | * You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 
16 | */ 17 | 18 | package com.merck.rdf2x.processing.schema; 19 | 20 | /** 21 | * RelationSchemaStrategy defines which relation table schema to create 22 | */ 23 | public enum RelationSchemaStrategy { 24 | /** 25 | * SingleTable - Store all relations in a single table. 26 | */ 27 | SingleTable, 28 | /** 29 | * Types - Create relation tables for all combinations of the two instance's types (redundant). 30 | */ 31 | Types, 32 | /** 33 | * Predicates - Create one relation table for each predicate 34 | */ 35 | Predicates, 36 | /** 37 | * TypePredicates - Create one relation table for each predicate between two entity tables 38 | */ 39 | TypePredicates, 40 | /** 41 | * None - Do not extract relations 42 | */ 43 | None 44 | } 45 | -------------------------------------------------------------------------------- /src/main/java/com/merck/rdf2x/rdf/LiteralType.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2017 Merck Sharp & Dohme Corp. a subsidiary of Merck & Co., 3 | * Inc., Kenilworth, NJ, USA. 4 | * 5 | * Licensed under the Apache License, Version 2.0 (the "License"); 6 | * you may not use this file except in compliance with the License. 7 | * You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package com.merck.rdf2x.rdf; 19 | 20 | import com.merck.rdf2x.beans.Predicate; 21 | import org.apache.commons.lang3.NotImplementedException; 22 | 23 | /** 24 | * LiteralType defines the type of a literal value. Used along with a URI to define a {@link Predicate}. 
25 | */ 26 | public class LiteralType { 27 | public final static int UNKNOWN = 0; 28 | public final static int STRING = 1; 29 | public final static int FLOAT = 2; 30 | public final static int DOUBLE = 3; 31 | public final static int INTEGER = 4; 32 | public final static int LONG = 5; 33 | public final static int BOOLEAN = 6; 34 | public final static int DATETIME = 7; 35 | 36 | private final static String[] NAMES = new String[]{ 37 | "UNKNOWN", "STRING", "FLOAT", "DOUBLE", "INTEGER", "LONG", "BOOLEAN", "DATETIME" 38 | }; 39 | 40 | public static String toString(int literalType) { 41 | if (literalType < 0 || literalType >= NAMES.length) { 42 | throw new NotImplementedException("Missing type representation for literal type " + literalType); 43 | } 44 | return NAMES[literalType]; 45 | } 46 | } 47 | -------------------------------------------------------------------------------- /src/main/java/com/merck/rdf2x/rdf/parsing/ElephasQuadParser.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2017 Merck Sharp & Dohme Corp. a subsidiary of Merck & Co., 3 | * Inc., Kenilworth, NJ, USA. 4 | * 5 | * Licensed under the Apache License, Version 2.0 (the "License"); 6 | * you may not use this file except in compliance with the License. 7 | * You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 
16 | */ 17 | 18 | package com.merck.rdf2x.rdf.parsing; 19 | 20 | import lombok.RequiredArgsConstructor; 21 | import lombok.extern.slf4j.Slf4j; 22 | import org.apache.hadoop.conf.Configuration; 23 | import org.apache.hadoop.io.LongWritable; 24 | import org.apache.hadoop.mapreduce.lib.input.NLineInputFormat; 25 | import org.apache.jena.hadoop.rdf.io.RdfIOConstants; 26 | import org.apache.jena.hadoop.rdf.io.input.TriplesOrQuadsInputFormat; 27 | import org.apache.jena.hadoop.rdf.io.input.nquads.NQuadsInputFormat; 28 | import org.apache.jena.hadoop.rdf.types.QuadWritable; 29 | import org.apache.jena.sparql.core.Quad; 30 | import org.apache.spark.api.java.JavaRDD; 31 | import org.apache.spark.api.java.JavaSparkContext; 32 | 33 | import java.util.List; 34 | 35 | /** 36 | * ElephasQuadParser parses a variety of RDF formats into a RDD of {@link Quad}. It is based on Jena Elephas Hadoop parser. 37 | *

38 | * It operates in two modes: Line based (NQuads) and Whole file based (Turtle, JSON-LD, ...). 39 | */ 40 | @Slf4j 41 | @RequiredArgsConstructor 42 | public class ElephasQuadParser implements QuadParser { 43 | /** 44 | * parser config 45 | */ 46 | private final QuadParserConfig config; 47 | /** 48 | * Spark context to be used 49 | */ 50 | transient private final JavaSparkContext sc; 51 | 52 | @Override 53 | public JavaRDD parseQuads(String path) { 54 | 55 | Configuration conf = new Configuration(); 56 | 57 | Integer batchSize = config.getBatchSize(); 58 | conf.set(NLineInputFormat.LINES_PER_MAP, batchSize.toString()); 59 | 60 | if (config.getErrorHandling() == ParseErrorHandling.Throw) { 61 | conf.set(RdfIOConstants.INPUT_IGNORE_BAD_TUPLES, "false"); 62 | } else { 63 | conf.set(RdfIOConstants.INPUT_IGNORE_BAD_TUPLES, "true"); 64 | } 65 | 66 | Boolean isLineBased = config.getLineBasedFormat(); 67 | if (isLineBased == null) { 68 | isLineBased = guessIsLineBasedFormat(path); 69 | } 70 | JavaRDD quads; 71 | Integer partitions = config.getRepartition(); 72 | if (isLineBased) { 73 | log.info("Parsing RDF in parallel with batch size: {}", batchSize); 74 | quads = sc.newAPIHadoopFile(path, 75 | NQuadsInputFormat.class, 76 | LongWritable.class, // position 77 | QuadWritable.class, // value 78 | conf).values().map(QuadWritable::get); 79 | } else { 80 | // let Jena guess the format, load whole files 81 | log.info("Input format is not line based, parsing RDF by Master node only."); 82 | quads = sc.newAPIHadoopFile(path, 83 | TriplesOrQuadsInputFormat.class, 84 | LongWritable.class, // position 85 | QuadWritable.class, // value 86 | conf).values().map(QuadWritable::get); 87 | 88 | if (partitions == null) { 89 | log.warn("Reading non-line based formats by master node only, consider setting --parsing.repartition to redistribute work to other nodes."); 90 | } 91 | } 92 | if (partitions != null) { 93 | log.info("Distributing workload, repartitioning into {} partitions", 
partitions); 94 | quads = quads.repartition(partitions); 95 | } 96 | 97 | 98 | final List acceptedLanguages = config.getAcceptedLanguages(); 99 | // if only some languages are accepted 100 | if (!acceptedLanguages.isEmpty()) { 101 | // filter out literals of unsupported languages 102 | quads = quads.filter(quad -> 103 | !quad.getObject().isLiteral() || 104 | quad.getObject().getLiteralLanguage() == null || 105 | quad.getObject().getLiteralLanguage().isEmpty() || 106 | acceptedLanguages.contains(quad.getObject().getLiteralLanguage()) 107 | ); 108 | } 109 | 110 | return quads; 111 | } 112 | 113 | private boolean guessIsLineBasedFormat(String path) { 114 | if (path.endsWith(".nq") || path.endsWith(".nq.gz") || path.endsWith(".nt") || path.endsWith(".nt.gz")) { 115 | return true; 116 | } else { 117 | log.warn("Unable to guess input file format, parsing by master node only."); 118 | return false; 119 | } 120 | } 121 | 122 | 123 | } -------------------------------------------------------------------------------- /src/main/java/com/merck/rdf2x/rdf/parsing/ParseErrorHandling.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2017 Merck Sharp & Dohme Corp. a subsidiary of Merck & Co., 3 | * Inc., Kenilworth, NJ, USA. 4 | * 5 | * Licensed under the Apache License, Version 2.0 (the "License"); 6 | * you may not use this file except in compliance with the License. 7 | * You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 
/**
 * ParseErrorHandling is used in config objects to define handling of RDF parse errors.
 */
public enum ParseErrorHandling {
    /**
     * Ignore - silently skip quads containing errors.
     */
    Ignore,

    /**
     * Store - keep quads containing errors in an Error table.
     */
    Store,

    /**
     * Throw - fail fast with an exception on the first error.
     */
    Throw
}
/**
 * QuadParser defines an interface for parsing RDF files into a RDD of Quads.
 * Implemented by {@link ElephasQuadParser}.
 */
public interface QuadParser {

    /**
     * Parse RDF file into a RDD of Jena {@link Quad}s.
     *
     * @param path Path to RDF file or folder
     * @return RDD of Jena Quads
     */
    JavaRDD parseQuads(String path);

}
/**
 * QuadParserConfig stores parameters for parsing RDF input files into a RDD of Quads
 * (used by {@link ElephasQuadParser}).
 */
@Data
@Accessors(chain = true)
public class QuadParserConfig implements Serializable {

    @Parameter(names = "--input.lineBasedFormat", arity = 1, description = "Whether the input files can be read line by line (e.g. true for NTriples or NQuads, false for Turtle). In default, will try to guess based on file extension. Line based formats can be parsed by multiple nodes at the same time, other formats will be read by master node and repartitioned after parsing.")
    private Boolean lineBasedFormat = null;

    // null means no repartitioning after parsing
    @Parameter(names = "--input.repartition", description = "Repartition after parsing into this number of partitions.")
    private Integer repartition = null;

    @Parameter(names = "--input.batchSize", description = "Batch size for parsing line-based formats (number of quads per partition)")
    private Integer batchSize = 500000;

    // NOTE(review): ParseErrorHandling also defines Store, which is not listed in this help text — confirm whether Store is supported
    @Parameter(names = "--input.errorHandling", description = "How to handle RDF parsing errors (Ignore, Throw).")
    private ParseErrorHandling errorHandling = ParseErrorHandling.Ignore;

    // empty list means all languages are accepted
    @Parameter(names = "--input.acceptedLanguage", description = "Accepted language. Literals in other languages are ignored. You can specify more languages by repeating this parameter.")
    private List acceptedLanguages = new ArrayList<>();
}
16 | */ 17 | 18 | package com.merck.rdf2x.rdf.schema; 19 | 20 | import org.jgrapht.experimental.dag.DirectedAcyclicGraph; 21 | import org.jgrapht.graph.DefaultEdge; 22 | 23 | import java.io.Serializable; 24 | import java.util.Collections; 25 | import java.util.HashSet; 26 | import java.util.Set; 27 | 28 | /** 29 | * ClassGraph wraps a JGraphT graph to provide nicer methods 30 | */ 31 | public class ClassGraph implements Serializable { 32 | 33 | /** 34 | * graph of edges from superclass to subclass 35 | */ 36 | private final DirectedAcyclicGraph graph; 37 | 38 | /** 39 | * Create new empty graph 40 | */ 41 | public ClassGraph() { 42 | this.graph = new DirectedAcyclicGraph<>(DefaultEdge.class); 43 | } 44 | 45 | /** 46 | * @param graph graph of edges from superclass to subclass 47 | */ 48 | public ClassGraph(DirectedAcyclicGraph graph) { 49 | this.graph = graph; 50 | } 51 | 52 | /** 53 | * Get all types in topological order - every superclass precedes a subclass. 54 | * 55 | * @return types in topological order - every superclass precedes a subclass 56 | */ 57 | public Iterable inSuperClassFirstOrder() { 58 | // return empty list if no vertices are added to avoid JGraphT exception 59 | return graph.vertexSet().isEmpty() ? Collections.emptyList() : graph; 60 | } 61 | 62 | /** 63 | * Get set of edges (from superclass to subclass) 64 | * @return set of edges (from superclass to subclass) 65 | */ 66 | public Set edgeSet() { 67 | return graph.edgeSet(); 68 | } 69 | 70 | /** 71 | * Get superclasses (type indexes) of given class (type index) 72 | * @param typeIndex type index to find superclasses for 73 | * @return superclasses (type indexes) of given class (type index) 74 | */ 75 | public Set getSuperClasses(Integer typeIndex) { 76 | return graph.vertexSet().isEmpty() ? 
Collections.emptySet() : graph.getAncestors(graph, typeIndex); 77 | } 78 | 79 | /** 80 | * Return whether the class (type index) has any superclasses 81 | * @param typeIndex type index to find superclasses for 82 | * @return true if the class (type index) has any superclasses, false otherwise 83 | */ 84 | public boolean hasSuperClasses(Integer typeIndex) { 85 | return !graph.edgeSet().isEmpty() && !graph.incomingEdgesOf(typeIndex).isEmpty(); 86 | } 87 | 88 | /** 89 | * Return whether the class (type index) has any subclasses 90 | * @param typeIndex type index to find superclasses for 91 | * @return true if the class (type index) has any subclasses, false otherwise 92 | */ 93 | public boolean hasSubClasses(Integer typeIndex) { 94 | return !graph.edgeSet().isEmpty() && !graph.outgoingEdgesOf(typeIndex).isEmpty(); 95 | } 96 | 97 | /** 98 | * Get superclass from edge (object of subClassOf statement) 99 | * @param edge edge to get superclass from 100 | * @return superclass of the given edge (object of subClassOf statement) 101 | */ 102 | public Integer getEdgeSuperclass(DefaultEdge edge) { 103 | return graph.getEdgeSource(edge); 104 | } 105 | 106 | /** 107 | * Get subclass from edge (subject of subClassOf statement) 108 | * @param edge edge to get subclass from 109 | * @return subclass of the given edge (subject of subClassOf statement) 110 | */ 111 | public Integer getEdgeSubclass(DefaultEdge edge) { 112 | return graph.getEdgeTarget(edge); 113 | } 114 | 115 | /** 116 | * Get all classes (type indexes) 117 | * @return set of all classes (type indexes) 118 | */ 119 | public Set getClasses() { 120 | return graph.vertexSet(); 121 | } 122 | 123 | /** 124 | * Remove all vertices that are not in typeIndexes set 125 | * 126 | * @param typeIndexes set of vertices (type indexes) to preserve 127 | */ 128 | public void keepOnlyClasses(Set typeIndexes) { 129 | Set toRemove = new HashSet<>(graph.vertexSet()); 130 | toRemove.removeAll(typeIndexes); 131 | 
graph.removeAllVertices(toRemove); 132 | } 133 | } 134 | -------------------------------------------------------------------------------- /src/main/java/com/merck/rdf2x/rdf/schema/RdfSchema.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2017 Merck Sharp & Dohme Corp. a subsidiary of Merck & Co., 3 | * Inc., Kenilworth, NJ, USA. 4 | * 5 | * Licensed under the Apache License, Version 2.0 (the "License"); 6 | * you may not use this file except in compliance with the License. 7 | * You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package com.merck.rdf2x.rdf.schema; 19 | 20 | import com.merck.rdf2x.beans.IndexMap; 21 | import lombok.Data; 22 | 23 | import java.io.Serializable; 24 | import java.util.Map; 25 | 26 | /** 27 | * RdfSchema stores various RDF Schema information, such as the class graph. 
/**
 * RdfSchema stores various RDF Schema information, such as the class graph.
 * Immutable value holder produced by the RdfSchemaCollector.
 */
@Data
public class RdfSchema implements Serializable {

    /**
     * config that this schema was extracted with
     */
    private final RdfSchemaCollectorConfig config;
    /**
     * Graph of superclass information (edges from superclass to subclass)
     */
    private final ClassGraph classGraph;
    /**
     * Type index maps type URIs to integers
     */
    private final IndexMap typeIndex;
    /**
     * Predicate index maps predicate URIs to integers
     */
    private final IndexMap predicateIndex;
    /**
     * Map of type and predicate URIs to their rdfs:label values
     */
    private final Map uriLabels;
}
/**
 * RdfSchemaCollectorConfig stores parameters for the {@link RdfSchemaCollector}.
 */
@Data
@Accessors(chain = true)
public class RdfSchemaCollectorConfig implements Serializable {
    @Parameter(names = "--rdf.typePredicate", description = "Additional URI apart from rdf:type to treat as type predicate. You can specify more predicates by repeating this parameter.", hidden = true)
    private List typePredicates = new ArrayList<>();

    @Parameter(names = "--rdf.subclassPredicate", description = "Additional URI apart from rdfs:subClassOf to treat as subClassOf predicate. You can specify more predicates by repeating this parameter.", hidden = true)
    private List subclassPredicates = new ArrayList<>();

    @Parameter(names = "--rdf.collectSubclassGraph", arity = 1, description = "Whether to collect the graph of subClass predicates.", hidden = true)
    private boolean collectSubclassGraph = true;

    @Parameter(names = "--rdf.collectLabels", arity = 1, description = "Whether to collect type and predicate labels (to be saved in meta tables and for name formatting if requested).", hidden = true)
    private boolean collectLabels = true;

    // null means no schema caching
    @Parameter(names = "--rdf.cacheFile", description = "File for saving and loading cached schema.", hidden = true)
    private String cacheFile = null;
    /**
     * List of additional type IRIs to include in the type index.
     * Not exposed as a command-line parameter; set programmatically.
     */
    private List additionalTypes = new ArrayList<>();
    /**
     * List of additional predicate IRIs to include in the predicate index.
     * Not exposed as a command-line parameter; set programmatically.
     */
    private List additionalPredicates = new ArrayList<>();
}
4 | * 5 | * Licensed under the Apache License, Version 2.0 (the "License"); 6 | * you may not use this file except in compliance with the License. 7 | * You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package com.merck.rdf2x.spark; 19 | 20 | import com.merck.rdf2x.beans.*; 21 | import com.merck.rdf2x.rdf.LiteralType; 22 | import lombok.extern.slf4j.Slf4j; 23 | import org.apache.jena.sparql.core.Quad; 24 | import org.apache.spark.SparkConf; 25 | import org.apache.spark.api.java.JavaSparkContext; 26 | import org.apache.spark.sql.catalyst.InternalRow; 27 | import org.apache.spark.sql.catalyst.expressions.GenericInternalRow; 28 | 29 | import java.util.HashMap; 30 | import java.util.HashSet; 31 | 32 | /** 33 | * SparkContextProvider provides a {@link JavaSparkContext} based on default settings. 
34 | */ 35 | @Slf4j 36 | public class SparkContextProvider { 37 | /** 38 | * Provide a {@link JavaSparkContext} based on default settings 39 | * 40 | * @return a {@link JavaSparkContext} based on default settings 41 | */ 42 | public static JavaSparkContext provide() { 43 | SparkConf config = new SparkConf() 44 | .set("spark.serializer", "org.apache.spark.serializer.KryoSerializer") 45 | .registerKryoClasses(getSerializableClasses()); 46 | 47 | if (!config.contains("spark.app.name")) { 48 | config.setAppName("RDF2X"); 49 | } 50 | if (!config.contains("spark.master")) { 51 | config.setMaster("local"); 52 | } 53 | 54 | // set serialization registration required if you want to make sure you registered all your classes 55 | // some spark internal classes will need to be registered as well 56 | // config.set("spark.kryo.registrationRequired", "true"); 57 | 58 | 59 | log.info("Getting Spark Context for config: \n{}", config.toDebugString()); 60 | return new JavaSparkContext(config); 61 | } 62 | 63 | public static Class[] getSerializableClasses() { 64 | return new Class[]{ 65 | Instance.class, Predicate.class, RelationPredicate.class, RelationRow.class, 66 | TypeID.class, HashMap.class, HashSet.class, LiteralType.class, Object[].class, 67 | InternalRow[].class, GenericInternalRow.class, IndexMap.class, Quad.class 68 | }; 69 | } 70 | 71 | } 72 | -------------------------------------------------------------------------------- /src/main/java/com/merck/rdf2x/stats/QuadCounter.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2017 Merck Sharp & Dohme Corp. a subsidiary of Merck & Co., 3 | * Inc., Kenilworth, NJ, USA. 4 | * 5 | * Licensed under the Apache License, Version 2.0 (the "License"); 6 | * you may not use this file except in compliance with the License. 
7 | * You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package com.merck.rdf2x.stats; 19 | 20 | import org.apache.jena.sparql.core.Quad; 21 | import org.apache.spark.api.java.JavaPairRDD; 22 | import org.apache.spark.api.java.JavaRDD; 23 | import scala.Tuple2; 24 | 25 | /** 26 | * QuadCounter provides methods for counting {@link Quad}s 27 | */ 28 | public class QuadCounter { 29 | 30 | public static JavaPairRDD countBySubjectURI(JavaRDD quads) { 31 | return quads 32 | .filter(quad -> quad.getSubject().isURI()) 33 | .mapToPair(quad -> new Tuple2<>(quad.getSubject().getURI(), 1L)) 34 | .reduceByKey((a, b) -> a + b); 35 | } 36 | 37 | public static JavaPairRDD countByPredicateURI(JavaRDD quads) { 38 | return quads 39 | .filter(quad -> quad.getPredicate().isURI()) 40 | .mapToPair(quad -> new Tuple2<>(quad.getPredicate().getURI(), 1L)) 41 | .reduceByKey((a, b) -> a + b); 42 | } 43 | 44 | public static JavaPairRDD getObjectURI(JavaRDD quads) { 45 | return quads 46 | .filter(quad -> quad.getObject().isURI()) 47 | .mapToPair(quad -> new Tuple2<>(quad.getObject().getURI(), 1L)) 48 | .reduceByKey((a, b) -> a + b); 49 | } 50 | 51 | } 52 | -------------------------------------------------------------------------------- /src/main/resources/examples/czechia.ttl: -------------------------------------------------------------------------------- 1 | @base . 2 | @prefix example: . 3 | @prefix rdfs: . 4 | 5 | <#Prague> 6 | a example:City; 7 | example:name "Prague", "Praha"@cs; 8 | example:capitalOf <#Czechia>; 9 | example:population 1267449. 
10 | 11 | <#Czechia> 12 | a example:Country; 13 | example:name "Czech republic", "Czechia", "Česká republika"@cs; 14 | example:neighborOf <#Germany>; 15 | example:population 10553843. 16 | 17 | <#Germany> 18 | a example:Country; 19 | example:name "Germany", "Deutschland"@de; 20 | example:neighborOf <#Czechia>. 21 | 22 | example:City rdfs:subClassOf example:Location. 23 | example:Country rdfs:subClassOf example:Location. 24 | 25 | -------------------------------------------------------------------------------- /src/main/resources/examples/example1.ttl: -------------------------------------------------------------------------------- 1 | @base . 2 | @prefix rdf: . 3 | @prefix rdfs: . 4 | @prefix foaf: . 5 | @prefix rel: . 6 | 7 | <#green-goblin> 8 | rel:enemyOf <#spiderman> ; 9 | a foaf:Person ; # in the context of the Marvel universe 10 | foaf:name "Green Goblin" . 11 | 12 | <#spiderman> 13 | rel:enemyOf <#green-goblin> ; 14 | a foaf:Person ; 15 | foaf:name "Spiderman", "Человек-паук"@ru . -------------------------------------------------------------------------------- /src/main/resources/examples/exampleMultipleTypes.ttl: -------------------------------------------------------------------------------- 1 | @base . 2 | @prefix foaf: . 3 | 4 | <#spiderman> 5 | a foaf:Person, foaf:Agent ; 6 | foaf:name "Spiderman". 7 | 8 | <#lexcorp> 9 | a foaf:Organization, foaf:Agent ; 10 | foaf:name "LexCorp" ; 11 | foaf:homepage "https://www.lexcorp.io/". 12 | -------------------------------------------------------------------------------- /src/main/resources/examples/fruit.nq: -------------------------------------------------------------------------------- 1 | . 2 | . 3 | . 4 | . 5 | "150"^^ . 6 | "180"^^ . 7 | "100"^^ . 8 | "TBD"^^ . 9 | . 10 | . 11 | "100"^^ . 12 | "110"^^ . 13 | . 14 | . 15 | . 16 | "true"^^ . 17 | "false"^^ . 18 | . 19 | "2016-01-04T00:00:00Z"^^ . 20 | "2016-01-04T00:00:00Z"^^ . 
-------------------------------------------------------------------------------- /src/main/resources/examples/locations.ttl: -------------------------------------------------------------------------------- 1 | @base . 2 | @prefix example: . 3 | @prefix rdfs: . 4 | 5 | <#Czechia> 6 | a example:Country; 7 | example:name "Czech republic", "Czechia", "Česká republika"@cs; 8 | example:neighborOf <#Germany>; 9 | example:population 10553843. 10 | 11 | <#Germany> 12 | a example:Country; 13 | example:name "Germany", "Deutschland"@de; 14 | example:neighborOf <#Czechia>. 15 | 16 | <#Vltava> 17 | a example:River; 18 | example:name "Vltava", "Vltava"@cs; 19 | example:basinCountry <#Czechia>. 20 | 21 | example:Country rdfs:subClassOf example:Location. 22 | 23 | -------------------------------------------------------------------------------- /src/main/resources/examples/multivalued.ttl: -------------------------------------------------------------------------------- 1 | @base . 2 | @prefix foaf: . 3 | 4 | <#spiderman> 5 | a foaf:Person; 6 | foaf:name "Spiderman", "Spider-Man", "Spider man", "Человек-паук"@ru. 7 | 8 | <#green-goblin> 9 | a foaf:Person; 10 | foaf:name "Green Goblin" . 
11 | -------------------------------------------------------------------------------- /src/main/resources/log4j.properties: -------------------------------------------------------------------------------- 1 | # Root logger option 2 | log4j.rootLogger=INFO, stdout 3 | # Direct log messages to stdout 4 | log4j.appender.stdout=org.apache.log4j.ConsoleAppender 5 | log4j.appender.stdout.Target=System.out 6 | log4j.appender.stdout.layout=org.apache.log4j.PatternLayout 7 | log4j.appender.stdout.layout.ConversionPattern=%d{yyyy-MM-dd HH:mm:ss} %-5p %c{1}:%L - %m%n 8 | log4j.logger.org.spark_project=WARN 9 | log4j.logger.org.apache.spark=WARN 10 | log4j.logger.com.merck=DEBUG 11 | log4j.logger.org.apache.jena.hadoop.rdf.io.input.readers=ERROR -------------------------------------------------------------------------------- /src/main/resources/test/datasets/aggregatorTest.nq: -------------------------------------------------------------------------------- 1 | "invalidValueTest"^^ . 2 | . 3 | . 4 | . 5 | . 6 | "150"^^ . 7 | "180"^^ . 8 | "100"^^ . 9 | "unknown"^^ . 10 | . 11 | . 12 | "100"^^ . 13 | "110"^^ . 14 | . 15 | . 16 | "red" . 17 | "red"@en . 18 | "rouge"@fr . 19 | . 20 | "true"^^ . 21 | "false"^^ . 22 | . 23 | "red" . -------------------------------------------------------------------------------- /src/main/resources/test/datasets/convertJobTest.nq: -------------------------------------------------------------------------------- 1 | . 2 | . 3 | . 4 | . 5 | "150"^^ . 6 | "180"^^ . 7 | "100"^^ . 8 | "TBD"^^ . 9 | . 10 | . 11 | "100"^^ . 12 | "110"^^ . 13 | . 14 | . 15 | . 16 | "true"^^ . 17 | "false"^^ . 18 | . 19 | "2016-01-04T00:00:00Z"^^ . 20 | -------------------------------------------------------------------------------- /src/main/resources/test/datasets/filtering/input.nq: -------------------------------------------------------------------------------- 1 | "label A"^^ . 2 | "label B"^^ . 3 | "label 1"^^ . 4 | . 5 | "name 1"^^ . 6 | . 7 | "label 2"^^ . 8 | . 
9 | "name 2"^^ . 10 | . 11 | "label 3"^^ . 12 | . 13 | "name 3"^^ . 14 | . 15 | "label 4"^^ . 16 | . 17 | "name 4"^^ . 18 | . 19 | "label 5"^^ . 20 | . 21 | "name 5"^^ . 22 | . 23 | "label 1"^^ . 24 | . 25 | "name 1"^^ . 26 | . 27 | . 28 | -------------------------------------------------------------------------------- /src/main/resources/test/datasets/filtering/resource0.nq: -------------------------------------------------------------------------------- 1 | # "label A"^^ . 2 | # "label B"^^ . 3 | "label 1"^^ . 4 | . 5 | "name 1"^^ . 6 | . 7 | # "label 2"^^ . 8 | # . 9 | # "name 2"^^ . 10 | # . 11 | "label 3"^^ . 12 | . 13 | "name 3"^^ . 14 | . 15 | # "label 4"^^ . 16 | # . 17 | # "name 4"^^ . 18 | # . 19 | # "label 5"^^ . 20 | # . 21 | # "name 5"^^ . 22 | # . 23 | # "label 1"^^ . 24 | # . 25 | # "name 1"^^ . 26 | # . 27 | # . 28 | -------------------------------------------------------------------------------- /src/main/resources/test/datasets/filtering/resource1.nq: -------------------------------------------------------------------------------- 1 | "label A"^^ . 2 | # "label B"^^ . 3 | "label 1"^^ . 4 | . 5 | "name 1"^^ . 6 | . 7 | "label 2"^^ . 8 | . 9 | "name 2"^^ . 10 | . 11 | "label 3"^^ . 12 | . 13 | "name 3"^^ . 14 | . 15 | "label 4"^^ . 16 | . 17 | "name 4"^^ . 18 | . 19 | # "label 5"^^ . 20 | # . 21 | # "name 5"^^ . 22 | # . 23 | # "label 1"^^ . 24 | # . 25 | # "name 1"^^ . 26 | # . 27 | # . 28 | -------------------------------------------------------------------------------- /src/main/resources/test/datasets/filtering/resource2.nq: -------------------------------------------------------------------------------- 1 | "label A"^^ . 2 | # "label B"^^ . 3 | "label 1"^^ . 4 | . 5 | "name 1"^^ . 6 | . 7 | "label 2"^^ . 8 | . 9 | "name 2"^^ . 10 | . 11 | "label 3"^^ . 12 | . 13 | "name 3"^^ . 14 | . 15 | "label 4"^^ . 16 | . 17 | "name 4"^^ . 18 | . 19 | "label 5"^^ . 20 | . 21 | "name 5"^^ . 22 | . 23 | # "label 1"^^ . 24 | # . 25 | # "name 1"^^ . 26 | # . 
27 | # . 28 | -------------------------------------------------------------------------------- /src/main/resources/test/datasets/filtering/resource3.nq: -------------------------------------------------------------------------------- 1 | "label A"^^ . 2 | # "label B"^^ . 3 | "label 1"^^ . 4 | . 5 | "name 1"^^ . 6 | . 7 | "label 2"^^ . 8 | . 9 | "name 2"^^ . 10 | . 11 | "label 3"^^ . 12 | . 13 | "name 3"^^ . 14 | . 15 | "label 4"^^ . 16 | . 17 | "name 4"^^ . 18 | . 19 | "label 5"^^ . 20 | . 21 | "name 5"^^ . 22 | . 23 | # "label 1"^^ . 24 | # . 25 | # "name 1"^^ . 26 | # . 27 | # . 28 | -------------------------------------------------------------------------------- /src/main/resources/test/datasets/filtering/type0.nq: -------------------------------------------------------------------------------- 1 | # "label A"^^ . 2 | "label B"^^ . 3 | # "label 1"^^ . 4 | # . 5 | # "name 1"^^ . 6 | # . 7 | # "label 2"^^ . 8 | # . 9 | # "name 2"^^ . 10 | # . 11 | # "label 3"^^ . 12 | # . 13 | # "name 3"^^ . 14 | # . 15 | # "label 4"^^ . 16 | # . 17 | # "name 4"^^ . 18 | # . 19 | # "label 5"^^ . 20 | # . 21 | # "name 5"^^ . 22 | # . 23 | # "label 1"^^ . 24 | # . 25 | # "name 1"^^ . 26 | # . 27 | # . 28 | -------------------------------------------------------------------------------- /src/main/resources/test/datasets/filtering/type1.nq: -------------------------------------------------------------------------------- 1 | # "label A"^^ . 2 | "label B"^^ . 3 | # "label 1"^^ . 4 | # . 5 | # "name 1"^^ . 6 | # . 7 | # "label 2"^^ . 8 | # . 9 | # "name 2"^^ . 10 | # . 11 | # "label 3"^^ . 12 | # . 13 | # "name 3"^^ . 14 | # . 15 | # "label 4"^^ . 16 | # . 17 | # "name 4"^^ . 18 | # . 19 | # "label 5"^^ . 20 | # . 21 | # "name 5"^^ . 22 | # . 23 | "label 1"^^ . 24 | . 25 | "name 1"^^ . 26 | . 27 | . 
28 | -------------------------------------------------------------------------------- /src/main/resources/test/datasets/filtering/type2.nq: -------------------------------------------------------------------------------- 1 | # "label A"^^ . 2 | "label B"^^ . 3 | "label 1"^^ . 4 | . 5 | "name 1"^^ . 6 | . 7 | # "label 2"^^ . 8 | # . 9 | # "name 2"^^ . 10 | # . 11 | "label 3"^^ . 12 | . 13 | "name 3"^^ . 14 | . 15 | # "label 4"^^ . 16 | # . 17 | # "name 4"^^ . 18 | # . 19 | # "label 5"^^ . 20 | # . 21 | # "name 5"^^ . 22 | # . 23 | "label 1"^^ . 24 | . 25 | "name 1"^^ . 26 | . 27 | . 28 | -------------------------------------------------------------------------------- /src/main/resources/test/datasets/filtering/type3.nq: -------------------------------------------------------------------------------- 1 | "label A"^^ . 2 | "label B"^^ . 3 | "label 1"^^ . 4 | . 5 | "name 1"^^ . 6 | . 7 | "label 2"^^ . 8 | . 9 | "name 2"^^ . 10 | . 11 | "label 3"^^ . 12 | . 13 | "name 3"^^ . 14 | . 15 | "label 4"^^ . 16 | . 17 | "name 4"^^ . 18 | . 19 | "label 5"^^ . 20 | . 21 | "name 5"^^ . 22 | . 23 | "label 1"^^ . 24 | . 25 | "name 1"^^ . 26 | . 27 | . 28 | -------------------------------------------------------------------------------- /src/main/resources/test/datasets/parserTest.nq: -------------------------------------------------------------------------------- 1 | . 2 | . 3 | . 4 | . 5 | "100"^^ . 6 | "150"^^ . 7 | "TBD"^^ . 8 | . 9 | . 10 | "100"^^ . 11 | "110"^^ . 12 | . 13 | . 14 | . 15 | -------------------------------------------------------------------------------- /src/main/resources/test/datasets/parserTest.ttl: -------------------------------------------------------------------------------- 1 | @prefix ns0: . 2 | @prefix xsd: . 3 | @prefix ns1: . 4 | 5 | a . 6 | a . 7 | 8 | ns0:Flavor ; 9 | a ; 10 | ns0:Weight "100"^^xsd:int, "150"^^xsd:int, "TBD"^^xsd:string . 11 | 12 | 13 | ns0:Flavor , ; 14 | ns0:Weight "100"^^xsd:int ; 15 | ns1:Weight "110"^^xsd:int ; 16 | a , . 
17 | 18 | a . -------------------------------------------------------------------------------- /src/test/java/com/merck/rdf2x/jobs/JobFactoryTest.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2017 Merck Sharp & Dohme Corp. a subsidiary of Merck & Co., 3 | * Inc., Kenilworth, NJ, USA. 4 | * 5 | * Licensed under the Apache License, Version 2.0 (the "License"); 6 | * you may not use this file except in compliance with the License. 7 | * You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package com.merck.rdf2x.jobs; 19 | 20 | import com.merck.rdf2x.jobs.convert.ConvertJob; 21 | import lombok.extern.slf4j.Slf4j; 22 | import org.apache.spark.SparkContext; 23 | import org.junit.Test; 24 | 25 | import javax.naming.ConfigurationException; 26 | 27 | import static org.junit.Assert.*; 28 | 29 | /** 30 | * Test of {@link JobFactory} 31 | */ 32 | @Slf4j 33 | public class JobFactoryTest { 34 | 35 | @Test 36 | public void testNullJobs() throws ConfigurationException { 37 | assertNull(JobFactory.getJob(new String[]{""})); 38 | assertNull(JobFactory.getJob(new String[]{"convert"})); 39 | assertNull(JobFactory.getJob(new String[]{"convert", "--help"})); 40 | } 41 | 42 | @Test 43 | public void testGetPersistJob() throws ConfigurationException { 44 | Runnable job = JobFactory.getJob(new String[]{"convert", "--input.file", "test.nq", "--output.target", "Preview"}); 45 | // stop the created Spark Context to avoid conflicts in other tests 46 | SparkContext.getOrCreate().stop(); 47 | assertNotNull("Non-null write job 
returned from factory", job); 48 | assertEquals("Correct job returned from factory", ConvertJob.class, job.getClass()); 49 | } 50 | 51 | } 52 | -------------------------------------------------------------------------------- /src/test/java/com/merck/rdf2x/processing/filtering/QuadFilterTest.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2017 Merck Sharp & Dohme Corp. a subsidiary of Merck & Co., 3 | * Inc., Kenilworth, NJ, USA. 4 | * 5 | * Licensed under the Apache License, Version 2.0 (the "License"); 6 | * you may not use this file except in compliance with the License. 7 | * You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 
16 | */ 17 | 18 | package com.merck.rdf2x.processing.filtering; 19 | 20 | import com.merck.rdf2x.test.TestSparkContextProvider; 21 | import com.merck.rdf2x.test.TestUtils; 22 | import lombok.extern.slf4j.Slf4j; 23 | import org.apache.jena.sparql.core.Quad; 24 | import org.apache.spark.api.java.JavaRDD; 25 | import org.junit.Before; 26 | import org.junit.Test; 27 | 28 | import javax.naming.ConfigurationException; 29 | import java.io.Serializable; 30 | import java.util.Arrays; 31 | 32 | /** 33 | * Test of {@link QuadFilter} 34 | */ 35 | @Slf4j 36 | public class QuadFilterTest extends TestSparkContextProvider implements Serializable { 37 | 38 | private JavaRDD testRDD; 39 | 40 | @Before 41 | public void setUp() { 42 | testRDD = TestUtils.getQuadsRDD(jsc(), "filtering/input.nq"); 43 | } 44 | 45 | @Test 46 | public void testEmptyFilter() { 47 | QuadFilterConfig config = new QuadFilterConfig(); 48 | QuadFilter filter = new QuadFilter(config); 49 | assertRDDToStringEquals("Empty filter returns original RDD", testRDD, filter.filter(testRDD)); 50 | } 51 | 52 | @Test(expected = ConfigurationException.class) 53 | public void testRequiredResourcesWithNonZeroDepth() throws ConfigurationException { 54 | QuadFilterConfig config = new QuadFilterConfig() 55 | .setRelatedDepth(1); 56 | config.validate(); 57 | } 58 | 59 | @Test 60 | public void testDirectedResourceFilter() { 61 | for (int depth = 0; depth <= 3; depth++) { 62 | QuadFilterConfig config = new QuadFilterConfig() 63 | .setRelatedDepth(depth) 64 | .setResources(Arrays.asList("http://t.com/a/1", "http://t.com/a/3")); 65 | QuadFilter filter = new QuadFilter(config); 66 | JavaRDD expected = TestUtils.getQuadsRDD(jsc(), "filtering/resource" + depth + ".nq"); 67 | JavaRDD result = filter.filter(testRDD); 68 | assertRDDToStringEquals("Directed filter on resources with depth " + depth, expected, result); 69 | } 70 | } 71 | 72 | @Test 73 | public void testUndirectedResourceFilter() { 74 | for (int depth = 0; depth <= 3; depth++) { 
75 | QuadFilterConfig config = new QuadFilterConfig() 76 | .setRelatedDepth(depth) 77 | .setDirected(false) 78 | .setResources(Arrays.asList("http://t.com/b")); 79 | QuadFilter filter = new QuadFilter(config); 80 | JavaRDD expected = TestUtils.getQuadsRDD(jsc(), "filtering/type" + depth + ".nq"); 81 | JavaRDD result = filter.filter(testRDD); 82 | assertRDDToStringEquals("Undirected filter on type with depth " + depth, expected, result); 83 | } 84 | } 85 | 86 | 87 | } -------------------------------------------------------------------------------- /src/test/java/com/merck/rdf2x/processing/formatting/SchemaFormatterTest.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2017 Merck Sharp & Dohme Corp. a subsidiary of Merck & Co., 3 | * Inc., Kenilworth, NJ, USA. 4 | * 5 | * Licensed under the Apache License, Version 2.0 (the "License"); 6 | * you may not use this file except in compliance with the License. 7 | * You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 
16 | */ 17 | 18 | package com.merck.rdf2x.processing.formatting; 19 | 20 | import lombok.extern.slf4j.Slf4j; 21 | import org.junit.Test; 22 | 23 | import java.util.*; 24 | import java.util.function.BiFunction; 25 | 26 | import static org.junit.Assert.*; 27 | 28 | @Slf4j 29 | @SuppressWarnings("ConstantConditions") 30 | public class SchemaFormatterTest { 31 | 32 | /** 33 | * Test formatting individual table names 34 | */ 35 | @Test 36 | public void testFormatEntityNames() { 37 | SchemaFormatterConfig config = new SchemaFormatterConfig() 38 | .setMaxTableNameLength(20) 39 | .setReservedNames(new HashSet<>(Collections.singletonList("reserved_name"))); 40 | 41 | SchemaFormatter formatter = new SchemaFormatter(config); 42 | 43 | testFormatNames(formatter::getTypeNames, config.getMaxTableNameLength()); 44 | } 45 | 46 | 47 | /** 48 | * Test formatting individual table names 49 | */ 50 | @Test 51 | public void testFormatColumnNames() { 52 | SchemaFormatterConfig config = new SchemaFormatterConfig() 53 | .setMaxColumnNameLength(25) 54 | .setReservedNames(new HashSet<>(Collections.singletonList("reserved_name"))); 55 | 56 | SchemaFormatter formatter = new SchemaFormatter(config); 57 | 58 | testFormatNames(formatter::getPropertyNames, config.getMaxColumnNameLength()); 59 | } 60 | 61 | /** 62 | * Test formatting individual table names 63 | */ 64 | @Test 65 | public void testFormatRelationNames() { 66 | SchemaFormatterConfig config = new SchemaFormatterConfig() 67 | .setMaxTableNameLength(20) 68 | .setReservedNames(new HashSet<>(Collections.singletonList("reserved_name"))); 69 | 70 | SchemaFormatter formatter = new SchemaFormatter(config); 71 | 72 | testFormatNames((uris, labels) -> formatter.getRelationNames(uris, labels, new HashSet<>()), config.getMaxTableNameLength()); 73 | } 74 | 75 | public void testFormatNames(BiFunction, Map, Map> format, int maxLength) { 76 | 77 | List nameList = Arrays.asList( 78 | "http://example.com/d/folder#name", 79 | "http://example.com/c/name#", 
80 | "http://example.com/b/--name--", 81 | "http://example.com/a/name/+*,-%@", 82 | "name", 83 | "http://example.com/second" 84 | ); 85 | 86 | 87 | Map names = format.apply(nameList, new HashMap<>()); 88 | assertTrue("All names are unique", names.values().size() == new HashSet<>(names.values()).size()); 89 | assertTrue("Alphabetically first URI has no suffix", names.get("http://example.com/a/name/+*,-%@").equals("name")); 90 | assertTrue("Alphabetically second URI has _2 suffix", names.get("http://example.com/b/--name--").equals("name_2")); 91 | assertTrue("Alphabetically third URI has _3 suffix", names.get("http://example.com/c/name#").equals("name_3")); 92 | assertTrue("Alphabetically fourth URI has _4 suffix", names.get("http://example.com/d/folder#name").equals("name_4")); 93 | assertTrue("No suffix is added for already unique name", names.get("http://example.com/second").equals("second")); 94 | 95 | assertNotEquals("Formatted name of empty URI is not empty", "", formatSingleName(format, "")); 96 | 97 | assertNotEquals("Reserved name is not used.", "reserved_name", formatSingleName(format, "http://example.com/reserved_name")); 98 | 99 | assertEquals("two_words", formatSingleName(format, "http://example.com/page#two-words")); 100 | assertEquals("two_words", formatSingleName(format, "http://example.com/page#two---words")); 101 | assertEquals("two_words", formatSingleName(format, "http://example.com/page#--two---words--")); 102 | 103 | assertEquals("an_encoded_name", formatSingleName(format, "http://example.com/table?name=an%20encoded%20name")); 104 | 105 | assertEquals("Long name has max length", maxLength, formatSingleName(format, "http://example.com/a_very_very_long_uri_suffix_name").length()); 106 | 107 | } 108 | 109 | private String formatSingleName(BiFunction, Map, Map> format, String uri) { 110 | return format.apply(Collections.singletonList(uri), new HashMap<>()).get(uri); 111 | } 112 | 113 | } 114 | 115 | 116 | 117 | 118 | 119 | 120 | 121 | 122 | 123 | 
124 | -------------------------------------------------------------------------------- /src/test/java/com/merck/rdf2x/processing/relations/RelationExtractorTest.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2017 Merck Sharp & Dohme Corp. a subsidiary of Merck & Co., 3 | * Inc., Kenilworth, NJ, USA. 4 | * 5 | * Licensed under the Apache License, Version 2.0 (the "License"); 6 | * you may not use this file except in compliance with the License. 7 | * You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package com.merck.rdf2x.processing.relations; 19 | 20 | import com.merck.rdf2x.beans.Instance; 21 | import com.merck.rdf2x.beans.RelationPredicate; 22 | import com.merck.rdf2x.processing.schema.RelationConfig; 23 | import com.merck.rdf2x.rdf.schema.ClassGraph; 24 | import com.merck.rdf2x.test.TestSparkContextProvider; 25 | import lombok.extern.slf4j.Slf4j; 26 | import org.apache.spark.api.java.JavaRDD; 27 | import org.apache.spark.sql.DataFrame; 28 | import org.apache.spark.sql.Row; 29 | import org.apache.spark.sql.RowFactory; 30 | import org.apache.spark.sql.SQLContext; 31 | import org.apache.spark.sql.types.DataTypes; 32 | import org.apache.spark.sql.types.StructType; 33 | import org.junit.Test; 34 | 35 | import java.util.ArrayList; 36 | import java.util.List; 37 | 38 | import static org.junit.Assert.assertEquals; 39 | 40 | /** 41 | * Test of {@link RelationExtractor} 42 | */ 43 | @Slf4j 44 | public class RelationExtractorTest extends TestSparkContextProvider { 45 | /** 46 | * Test if expected directed 
relations are collected from a RDD of Instances 47 | */ 48 | @Test 49 | public void testCollectRelations() { 50 | SQLContext sql = new SQLContext(jsc()); 51 | 52 | RelationExtractor collector = new RelationExtractor( 53 | new RelationConfig(), 54 | jsc(), 55 | new ClassGraph() 56 | ); 57 | 58 | List rdd = new ArrayList<>(); 59 | 60 | // cycle one -> two -> three -> one 61 | rdd.add(RowFactory.create(0, 1, 1L, 1, 2L)); 62 | rdd.add(RowFactory.create(0, 1, 2L, 1, 3L)); 63 | rdd.add(RowFactory.create(0, 1, 3L, 1, 1L)); 64 | 65 | // one -> four, four -> one 66 | rdd.add(RowFactory.create(0, 2, 4L, 1, 1L)); 67 | rdd.add(RowFactory.create(0, 1, 1L, 2, 4L)); 68 | 69 | // five -> one 70 | rdd.add(RowFactory.create(0, 3, 5L, 1, 1L)); 71 | 72 | DataFrame expected = sql.createDataFrame(rdd, new StructType() 73 | .add("predicateIndex", DataTypes.IntegerType, false) 74 | .add("fromTypeIndex", DataTypes.IntegerType, false) 75 | .add("fromID", DataTypes.LongType, false) 76 | .add("toTypeIndex", DataTypes.IntegerType, false) 77 | .add("toID", DataTypes.LongType, false) 78 | ); 79 | 80 | // (predicateIndex, fromTypeIndex, instanceID, toTypeIndex, relatedID) 81 | DataFrame result = collector.extractRelations(getTestRDD()); 82 | 83 | assertEquals("Expected relation row schema is collected", expected.schema(), result.schema()); 84 | assertRDDEquals("Expected relation rows are collected", expected.javaRDD(), result.javaRDD()); 85 | } 86 | 87 | private JavaRDD getTestRDD() { 88 | List rdd = new ArrayList<>(); 89 | 90 | Instance one = new Instance(); 91 | one.setType(1); 92 | one.setUri("http://example.com/a/one"); 93 | one.setId(1L); 94 | rdd.add(one); 95 | 96 | Instance two = new Instance(); 97 | two.setType(1); 98 | two.setUri("http://example.com/a/two"); 99 | two.setId(2L); 100 | rdd.add(two); 101 | 102 | Instance three = new Instance(); 103 | three.setType(1); 104 | three.setUri("http://example.com/a/three"); 105 | three.setId(3L); 106 | rdd.add(three); 107 | 108 | Instance four = 
new Instance(); 109 | four.setType(2); 110 | four.setUri("http://example.com/b/four"); 111 | four.setId(4L); 112 | rdd.add(four); 113 | 114 | Instance five = new Instance(); 115 | five.setType(3); 116 | five.setUri("http://example.com/c/five"); 117 | five.setId(5L); 118 | rdd.add(five); 119 | 120 | // cycle one -> two -> three -> one 121 | one.addRelation(new RelationPredicate(0, two.getUri())); 122 | two.addRelation(new RelationPredicate(0, three.getUri())); 123 | three.addRelation(new RelationPredicate(0, one.getUri())); 124 | 125 | // one -> four, four -> one 126 | one.addRelation(new RelationPredicate(0, four.getUri())); 127 | four.addRelation(new RelationPredicate(0, one.getUri())); 128 | 129 | // five -> one 130 | five.addRelation(new RelationPredicate(0, one.getUri())); 131 | 132 | return jsc().parallelize(rdd); 133 | } 134 | 135 | } 136 | -------------------------------------------------------------------------------- /src/test/java/com/merck/rdf2x/rdf/parsing/ElephasQuadParserTest.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2017 Merck Sharp & Dohme Corp. a subsidiary of Merck & Co., 3 | * Inc., Kenilworth, NJ, USA. 4 | * 5 | * Licensed under the Apache License, Version 2.0 (the "License"); 6 | * you may not use this file except in compliance with the License. 7 | * You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 
16 | */ 17 | 18 | package com.merck.rdf2x.rdf.parsing; 19 | 20 | import com.merck.rdf2x.test.TestSparkContextProvider; 21 | import com.merck.rdf2x.test.TestUtils; 22 | import lombok.extern.slf4j.Slf4j; 23 | import org.apache.jena.sparql.core.Quad; 24 | import org.apache.spark.api.java.JavaRDD; 25 | import org.junit.Test; 26 | 27 | import java.util.Arrays; 28 | import java.util.LinkedList; 29 | import java.util.List; 30 | 31 | /** 32 | * Test of {@link ElephasQuadParser} 33 | */ 34 | @Slf4j 35 | public class ElephasQuadParserTest extends TestSparkContextProvider { 36 | 37 | 38 | /** 39 | * Test if expected quads are parsed from N-Quads and Turtle format 40 | */ 41 | @Test 42 | public void testParseQuads() { 43 | QuadParser parser = new ElephasQuadParser( 44 | new QuadParserConfig() 45 | .setBatchSize(2), 46 | jsc() 47 | ); 48 | String[] datasetPaths = new String[]{"parserTest.nq", "parserTest.ttl"}; 49 | JavaRDD expected = getExpectedRDD(); 50 | 51 | for (String datasetPath : datasetPaths) { 52 | JavaRDD parsed = parser 53 | .parseQuads(TestUtils.getDatasetPath(datasetPath)) 54 | .map(ElephasQuadParserTest::quadToRawString); 55 | assertRDDEquals("Parsed file " + datasetPath + " equals expected RDD", expected, parsed); 56 | } 57 | } 58 | 59 | private JavaRDD getExpectedRDD() { 60 | List rdd = new LinkedList<>(); 61 | rdd.add("[http://t.com/flavors#Sweet, http://www.w3.org/1999/02/22-rdf-syntax-ns#type, http://t.com/types#Flavor]"); 62 | rdd.add("[http://t.com/flavors#Sour, http://www.w3.org/1999/02/22-rdf-syntax-ns#type, http://t.com/types#Flavor]"); 63 | rdd.add("[http://t.com/fruit#Apple, http://t.com/rel#Flavor, http://t.com/flavors#Sweet]"); 64 | rdd.add("[http://t.com/fruit#Apple, http://www.w3.org/1999/02/22-rdf-syntax-ns#type, http://t.com/types#Fruit]"); 65 | rdd.add("[http://t.com/fruit#Apple, http://t.com/rel#Weight, \"100\"^^http://www.w3.org/2001/XMLSchema#int]"); 66 | rdd.add("[http://t.com/fruit#Apple, http://t.com/rel#Weight, 
\"150\"^^http://www.w3.org/2001/XMLSchema#int]"); 67 | rdd.add("[http://t.com/fruit#Apple, http://t.com/rel#Weight, \"TBD\"]"); 68 | rdd.add("[http://t.com/fruit#Tomato, http://t.com/rel#Flavor, http://t.com/flavors#Sweet]"); 69 | rdd.add("[http://t.com/fruit#Tomato, http://t.com/rel#Flavor, http://t.com/flavors#Sour]"); 70 | rdd.add("[http://t.com/fruit#Tomato, http://t.com/rel#Weight, \"100\"^^http://www.w3.org/2001/XMLSchema#int]"); 71 | rdd.add("[http://t.com/fruit#Tomato, http://another.com/rel#Weight, \"110\"^^http://www.w3.org/2001/XMLSchema#int]"); 72 | rdd.add("[http://t.com/fruit#Tomato, http://www.w3.org/1999/02/22-rdf-syntax-ns#type, http://t.com/types#Fruit]"); 73 | rdd.add("[http://t.com/fruit#Tomato, http://www.w3.org/1999/02/22-rdf-syntax-ns#type, http://t.com/types#Vegetable]"); 74 | rdd.add("[http://t.com/fruit#Cucumber, http://www.w3.org/1999/02/22-rdf-syntax-ns#type, http://t.com/types#Vegetable]"); 75 | 76 | return jsc().parallelize(rdd); 77 | } 78 | 79 | private static String quadToRawString(Quad quad) { 80 | return Arrays.asList(new String[]{ 81 | quad.getSubject().getURI(), 82 | quad.getPredicate().getURI(), 83 | quad.getObject().toString() 84 | }).toString(); 85 | } 86 | } -------------------------------------------------------------------------------- /src/test/java/com/merck/rdf2x/test/TestSparkContextProvider.java: -------------------------------------------------------------------------------- 1 | package com.merck.rdf2x.test; 2 | 3 | import com.holdenkarau.spark.testing.JavaRDDComparisons; 4 | import com.holdenkarau.spark.testing.SharedJavaSparkContext; 5 | import com.merck.rdf2x.spark.SparkContextProvider; 6 | import lombok.extern.slf4j.Slf4j; 7 | import org.apache.spark.SparkConf; 8 | import org.apache.spark.api.java.JavaRDD; 9 | import scala.Option; 10 | import scala.Tuple3; 11 | 12 | /* 13 | * Copyright 2017 Merck Sharp & Dohme Corp. a subsidiary of Merck & Co., 14 | * Inc., Kenilworth, NJ, USA. 
15 | * 16 | * Licensed under the Apache License, Version 2.0 (the "License"); 17 | * you may not use this file except in compliance with the License. 18 | * You may obtain a copy of the License at 19 | * 20 | * http://www.apache.org/licenses/LICENSE-2.0 21 | * 22 | * Unless required by applicable law or agreed to in writing, software 23 | * distributed under the License is distributed on an "AS IS" BASIS, 24 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 25 | * See the License for the specific language governing permissions and 26 | * limitations under the License. 27 | */ 28 | @Slf4j 29 | public class TestSparkContextProvider extends SharedJavaSparkContext { 30 | 31 | @Override 32 | public SparkConf conf() { 33 | SparkConf conf = super.conf(); 34 | conf.set("spark.serializer", "org.apache.spark.serializer.KryoSerializer"); 35 | conf.registerKryoClasses(SparkContextProvider.getSerializableClasses()); 36 | return conf; 37 | } 38 | 39 | @Override 40 | public void runBefore() { 41 | super.runBefore(); 42 | jsc().setLogLevel("WARN"); 43 | } 44 | 45 | public void assertRDDEquals(String message, JavaRDD expected, JavaRDD result) { 46 | Option> diff = JavaRDDComparisons.compareRDD(expected, result); 47 | if (diff.isDefined()) { 48 | log.error("EXPECTED"); 49 | expected.foreach(row -> log.error(row.toString())); 50 | log.error("RESULT"); 51 | result.foreach(row -> log.error(row.toString())); 52 | log.error("FIRST DIFF"); 53 | Tuple3 diffTriple = diff.get(); 54 | log.error(diffTriple.toString()); 55 | if (diffTriple._2() == 0) { 56 | log.error("(row not expected but present in result {} times)", diffTriple._3()); 57 | } 58 | if (diffTriple._3() == 0) { 59 | log.error("(row expected {} times but not present)", diffTriple._2()); 60 | } 61 | throw new AssertionError(message); 62 | } 63 | } 64 | 65 | public void assertRDDEquals(JavaRDD expected, JavaRDD result) { 66 | assertRDDEquals("Datasets are equal.", expected, result); 67 | } 68 | 69 | public 
void assertRDDToStringEquals(String message, JavaRDD expected, JavaRDD result) { 70 | JavaRDD expectedToString = expected.map(Object::toString); 71 | JavaRDD resultToString = result.map(Object::toString); 72 | assertRDDEquals(message, expectedToString, resultToString); 73 | } 74 | 75 | public void assertRDDToStringEquals(JavaRDD expected, JavaRDD result) { 76 | assertRDDToStringEquals("Datasets are equal.", expected, result); 77 | } 78 | } 79 | -------------------------------------------------------------------------------- /src/test/java/com/merck/rdf2x/test/TestUtils.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2017 Merck Sharp & Dohme Corp. a subsidiary of Merck & Co., 3 | * Inc., Kenilworth, NJ, USA. 4 | * 5 | * Licensed under the Apache License, Version 2.0 (the "License"); 6 | * you may not use this file except in compliance with the License. 7 | * You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 
16 | */ 17 | 18 | package com.merck.rdf2x.test; 19 | 20 | import com.merck.rdf2x.rdf.parsing.ElephasQuadParser; 21 | import com.merck.rdf2x.rdf.parsing.QuadParser; 22 | import com.merck.rdf2x.rdf.parsing.QuadParserConfig; 23 | import lombok.extern.slf4j.Slf4j; 24 | import org.apache.jena.sparql.core.Quad; 25 | import org.apache.spark.api.java.JavaRDD; 26 | import org.apache.spark.api.java.JavaSparkContext; 27 | 28 | import java.net.URL; 29 | import java.util.Collection; 30 | import java.util.Collections; 31 | import java.util.HashSet; 32 | import java.util.stream.Collectors; 33 | 34 | import static org.junit.Assert.assertTrue; 35 | 36 | /** 37 | * TestUtils stores various methods used for testing 38 | */ 39 | @Slf4j 40 | public class TestUtils { 41 | /** 42 | * Assert all names in a collection are unique 43 | * 44 | * @param message assert message 45 | * @param names collection of names to test against 46 | */ 47 | public static void assertUniqueNames(String message, Collection names) { 48 | boolean isUnique = new HashSet<>(names).size() == names.size(); 49 | if (!isUnique) { 50 | log.error("Not unique names:"); 51 | log.error(names.stream().filter(name -> Collections.frequency(names, name) > 1).collect(Collectors.toList()).toString()); 52 | } 53 | assertTrue(message, isUnique); 54 | } 55 | 56 | /** 57 | * Get path of a dataset in the test resources folder 58 | */ 59 | public static String getDatasetPath(String datasetPath) { 60 | URL url = TestUtils.class.getClassLoader().getResource("test/datasets/" + datasetPath); 61 | return url.getPath(); 62 | } 63 | 64 | /** 65 | * Parse RDF file from resources folder 66 | * @param sc spark context to use for parsing 67 | * @param fileName name of the file to parse 68 | * @return RDD of quads from the requested file 69 | */ 70 | public static JavaRDD getQuadsRDD(JavaSparkContext sc, String fileName) { 71 | QuadParser parser = new ElephasQuadParser( 72 | new QuadParserConfig() 73 | .setBatchSize(2), 74 | sc 75 | ); 76 | String 
path = TestUtils.getDatasetPath(fileName); 77 | return parser.parseQuads(path); 78 | } 79 | } 80 | --------------------------------------------------------------------------------