└── sansa-datalake ├── .gitignore ├── .travis.yml ├── LICENSE ├── README.md ├── pom.xml ├── sansa-datalake-spark ├── .gitignore ├── pom.xml └── src │ └── main │ ├── java │ └── net │ │ └── sansa_stack │ │ └── datalake │ │ └── spark │ │ ├── NTtoDF.java │ │ └── model │ │ └── Triple.java │ ├── resources │ └── log4j.properties │ └── scala │ └── net │ └── sansa_stack │ └── datalake │ └── spark │ ├── Config.scala │ ├── Main.scala │ ├── Mapper.scala │ ├── Planner.scala │ ├── QueryAnalyser.scala │ ├── QueryExecutor.scala │ ├── Run.scala │ ├── SparkExecutor.scala │ └── utils │ └── Helpers.scala └── scalastyle-config.xml /sansa-datalake/.gitignore: -------------------------------------------------------------------------------- 1 | target/ 2 | pom.xml.tag 3 | pom.xml.releaseBackup 4 | pom.xml.versionsBackup 5 | pom.xml.next 6 | release.properties 7 | dependency-reduced-pom.xml 8 | buildNumber.properties 9 | .mvn/timing.properties 10 | # eclipse conf file 11 | .settings 12 | .classpath 13 | .project 14 | .manager 15 | .scala_dependencies 16 | .cashe 17 | .cache-main 18 | .cache-tests 19 | .classpath 20 | #.coveralls.yml 21 | deptree.txt 22 | # IntelliJ config 23 | *.iml 24 | .idea 25 | /bin 26 | 27 | # filename i use to store output of mvn dependency:tree ~Claus 28 | deptree.txt 29 | # local project specific tmp folder 30 | tmp 31 | 32 | scalastyle-output.xml 33 | -------------------------------------------------------------------------------- /sansa-datalake/.travis.yml: -------------------------------------------------------------------------------- 1 | language: scala 2 | sudo: false 3 | cache: 4 | directories: 5 | - $HOME/.m2 6 | scala: 7 | - 2.12.11 8 | jdk: 9 | - openjdk8 10 | script: 11 | - mvn scalastyle:check 12 | - mvn test 13 | -------------------------------------------------------------------------------- /sansa-datalake/LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 
34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. 
You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. 
You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright [yyyy] [name of copyright owner] 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 
202 | -------------------------------------------------------------------------------- /sansa-datalake/README.md: --------------------------------------------------------------------------------
1 | # DataLake
2 | [![Maven Central](https://maven-badges.herokuapp.com/maven-central/net.sansa-stack/sansa-datalake-parent_2.11/badge.svg)](https://maven-badges.herokuapp.com/maven-central/net.sansa-stack/sansa-datalake-parent_2.11)
3 | [![Build Status](https://ci.aksw.org/jenkins/job/SANSA-ML/job/develop/badge/icon)](https://ci.aksw.org/jenkins/job/SANSA-DataLake//job/master/)
4 | [![License](https://img.shields.io/badge/License-Apache%202.0-blue.svg)](https://opensource.org/licenses/Apache-2.0)
5 | [![Twitter](https://img.shields.io/twitter/follow/SANSA_Stack.svg?style=social)](https://twitter.com/SANSA_Stack)
6 |
7 | A library to query heterogeneous data sources uniformly using SPARQL.
8 |
9 | ## Description
10 | ### Data Lake
11 | The term Data Lake denotes a schema-less repository of data residing in its original format and form. As such, there is no single point of entry to the Data Lake, as the data, in its diversity, has various schemata, query interfaces and languages.
12 |
13 | ### _Semantic_ Data Lake
14 | The Semantic Data Lake is an effort to enable querying this wealth of heterogeneous data using Semantic Web principles: a mapping language and the SPARQL query language. This supplies the Data Lake with a schema and provides a single entry point, a SPARQL query, to the various heterogeneous data. To reach a data source, a connection to it needs to be established first.
15 |
16 | That said, to query the Data Lake using the _Semantic Data Lake_ approach, users need to provide three inputs: (1) a Mappings file, (2) a Config file, and (3) a SPARQL query; these are described in the next three sections.
17 |
18 | ### 1. Mapping Language and Data Lake Schema
19 | A virtual schema is added to the Data Lake by _mapping_ data elements, e.g., tables and attributes, to ontology concepts, e.g., classes and predicates. We use [RML](http://rml.io/) mappings to express those schema mapping links.
20 |
21 | An example of such mappings is given below. It maps a collection named _Offer_ (`rml:source "//Offer"`) in a MongoDB database to an ontology class _Offer_ (`rr:class schema:Offer`), meaning that every document in the Offer collection is of type `schema:Offer`. The mappings also link the MongoDB collection fields `validTo`, `publisher` and `producer` to the ontology predicates `bsbm:validTo`, `dc:publisher` and `bsbm:producer`, respectively. The `_id` field in the `rr:subjectMap rr:template "http://example.com/{_id}"` triple refers to the primary key of the MongoDB collection.
22 |
23 | ```
24 | <#OfferMapping>
25 |     rml:logicalSource [
26 |         rml:source "//Offer";
27 |         nosql:store nosql:Mongodb
28 |     ];
29 |     rr:subjectMap [
30 |         rr:template "http://example.com/{_id}";
31 |         rr:class schema:Offer
32 |     ];
33 |
34 |     rr:predicateObjectMap [
35 |         rr:predicate bsbm:validTo;
36 |         rr:objectMap [rml:reference "validTo"]
37 |     ];
38 |
39 |     rr:predicateObjectMap [
40 |         rr:predicate dc:publisher;
41 |         rr:objectMap [rml:reference "publisher"]
42 |     ];
43 |
44 |     rr:predicateObjectMap [
45 |         rr:predicate bsbm:producer;
46 |         rr:objectMap [rml:reference "producer"]
47 |     ];
48 | ```
49 |
50 | Note the presence of the triple `nosql:store nosql:Mongodb`; it is an addition to RML mappings, taken from the [NoSQL ontology](http://purl.org/db/nosql#), that allows stating what type of store is being mapped.
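For illustration, under the mapping above a hypothetical MongoDB document such as `{"_id": "Offer1", "validTo": "2008-05-30", "publisher": "Publisher1", "producer": "Producer5"}` (the values are invented) would be exposed virtually as the triples below. No RDF is materialized; the mappings only provide the virtual view that the query engine uses to translate SPARQL into queries against the original source.

```
<http://example.com/Offer1>  rdf:type       schema:Offer ;
                             bsbm:validTo   "2008-05-30" ;
                             dc:publisher   "Publisher1" ;
                             bsbm:producer  "Producer5" .
```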
51 |
52 | _The mappings file can either be created manually or using the following graphical utility: [Squerall-GUI](https://github.com/EIS-Bonn/Squerall-GUI)_.
53 |
54 | ### 2. Data Connection Configurations
55 | In order to connect to a data source, users need to provide a set of config parameters in JSON format. These differ from one data source to another; for example, for a MongoDB collection the config parameters could be: database host URL, database name, collection name, and replica set name.
56 |
57 | ```JSON
58 | {
59 |   "type": "mongodb",
60 |   "options": {
61 |     "url": "127.0.0.1",
62 |     "database": "bsbm",
63 |     "collection": "offer",
64 |     "options": "replicaSet=mongo-rs"
65 |   },
66 |   "source": "//Offer",
67 |   "entity": "Offer"
68 | }
69 | ```
70 |
71 | It is necessary to link the configured source (`"source": "//Offer"`) to the mapped source (`rml:logicalSource rml:source "//Offer"`, see the Mapping section above).
72 |
73 | _The config file can either be created manually or using the following graphical utility: [Squerall-GUI](https://github.com/EIS-Bonn/Squerall-GUI)_.
74 |
75 | ### 3. SPARQL Query Interface
76 | SPARQL queries are expressed using the ontology terms the data was previously mapped to. A query should conform to the currently supported SPARQL fragment:
77 |
78 | ```SPARQL
79 | Query       := Prefix* SELECT Distinguish WHERE { Clauses } Modifiers?
80 | Prefix      := PREFIX "string:" IRI
81 | Distinguish := DISTINCT? ("*" | (Var|Aggregate)+)
82 | Aggregate   := (AggOpe(Var) AS Var)
83 | AggOpe      := SUM | MIN | MAX | AVG | COUNT
84 | Clauses     := TP* Filter?
85 | Filter      := FILTER (Var FiltOpe Literal)
86 |              | FILTER regex(Var, "%string%")
87 | FiltOpe     := = | != | < | <= | > | >=
88 | TP          := Var IRI Var . | Var rdf:type IRI .
89 | Var         := "?string"
90 | Modifiers   := (LIMIT k)? (ORDER BY (ASC|DESC)? Var)? (GROUP BY Var+)?
91 | ```
92 |
93 | ### File Storage Format
94 | The previous three files can be stored either locally, in HDFS, or in an AWS S3 bucket. For the latter, make sure to have your credentials ([see the AWS documentation](https://docs.aws.amazon.com/sdk-for-java/v1/developer-guide/setup-credentials.html)) stored in `~/.aws/credentials` (`C:\Users\USERNAME\.aws\credentials` on Windows), in the following form:
95 | ```
96 | [default]
97 | aws_access_key_id=...
98 | aws_secret_access_key=...
99 | ```
100 |
101 | ## Usage
102 | The usage of the Semantic Data Lake is documented under the respective SANSA-Query [datalake component](https://github.com/SANSA-Stack/SANSA-Query/tree/develop/sansa-query-spark/src/main/scala/net/sansa_stack/query/spark/datalake). An illustrative query and invocation sketch is also given at the end of this README.
103 |
104 | ## How to Contribute
105 | We always welcome new contributors to the project! Please see [our contribution guide](http://sansa-stack.net/contributing-to-sansa/) for more details on how to get started contributing to SANSA.
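## Example

To illustrate the three inputs together, the following query conforms to the fragment above and is expressed against the _Offer_ mapping from section 1. The prefix IRIs are assumptions made for this example; adapt them to the vocabularies used in your own mappings.

```SPARQL
PREFIX rdf:    <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
PREFIX schema: <http://schema.org/>
PREFIX dc:     <http://purl.org/dc/elements/1.1/>
PREFIX bsbm:   <http://www4.wiwiss.fu-berlin.de/bizer/bsbm/v01/vocabulary/>

SELECT DISTINCT ?offer ?publisher ?validTo
WHERE {
  ?offer rdf:type schema:Offer .
  ?offer dc:publisher ?publisher .
  ?offer bsbm:validTo ?validTo .
  FILTER (?publisher = "Publisher1")
}
LIMIT 10
```

Assuming such a query is stored in `query.sparql`, a minimal programmatic invocation mirroring the bundled `Main` object could look as follows. This is only a sketch: the object name, file paths and Spark master URL are placeholders, and the SANSA-Query documentation linked in the Usage section describes the supported entry points.

```scala
import org.apache.spark.sql.{DataFrame, SparkSession}
import net.sansa_stack.datalake.spark.{Run, SparkExecutor}

object DataLakeExample {
  def main(args: Array[String]): Unit = {
    // Placeholder paths: point these to your own query, mappings and config files
    val queryFile    = "query.sparql"
    val mappingsFile = "mappings.ttl"
    val configFile   = "config"

    val spark = SparkSession.builder
      .master("local[*]")                 // or a Spark cluster master URL
      .appName("SANSA-DataLake")
      .getOrCreate()

    // SparkExecutor translates the analysed, mapped and planned query
    // into Spark DataFrame operations over the connected sources
    val executor = new SparkExecutor(spark, mappingsFile)
    val run = new Run[DataFrame](executor)
    run.application(queryFile, mappingsFile, configFile)

    spark.stop()
  }
}
```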
106 | -------------------------------------------------------------------------------- /sansa-datalake/pom.xml: -------------------------------------------------------------------------------- 1 | 2 | 5 | 4.0.0 6 | 7 | sansa-datalake-parent_2.12 8 | 9 | 10 | net.sansa-stack 11 | sansa-parent_2.12 12 | 0.7.2-SNAPSHOT 13 | 14 | 15 | 16 | pom 17 | 18 | SANSA Stack - DataLake Layer - Parent 19 | A library to query heterogeneous data sources uniformly using SPARQL 20 | https://github.com/SANSA-Stack/SANSA-DataLake 21 | 2015 22 | 23 | 24 | Smart Data Analytics (SDA) research group 25 | http://sda.tech 26 | 27 | 28 | 29 | https://github.com/SANSA-Stack/SANSA-DataLake 30 | scm:git:git://github.com/SANSA-Stack/SANSA-DataLake.git 31 | scm:git:git@github.com:SANSA-Stack/SANSA-DataLake.git 32 | HEAD 33 | 34 | 35 | 36 | https://github.com/SANSA-Stack/SANSA-DataLake/issues 37 | GitHub 38 | 39 | 40 | 41 | 42 | Apache License 2.0 43 | http://www.apache.org/licenses/LICENSE-2.0.html 44 | repo 45 | 46 | 47 | 48 | 49 | 50 | Mohamed Nadjib MAMI 51 | https://github.com/mnmami 52 | SDA 53 | http://sda.tech 54 | 55 | contributor 56 | 57 | 58 | 59 | 60 | 61 | 62 | 63 | maven-project-info-reports-plugin 64 | 2.9 65 | 66 | 67 | net.alchim31.maven 68 | scala-maven-plugin 69 | 70 | 71 | 72 | 73 | 74 | 75 | 76 | 77 | org.apache.spark 78 | spark-hive_${scala.binary.version} 79 | ${spark.version} 80 | test 81 | 82 | 83 | 84 | de.javakaffee 85 | kryo-serializers 86 | 87 | 88 | 89 | io.gatling 90 | jsonpath_${scala.binary.version} 91 | 0.6.10 92 | 93 | 94 | 95 | com.typesafe.play 96 | play_${scala.binary.version} 97 | 2.6.2 98 | 99 | 100 | 101 | 102 | com.datastax.spark 103 | spark-cassandra-connector_${scala.binary.version} 104 | 2.4.2 105 | 106 | 107 | 108 | org.mongodb.spark 109 | mongo-spark-connector_${scala.binary.version} 110 | 2.4.0 111 | 112 | 113 | 114 | com.couchbase.client 115 | spark-connector_${scala.binary.version} 116 | 2.4.0 117 | 118 | 119 | 120 | 125 | 126 | 127 | mysql 128 | mysql-connector-java 129 | 8.0.16 130 | 131 | 132 | 133 | 134 | com.amazonaws 135 | aws-java-sdk-s3 136 | 1.11.791 137 | 138 | 139 | 140 | 141 | 142 | 143 | org.scalatest 144 | scalatest_${scala.binary.version} 145 | 146 | 147 | 148 | 149 | 150 | 151 | org.apache.maven.plugins 152 | maven-compiler-plugin 153 | 154 | 155 | 156 | org.apache.maven.plugins 157 | maven-surefire-plugin 158 | 159 | 160 | 161 | org.apache.maven.plugins 162 | maven-source-plugin 163 | 164 | 165 | 166 | org.apache.maven.plugins 167 | maven-javadoc-plugin 168 | 169 | 170 | 171 | net.alchim31.maven 172 | scala-maven-plugin 173 | 174 | 175 | 176 | org.apache.maven.plugins 177 | maven-site-plugin 178 | 179 | 180 | 181 | com.amashchenko.maven.plugin 182 | gitflow-maven-plugin 183 | 184 | 185 | 186 | org.scalatest 187 | scalatest-maven-plugin 188 | 189 | ${project.build.directory}/surefire-reports 190 | . 
191 | SANSA-DataLake-Tests.txt 192 | 193 | 194 | 195 | test 196 | 197 | test 198 | 199 | 200 | 201 | 202 | 203 | 204 | 205 | 206 | 207 | 208 | maven.aksw.internal 209 | AKSW Release Repository 210 | http://maven.aksw.org/archiva/repository/internal 211 | 212 | 213 | maven.aksw.snapshots 214 | AKSW Snapshot Repository 215 | http://maven.aksw.org/archiva/repository/snapshots 216 | 217 | 218 | 219 | 220 | 221 | root-dir 222 | 223 | 224 | ${project.basedir}/../../scalastyle-config.xml 225 | 226 | 227 | 228 | ${project.basedir}/../scalastyle-config.xml 229 | 230 | 231 | 232 | doclint-java8-disable 233 | 234 | [1.8,) 235 | 236 | 237 | 238 | 239 | 240 | org.apache.maven.plugins 241 | maven-javadoc-plugin 242 | 243 | 244 | attach-javadocs 245 | 246 | jar 247 | 248 | 249 | false 250 | 251 | 252 | 253 | 254 | none 255 | 256 | 257 | 258 | 259 | 260 | 261 | 262 | release 263 | 264 | 265 | performRelease 266 | true 267 | 268 | 269 | 270 | 271 | 272 | 273 | 274 | org.apache.maven.plugins 275 | maven-gpg-plugin 276 | 277 | 278 | 279 | org.sonatype.plugins 280 | nexus-staging-maven-plugin 281 | 282 | 283 | 284 | 285 | 286 | 287 | 288 | 289 | 290 | oss-sonatype 291 | oss-sonatype 292 | https://oss.sonatype.org/content/repositories/snapshots/ 293 | 294 | true 295 | 296 | 297 | 298 | apache-snapshot 299 | Apache repository (snapshots) 300 | https://repository.apache.org/content/repositories/snapshots/ 301 | 302 | true 303 | 304 | 305 | 306 | maven.aksw.internal 307 | AKSW Release Repository 308 | http://maven.aksw.org/archiva/repository/internal 309 | 310 | true 311 | 312 | 313 | false 314 | 315 | 316 | 317 | maven.aksw.snapshots 318 | AKSW Snapshot Repository 319 | http://maven.aksw.org/archiva/repository/snapshots 320 | 321 | false 322 | 323 | 324 | true 325 | 326 | 327 | 328 | 329 | 330 | sansa-datalake-spark 331 | 332 | 333 | -------------------------------------------------------------------------------- /sansa-datalake/sansa-datalake-spark/.gitignore: -------------------------------------------------------------------------------- 1 | /target/ 2 | -------------------------------------------------------------------------------- /sansa-datalake/sansa-datalake-spark/pom.xml: -------------------------------------------------------------------------------- 1 | 3 | 4.0.0 4 | 5 | sansa-datalake-spark_2.12 6 | 7 | 8 | net.sansa-stack 9 | sansa-datalake-parent_2.12 10 | 0.7.2-SNAPSHOT 11 | 12 | 13 | 14 | 15 | org.apache.spark 16 | spark-core_${scala.binary.version} 17 | 18 | 19 | 20 | org.apache.spark 21 | spark-sql_${scala.binary.version} 22 | 23 | 24 | 25 | org.apache.spark 26 | spark-hive_${scala.binary.version} 27 | compile 28 | 29 | 30 | 31 | 32 | org.scala-lang 33 | scala-library 34 | 35 | 36 | 37 | 38 | org.apache.jena 39 | jena-core 40 | 41 | 42 | 43 | org.apache.jena 44 | jena-arq 45 | 46 | 47 | 48 | io.gatling 49 | jsonpath_${scala.binary.version} 50 | 51 | 52 | 53 | com.typesafe.play 54 | play_${scala.binary.version} 55 | 56 | 57 | 58 | 59 | com.datastax.spark 60 | spark-cassandra-connector_${scala.binary.version} 61 | 62 | 63 | 64 | org.mongodb.spark 65 | mongo-spark-connector_${scala.binary.version} 66 | 67 | 68 | 69 | com.couchbase.client 70 | spark-connector_${scala.binary.version} 71 | 72 | 73 | 74 | 78 | 79 | 80 | mysql 81 | mysql-connector-java 82 | 83 | 84 | 85 | com.amazonaws 86 | aws-java-sdk-s3 87 | 88 | 89 | 90 | 91 | com.typesafe.scala-logging 92 | scala-logging_${scala.binary.version} 93 | 94 | 95 | 96 | ch.qos.logback 97 | logback-classic 98 | 1.2.3 99 | test 100 | 101 | 102 | 103 | 
104 | 105 | 106 | org.scalastyle 107 | scalastyle-maven-plugin 108 | 109 | 110 | net.alchim31.maven 111 | scala-maven-plugin 112 | 113 | 114 | 115 | 116 | -------------------------------------------------------------------------------- /sansa-datalake/sansa-datalake-spark/src/main/java/net/sansa_stack/datalake/spark/NTtoDF.java: -------------------------------------------------------------------------------- 1 | /** 2 | * Created by mmami on 10.10.16. 3 | */ 4 | package net.sansa_stack.datalake.spark; 5 | 6 | import org.apache.spark.sql.*; 7 | import org.apache.spark.api.java.JavaPairRDD; 8 | import org.apache.spark.api.java.JavaRDD; 9 | import org.apache.spark.api.java.function.FlatMapFunction; 10 | import org.apache.spark.api.java.function.Function; 11 | import org.apache.spark.api.java.function.PairFunction; 12 | import org.apache.spark.sql.types.DataTypes; 13 | import org.apache.spark.sql.types.StructField; 14 | import org.apache.spark.sql.types.StructType; 15 | import net.sansa_stack.datalake.spark.model.Triple; 16 | import scala.Tuple2; 17 | 18 | import java.sql.Timestamp; 19 | import java.util.*; 20 | import java.util.regex.Matcher; 21 | import java.util.regex.Pattern; 22 | import java.io.Serializable; 23 | 24 | public class NTtoDF implements Serializable { 25 | 26 | private String className; 27 | 28 | public NTtoDF() { } 29 | 30 | public NTtoDF options(Map options) { 31 | className = options.get("class"); 32 | 33 | return this; 34 | } 35 | 36 | //@SuppressWarnings("unchecked") 37 | public Dataset read(String input_path, SparkSession spark) { 38 | 39 | try { 40 | 41 | 42 | // 1. Read text file 43 | JavaRDD lines = spark.read().textFile(input_path).toJavaRDD(); 44 | //JavaRDD lines = spark.read().textFile(input_path); 45 | 46 | // 2. Map lines to Triple objects 47 | JavaRDD triples = lines.map((Function) line -> { 48 | 49 | //String[] parts = line.split(" "); 50 | 51 | List parts = new ArrayList<>(); 52 | Matcher m = Pattern.compile("([^\"]\\S*|\".+?\")\\s*").matcher(line); 53 | while (m.find()) 54 | parts.add(m.group(1)); 55 | 56 | Triple triple; 57 | 58 | if (parts.get(1).equals("")) 59 | triple = new Triple(replaceInValue(removeTagSymbol(parts.get(0))), null, replaceInValue(removeTagSymbol(parts.get(2)))); 60 | else { 61 | String subject = replaceInValue(removeTagSymbol(parts.get(0))); // MEASURE removeTagSymbol() time 62 | String property = replaceInColumn(removeTagSymbol(parts.get(1))); 63 | String object = replaceInValue(removeTagSymbol(parts.get(2))); 64 | String type = replaceInValue(removeTagSymbol(parts.get(3))); // Either there is a type (xslt) or not (.) 65 | 66 | String objectAndType = (parts.size() == 5) ? (object + type) : object; 67 | objectAndType = reverse(objectAndType); 68 | 69 | triple = new Triple(subject, property, objectAndType); 70 | } 71 | 72 | return triple; 73 | }); 74 | 75 | 76 | // 3. Map Triple objects to pairs (Triple.subject,[Triple.property, Triple.object]) 77 | //@SuppressWarnings({ "rawtypes" }) 78 | JavaPairRDD> subject_property = triples.mapToPair(( 79 | PairFunction>) trpl -> 80 | new Tuple2(trpl.getSubject(), new Tuple2(trpl.getProperty(), trpl.getObject())) 81 | ); 82 | 83 | // 4. Group pairs by subject => s,(p,o)[] 84 | JavaPairRDD>> groupBySubject = subject_property.groupByKey(); 85 | 86 | // 5. 
Map to pairs (Type,(s,(p,o)[])) 87 | //@SuppressWarnings({ "serial" }) 88 | JavaPairRDD>>> type_s_po = groupBySubject.mapToPair(( 89 | PairFunction>>, String, Tuple2>>>) list -> { 90 | 91 | List> p_o = new ArrayList<>(); 92 | List types = new ArrayList<>(); 93 | String property; 94 | String object; 95 | Tuple2 tt; 96 | Tuple2 t2; 97 | 98 | String subject = list._1(); 99 | for (Tuple2 stringStringTuple2 : list._2()) { 100 | tt = stringStringTuple2; 101 | property = tt._1(); 102 | object = tt._2(); 103 | if (property == null) { 104 | p_o.add(new Tuple2<>("type_" + object, "1")); 105 | types.add(object); 106 | } else { 107 | // Form Tuple2(P,O) 108 | t2 = new Tuple2<>(property, object); 109 | p_o.add(t2); 110 | } 111 | } 112 | 113 | Collections.sort(types); // order types lexicographically then select the last one => similar instances end up in same table 114 | 115 | //String chosen_type = lastType; // The last type is generally the most specific, but this is definitely not a rule. 116 | String chosen_type = types.get(types.size()-1); 117 | 118 | // We might use a hierarchy of classes from the schema if provided in future 119 | p_o.remove(new Tuple2("type_" + chosen_type, "1")); 120 | 121 | Tuple2 s_po = new Tuple2(subject, p_o); 122 | return new Tuple2>>>(chosen_type, s_po); 123 | }); 124 | 125 | // 6. Group by type => (type, It(s, It(p, o))) 126 | JavaPairRDD>>>> groupByType = type_s_po.groupByKey(); 127 | 128 | // 7. Get all the types 129 | //groupByType: >>>> 130 | // THIS CAN BE SUB-OPTIMAL WITH LARGE DATA. 131 | List keys = groupByType.keys().distinct().collect(); 132 | 133 | System.out.println("Types found: " + keys); 134 | // 8. Iterate through all types 135 | //int t = 0; 136 | //for (String key : keys) { 137 | //t++; 138 | //if (t < 20) { // To remove later 139 | //if(key.contains("HistoricTower")){ 140 | 141 | // 8.1 Get RDD of the type 142 | //@SuppressWarnings("unused") 143 | JavaRDD>>> rddByKey = getRddByKey(groupByType, className); 144 | 145 | // 8.2 Map the type RDD => Return type columns 146 | //JavaRDD> cols = rddByKey.map(i -> { 147 | JavaRDD cols = rddByKey.flatMap((FlatMapFunction>>, String>) i -> { 148 | LinkedHashMap po = new LinkedHashMap<>(); // a hashamp (that keeps order) to store all type's columns 149 | 150 | // 8.2.1 Iterate through all (p,o) and collect the columns (update incrementally the hashmap) 151 | 152 | for (Tuple2 temp : i._2) { 153 | String property = temp._1(); 154 | String object = reverse(temp._2()); 155 | 156 | if (object.contains("XMLSchema#double")) { 157 | if (!po.containsKey(property + "--TD") && !po.containsKey(property + "--TAD")) 158 | property = property + "--TD"; 159 | else if (!po.containsKey(property + "--TAD")) { 160 | po.remove(property + "--TD"); 161 | property = property + "--TAD"; 162 | } 163 | 164 | } else if (object.contains("XMLSchema#int")) { 165 | property = property + "--TI"; 166 | 167 | if (po.containsKey(property)) 168 | property = property.replace("--TI", "--TAI"); 169 | 170 | } else if (object.contains("XMLSchema#boolean")) { 171 | property = property + "--TB"; 172 | } else if (object.contains("XMLSchema#dateTime")) { 173 | property = property + "--TTS"; 174 | } 175 | 176 | if (po.containsKey(property) && !po.containsKey(property + "**")) { 177 | po.remove(property); 178 | property = property + "**"; 179 | //System.out.println("Property: " + property); 180 | } else if (po.containsKey(property + "**")) { 181 | property = property + "**"; 182 | } 183 | 184 | po.put(property, ""); // CAUTION: overwriting previous columns 185 
| } 186 | 187 | // 8.2.2 At last, add the id column 188 | po.put("id", ""); 189 | 190 | return po.keySet().iterator(); 191 | //return (Iterator) po.keySet(); 192 | }); 193 | 194 | // 8.- Vars 195 | LinkedHashMap type_columns = new LinkedHashMap(); // a hashamp (that keeps order) to store all type's columns 196 | String col; 197 | 198 | // 8.3 Read columns and construct a hashmap 199 | final List readColumns = cols.distinct().collect(); 200 | 201 | for (String j : readColumns) type_columns.put(j,""); // Overwrite original columns (collect() may return columns in different order than collected firstly) 202 | 203 | // 8.4 Generate the Parquet table schema from the collected columns 204 | List table_columns = new ArrayList<>(); 205 | HashMap toSaveToDB = new HashMap<>(); 206 | 207 | 208 | for (String s : readColumns) { 209 | if(s.contains("--TD")) { 210 | if(!readColumns.contains(s.split("--")[0] + "--TAD")) { 211 | col = s.split("--")[0]; 212 | table_columns.add(DataTypes.createStructField(col, DataTypes.DoubleType, true)); 213 | //toSaveToDB.put(col, "double"); 214 | } 215 | } else if(s.contains("--TI")) { 216 | col = s.split("--")[0]; 217 | table_columns.add(DataTypes.createStructField(col, DataTypes.IntegerType, true)); 218 | //toSaveToDB.put(col, "int"); 219 | } else if(s.contains("--TB")) { 220 | col = s.split("--")[0]; 221 | table_columns.add(DataTypes.createStructField(col, DataTypes.BooleanType, true)); 222 | //toSaveToDB.put(col, "boolean"); 223 | } else if(s.contains("--TTS")) { 224 | col = s.split("--")[0]; 225 | table_columns.add(DataTypes.createStructField(col, DataTypes.TimestampType, true)); 226 | //toSaveToDB.put(col, "timeDate"); 227 | } else if(s.contains("--TAD")) { 228 | col = s.split("--")[0]; 229 | table_columns.add(DataTypes.createStructField(col, DataTypes.createArrayType(DataTypes.DoubleType, true), true)); 230 | //toSaveToDB.put(col, "arrayDouble"); 231 | } else if(s.contains("--TAI")) { 232 | col = s.split("--")[0]; 233 | table_columns.add(DataTypes.createStructField(col, DataTypes.createArrayType(DataTypes.IntegerType, true), true)); 234 | //toSaveToDB.put(col, "arrayInt"); 235 | } else if(s.contains("**")) { 236 | col = s.replace("**", ""); 237 | table_columns.add(DataTypes.createStructField(col, DataTypes.createArrayType(DataTypes.StringType, true), true)); 238 | } else { 239 | table_columns.add(DataTypes.createStructField(s, DataTypes.StringType, true)); 240 | //toSaveToDB.put(s, "string"); 241 | } 242 | } 243 | 244 | // 8.5 Save columns to database 245 | //saveToMongoDB(replaceInType(key), toSaveToDB, dsName, dsIRI); 246 | 247 | StructType schema = DataTypes.createStructType(table_columns); 248 | 249 | // 8.6. 
Map RDD of (subject, Iter(property, object)) to an RDD of Row 250 | JavaRDD returnValues = rddByKey.map((Function>>, Row>) i -> { 251 | 252 | Row values_list; 253 | LinkedHashMap po = new LinkedHashMap<>(); 254 | 255 | // 8.6.1 Initialize the hashmap values with null (they're previously initialized with a String "", so if a certain value is an int => a cast error) 256 | for (String j : readColumns) { // TO INHENCE 257 | if(j.contains("--TI")) 258 | po.put(j.replace("--TI", ""),null); 259 | else if(j.contains("--TD") && !readColumns.contains(j + "--TAD")) 260 | po.put(j.replace("--TD", ""),null); 261 | else if(j.contains("--TB")) 262 | po.put(j.replace("--TB", ""),null); 263 | else if(j.contains("--TTS")) 264 | po.put(j.replace("--TTS", ""),null); 265 | else if(j.contains("--TAI")) 266 | po.put(j.replace("--TAI", ""),null); 267 | else if(j.contains("--TAD")) 268 | po.put(j.replace("--TAD", ""),null); 269 | else if(j.contains("**")) 270 | po.put(j.replace("**", ""),null); 271 | else 272 | po.put(j,null); 273 | } 274 | 275 | // 8.6.2 Iterate through all the (property, object) pairs to save data in the collected columns 276 | String subject = i._1; 277 | 278 | for(Tuple2 temp : i._2) { 279 | String property = temp._1(); 280 | String object = reverse(temp._2()); 281 | Object newobject = null; 282 | 283 | if (readColumns.contains(property + "--TD") && !readColumns.contains(property + "--TAD")) { 284 | newobject = Double.parseDouble(object.replace("^^www.w3.org/2001/XMLSchema#double", "").replace("\"", "")); 285 | po.put(property, newobject); 286 | } else if (readColumns.contains(property + "--TI")) { 287 | newobject = Integer.parseInt(object.replace("^^www.w3.org/2001/XMLSchema#integer", "").replace("^^www.w3.org/2001/XMLSchema#int", "").replace("\"", "")); 288 | po.put(property, newobject); 289 | } else if (readColumns.contains(property + "--TB")) { 290 | newobject = Boolean.parseBoolean(object.replace("^^www.w3.org/2001/XMLSchema#boolean", "").replace("\"", "")); 291 | po.put(property, newobject); 292 | } else if (readColumns.contains(property + "--TTS")) { 293 | //SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss"); 294 | newobject = Timestamp.valueOf(object.replace("^^www.w3.org/2001/XMLSchema#dateTime", "").replace("\"", "").replace("T", " ")); 295 | po.put(property, newobject); 296 | } else if (readColumns.contains(property + "--TAD")) { 297 | ArrayList arr; 298 | newobject = Double.parseDouble(object.replace("^^www.w3.org/2001/XMLSchema#double", "").replace("\"", "")); 299 | if (po.get(property) != null) { 300 | //System.out.println("TYPE (" + po.get(property) + "): "); 301 | arr = (ArrayList) po.get(property); 302 | arr.add((Double) newobject); 303 | } else { 304 | //System.out.println("TYPE (" + po.get(property) + ")"); 305 | arr = new ArrayList<>(); 306 | arr.add((Double) newobject); 307 | } 308 | po.put(property, arr); 309 | } else if (readColumns.contains(property + "--TAI")) { 310 | ArrayList arr = new ArrayList<>(); 311 | if (po.containsKey(property)) { 312 | arr = (ArrayList) po.get(property); 313 | arr.add((Integer) newobject); 314 | } else { 315 | arr.add((Integer) newobject); 316 | } 317 | po.put(property, arr); 318 | } else if (readColumns.contains(property + "**")) { 319 | //ArrayList arr = new ArrayList(); 320 | ArrayList temparr; // In new Parquet, ArrayString type saves only String[]s not ArrayLists, so needs to change back and forth from String to ArrayList String 321 | String[] arr; 322 | newobject = object.replace("**", "").replace("\"", ""); 323 | if 
(po.get(property) != null) { 324 | //System.out.println("TYPE (" + po.get(property) + "): "); 325 | 326 | arr = (String[]) po.get(property); 327 | temparr = new ArrayList<>(Arrays.asList(arr)); 328 | //arr = (ArrayList) po.get(property); 329 | // create arraylist 330 | temparr.add((String) newobject); 331 | 332 | arr = temparr.toArray(new String[0]); 333 | } else { 334 | arr = new String[]{(String) newobject}; 335 | //arr = new ArrayList(); 336 | //arr.add((String) newobject); 337 | } 338 | //String[] ary = new String[arr.size()]; 339 | //ary = arr.toArray(ary); 340 | po.put(property, arr); 341 | } else 342 | po.put(property, object); 343 | } 344 | 345 | // 8.6.3 Add the subject finally as the ID to the hashmap 346 | po.put("id", subject); 347 | 348 | //System.out.println("Values to be inserted under this schema: " + po.keySet()); 349 | 350 | // 8.6.4 Create the row from the hashmap values 351 | List vals = new ArrayList<>(po.values()); 352 | values_list = RowFactory.create(vals.toArray()); 353 | 354 | return values_list; 355 | }); 356 | 357 | /*returnValues.collect().forEach(row -> { 358 | System.out.println(row.toString()); 359 | });*/ 360 | 361 | //System.out.println("returnValues: " + returnValues); 362 | //System.out.println("schema: " + schema); 363 | 364 | // 8.7 Create an RDD by applying a schema to the RDD 365 | /*Dataset typeDataFrame = spark.createDataFrame(returnValues, schema);*/ 366 | Dataset typeDataFrame = spark.createDataFrame(returnValues, schema); 367 | 368 | // 8.8 Save to Parquet table 369 | //typeDataFrame.write().parquet(output_path + replaceInType(key)); 370 | //} 371 | //} 372 | //ctx.close(); 373 | //spark.stop(); 374 | 375 | return typeDataFrame; 376 | } catch (Exception ex) { 377 | System.out.println("SOMETHING WENT WRONG..." + ex.getMessage()); 378 | //spark.stop(); 379 | 380 | spark.close(); 381 | } 382 | 383 | return null; 384 | } 385 | 386 | private String reverse(String string) { 387 | return new StringBuffer(string).reverse().toString(); 388 | } 389 | 390 | private JavaRDD getRddByKey(JavaPairRDD>>>> pairRdd, String key) { 391 | 392 | JavaPairRDD>>>> a = pairRdd.filter(( 393 | Function>>>>, Boolean>) v -> { 394 | // TODO Auto-generated method stub 395 | return v._1().equals(key); 396 | }); 397 | 398 | /*return a.values().flatMap(tuples -> tuples.iterator());*/ 399 | return a.values().flatMap(tuples -> tuples.iterator()); 400 | } 401 | 402 | // Helping methods 403 | private String removeTagSymbol(String string) { 404 | return string.replace("<", "").replace(">", ""); 405 | } 406 | 407 | private String replaceInValue(String str) { 408 | return str.replace("http://", ""); 409 | } 410 | 411 | private String replaceInType(String str) { 412 | return str.replace("/", "__").replace("-", "@"); 413 | } 414 | 415 | private String replaceInColumn(String str) { 416 | return str.replace("http://", ""); 417 | } 418 | 419 | } 420 | -------------------------------------------------------------------------------- /sansa-datalake/sansa-datalake-spark/src/main/java/net/sansa_stack/datalake/spark/model/Triple.java: -------------------------------------------------------------------------------- 1 | package net.sansa_stack.datalake.spark.model; 2 | 3 | /** 4 | * Created by mmami on 10.10.16. 
5 | */ 6 | import java.io.Serializable; 7 | 8 | public class Triple implements Serializable { 9 | public String subject; 10 | public String property; 11 | public Object object; 12 | 13 | public Triple(String subject, String property, Object newobject) { 14 | this.subject = subject; 15 | this.property = property; 16 | this.object = newobject; 17 | } 18 | 19 | public String getSubject() { 20 | return subject; 21 | } 22 | public void setSubject(String subject) { 23 | this.subject = subject; 24 | } 25 | public String getProperty() { 26 | return property; 27 | } 28 | public void setProperty(String property) { 29 | this.property = property; 30 | } 31 | public Object getObject() { 32 | return object; 33 | } 34 | public void setObject(String object) { 35 | this.object = object; 36 | } 37 | 38 | } 39 | -------------------------------------------------------------------------------- /sansa-datalake/sansa-datalake-spark/src/main/resources/log4j.properties: -------------------------------------------------------------------------------- 1 | log4j.rootLogger=INFO, console 2 | 3 | # A1 is set to be a ConsoleAppender. 4 | log4j.appender.console=org.apache.log4j.ConsoleAppender 5 | 6 | # A1 uses PatternLayout. 7 | log4j.appender.console.layout=org.apache.log4j.PatternLayout 8 | log4j.appender.console.layout.ConversionPattern=%-4r [%t] %-5p [%c] %x %m%n 9 | 10 | log4j.logger.org.apache.spark = ERROR 11 | log4j.logger.org.spark_project.jetty.server = ERROR 12 | log4j.logger.org.apache.parquet = ERROR 13 | -------------------------------------------------------------------------------- /sansa-datalake/sansa-datalake-spark/src/main/scala/net/sansa_stack/datalake/spark/Config.scala: -------------------------------------------------------------------------------- 1 | package net.sansa_stack.datalake.spark 2 | 3 | import com.typesafe.config.ConfigFactory 4 | 5 | 6 | class Config { } 7 | 8 | object Config { 9 | 10 | def get(key: String): String = { 11 | 12 | val value = ConfigFactory.load().getString(key) 13 | 14 | value 15 | } 16 | } 17 | -------------------------------------------------------------------------------- /sansa-datalake/sansa-datalake-spark/src/main/scala/net/sansa_stack/datalake/spark/Main.scala: -------------------------------------------------------------------------------- 1 | package net.sansa_stack.datalake.spark 2 | 3 | import org.apache.commons.lang.time.StopWatch 4 | import org.apache.spark.sql.{DataFrame, SparkSession} 5 | 6 | 7 | object Main extends App { 8 | 9 | if (args.length != 4) { 10 | System.err.println("Please provide path to query, mappings and config file as well as Spark master URL") 11 | System.exit(0) 12 | } 13 | 14 | val queryFile = args(0) 15 | val mappingsFile = args(1) 16 | val configFile = args(2) 17 | val executorID = args(3) 18 | 19 | val spark = SparkSession.builder.master(executorID).appName("SANSA-DataLake").getOrCreate 20 | 21 | val hadoopConfig = spark.conf 22 | 23 | val executor : SparkExecutor = new SparkExecutor(spark, mappingsFile) 24 | 25 | val stopwatch: StopWatch = new StopWatch 26 | stopwatch.start() 27 | 28 | val run = new Run[DataFrame](executor) 29 | run.application(queryFile, mappingsFile, configFile) 30 | 31 | stopwatch.stop() 32 | 33 | val timeTaken = stopwatch.getTime 34 | 35 | println(s"Query execution time: $timeTaken ms") 36 | 37 | spark.stop() 38 | 39 | } 40 | -------------------------------------------------------------------------------- /sansa-datalake/sansa-datalake-spark/src/main/scala/net/sansa_stack/datalake/spark/Mapper.scala: 
-------------------------------------------------------------------------------- 1 | package net.sansa_stack.datalake.spark 2 | 3 | import java.io.ByteArrayInputStream 4 | 5 | import scala.collection.mutable 6 | import scala.collection.mutable.ListBuffer 7 | 8 | import com.typesafe.scalalogging.Logger 9 | import org.apache.jena.query.{QueryExecutionFactory, QueryFactory} 10 | import org.apache.jena.rdf.model.ModelFactory 11 | import play.api.libs.functional.syntax._ 12 | import play.api.libs.json._ 13 | 14 | import net.sansa_stack.datalake.spark.utils.Helpers 15 | 16 | 17 | class Mapper (mappingsFile: String) { 18 | 19 | val logger = Logger("SANSA-DataLake") 20 | 21 | def findDataSources( 22 | stars: mutable.HashMap[ 23 | String, 24 | mutable.Set[(String, String)] 25 | ] with mutable.MultiMap[ 26 | String, 27 | (String, String) 28 | ], 29 | configFile: String 30 | ) : 31 | // returns 32 | mutable.Set[(String, 33 | mutable.Set[(mutable.HashMap[String, String], String, String, mutable.HashMap[String, (String, Boolean)])], 34 | mutable.HashMap[String, (Map[String, String], String)] 35 | )] = { 36 | 37 | val starSources : 38 | mutable.Set[( 39 | String, // Star core 40 | mutable.Set[(mutable.HashMap[String, String], String, String, mutable.HashMap[String, (String, Boolean)])], // A set of data sources relevant to the Star (pred_attr, src, srcType) 41 | mutable.HashMap[String, (Map[String, String], String)] // A set of options of each relevant data source 42 | )] = mutable.Set() 43 | 44 | var count = 0 45 | 46 | for (s <- stars) { 47 | val subject = s._1 // core of the star 48 | val predicates_objects = s._2 49 | 50 | logger.info(s"\n- Going to find datasources relevant to $subject...") 51 | val ds = findDataSource(predicates_objects) // One or more relevant data sources 52 | count = count + 1 53 | 54 | // Options of relevant sources of one star 55 | val optionsentityPerStar : mutable.HashMap[String, (Map[String, String], String)] = new mutable.HashMap() 56 | 57 | // Iterate through the relevant data sources to get options 58 | // One star can have many relevant sources (containing its predicates) 59 | for (d <- ds) { 60 | val src = d._2 61 | 62 | val configJSON = Helpers.readFileFromPath(configFile) 63 | 64 | case class ConfigObject(source: String, options: Map[String, String], entity: String) 65 | 66 | implicit val userReads: Reads[ConfigObject] = ( 67 | (__ \ 'source).read[String] and 68 | (__ \ 'options).read[Map[String, String]] and 69 | (__ \ 'entity).read[String] 70 | ) (ConfigObject) 71 | 72 | val sources = (Json.parse(configJSON) \ "sources").as[Seq[ConfigObject]] 73 | 74 | for (s <- sources) { 75 | if (s.source == src) { 76 | val source = s.source 77 | val options = s.options 78 | val entity = s.entity 79 | 80 | optionsentityPerStar.put(source, (options, entity)) 81 | } 82 | } 83 | } 84 | 85 | starSources.add((subject, ds, optionsentityPerStar)) 86 | } 87 | 88 | // return: subject (star core), list of (data source, options) 89 | starSources 90 | } 91 | 92 | private def findDataSource(predicates_objects: mutable.Set[(String, String)]) : mutable.Set[(mutable.HashMap[String, String], String, String, mutable.HashMap[String, (String, Boolean)])] = { 93 | var listOfPredicatesForQuery = "" 94 | val listOfPredicates : mutable.Set[String] = mutable.Set() 95 | val returnedSources : mutable.Set[(mutable.HashMap[String, String], String, String, mutable.HashMap[String, (String, Boolean)])] = mutable.Set() 96 | 97 | var temp = 0 98 | 99 | logger.info("...with the (Predicate, Object) pairs: " + 
predicates_objects) 100 | 101 | for (v <- predicates_objects) { 102 | val predicate = v._1 103 | 104 | if (predicate == "rdf:type" || predicate == "a") { 105 | logger.info("...of class: " + v._2) 106 | listOfPredicatesForQuery += "?mp rr:subjectMap ?sm . ?sm rr:class " + v._2 + " . " 107 | 108 | } else { 109 | listOfPredicatesForQuery += "?mp rr:predicateObjectMap ?pom" + temp + " . " + 110 | "?pom" + temp + " rr:predicate " + predicate + " . " + 111 | "?pom" + temp + " rr:objectMap ?om" + temp + " . " 112 | 113 | listOfPredicates.add(predicate) 114 | temp +=1 115 | 116 | } 117 | } 118 | 119 | val queryString = "PREFIX rml: " + 120 | "PREFIX rr: " + 121 | "PREFIX foaf: " + 122 | "PREFIX nosql: " + 123 | "SELECT distinct ?src ?type WHERE {" + 124 | "?mp rml:logicalSource ?ls . " + 125 | "?ls rml:source ?src . " + 126 | "?ls nosql:store ?type . " + 127 | listOfPredicatesForQuery + 128 | "}" 129 | 130 | logger.info("...for this, the following query will be executed: " + queryString + " on " + mappingsFile) 131 | val query = QueryFactory.create(queryString) 132 | 133 | val mappingsString = Helpers.readFileFromPath(mappingsFile) 134 | 135 | val in = new ByteArrayInputStream(mappingsString.getBytes) 136 | 137 | if (in == null) { 138 | throw new IllegalArgumentException("ERROR: File: " + mappingsString + " not found") 139 | } 140 | 141 | val model = ModelFactory.createDefaultModel() 142 | model.read(in, null, "TURTLE") 143 | 144 | // Execute the query and obtain results 145 | val qe = QueryExecutionFactory.create(query, model) 146 | val results = qe.execSelect() 147 | 148 | while(results.hasNext) { // only one result expected (for the moment) 149 | val soln = results.nextSolution() 150 | val src = soln.get("src").toString 151 | val srcType = soln.get("type").toString 152 | 153 | logger.info(">>> Relevant source detected [" + src + "] of type [" + srcType + "]") // considering only first one src 154 | 155 | val predicate_attribute: mutable.HashMap[String, String] = mutable.HashMap() 156 | val predicate_transformations: mutable.HashMap[String, (String, Boolean)] = mutable.HashMap() 157 | 158 | // We will look for predicate transformations (subject transformations later on) 159 | for (p <- listOfPredicates) { 160 | 161 | val getAttributeOfPredicate = "PREFIX rml: " + 162 | "PREFIX rr: " + 163 | "PREFIX foaf: " + 164 | "SELECT ?om ?r ?id WHERE {" + 165 | "?mp rml:logicalSource ?ls . " + 166 | "?ls rml:source \"" + src + "\" . " + 167 | "?mp rr:subjectMap ?sm . " + 168 | "?sm rr:template ?id . " + 169 | "?mp rr:predicateObjectMap ?pom . " + 170 | "?pom rr:predicate " + p + " . " + 171 | "?pom rr:objectMap ?om . " + 172 | "OPTIONAL {?om rml:reference ?r} . " + 173 | "}" 174 | 175 | val query1 = QueryFactory.create(getAttributeOfPredicate) 176 | val qe1 = QueryExecutionFactory.create(query1, model) 177 | val results1 = qe1.execSelect() 178 | 179 | while (results1.hasNext) { 180 | val soln1 = results1.nextSolution() 181 | val om = soln1.getResource("om") 182 | 183 | var fn = "" 184 | var attr = "" 185 | var trans : ListBuffer[String] = ListBuffer() 186 | 187 | if (om.getURI != null) { // the case of FunctionMap 188 | 189 | // Get function 190 | val queryString = "PREFIX rml: " + 191 | "PREFIX rr: " + 192 | "PREFIX foaf: " + 193 | "PREFIX edm: " + 194 | "PREFIX fnml: " + 195 | "PREFIX fno: " + 196 | "PREFIX grel: " + 197 | "SELECT ?fn ?ref WHERE {" + 198 | "<#" + om.getLocalName + "> fnml:functionValue ?fv . " + 199 | "?fv rml:logicalSource \"" + src + "\" . " + 200 | "?fv rr:predicateObjectMap ?pom . 
" + 201 | "?pom rr:predicate fno:executes . " + 202 | "?pom rr:objectMap ?om . " + 203 | "?om rr:constant ?fn . " + 204 | "?fv rr:predicateObjectMap ?pom1 . " + // we don't use multiple ?pom's coz we don't know how 205 | "?pom1 rr:predicate ?param . " + // many params we have, eg. toUpperCase only 1 param. 206 | "?pom1 rr:objectMap ?om1 . " + // so, 1st ref is the attribute, rest are fnt params 207 | "?om1 rr:reference ?ref . " + 208 | "}" 209 | 210 | val query2 = QueryFactory.create(queryString) 211 | val qe2 = QueryExecutionFactory.create(query2, model) 212 | val results2 = qe2.execSelect() 213 | while (results2.hasNext) { 214 | val soln2 = results2.nextSolution() 215 | 216 | fn = soln2.get("fn").toString 217 | attr = soln2.get("ref").toString // Used also for pred_attr.put() 218 | 219 | trans += fn 220 | trans += attr 221 | } 222 | trans = trans.distinct // to omit duplicates, in this case the function URI e.g. _:greaterThan 223 | 224 | logger.info(s"Transformations for predicate $p (attr: $attr): $trans") 225 | predicate_transformations.put(p, (trans.mkString(" "), false)) 226 | 227 | } else { 228 | try { 229 | attr = soln1.get("r").toString 230 | } catch { 231 | case _: NullPointerException => println("ERROR: Relevant source detected but cannot " + 232 | "be read due to mappings issues. For example, are you using `rr:parentTriplesMap` instead of `rml:reference`?") 233 | System.exit(1) 234 | } 235 | } 236 | 237 | predicate_attribute.put(p, attr) 238 | } 239 | } 240 | 241 | // We will look for subject transformations 242 | val getAttributeOfPredicate = "PREFIX rml: " + 243 | "PREFIX rr: " + 244 | "PREFIX foaf: " + 245 | "SELECT ?fn ?id WHERE {" + 246 | "?mp rml:logicalSource ?ls . " + 247 | "?ls rml:source \"" + src + "\" . " + 248 | "?mp rr:subjectMap ?sm . " + 249 | "?sm rr:objectMap ?fn ." + 250 | "}" 251 | 252 | val query1 = QueryFactory.create(getAttributeOfPredicate) 253 | val qe1 = QueryExecutionFactory.create(query1, model) 254 | val results1 = qe1.execSelect() 255 | 256 | while (results1.hasNext) { 257 | val soln1 = results1.nextSolution() 258 | val fnMap = soln1.getResource("fn") 259 | 260 | var fn = "" 261 | var attr = "" 262 | var trans: ListBuffer[String] = ListBuffer() 263 | 264 | if (fnMap != null) { // the case of FunctionMap 265 | // Get function 266 | val queryString = "PREFIX rml: " + 267 | "PREFIX rr: " + 268 | "PREFIX foaf: " + 269 | "PREFIX edm: " + 270 | "PREFIX fnml: " + 271 | "PREFIX fno: " + 272 | "PREFIX grel: " + 273 | "SELECT ?fn ?ref WHERE {" + 274 | "<#" + fnMap.getLocalName + "> fnml:functionValue ?fv . " + 275 | "?fv rml:logicalSource \"" + src + "\" . " + 276 | "?fv rr:predicateObjectMap ?pom . " + 277 | "?pom rr:predicate fno:executes . " + 278 | "?pom rr:objectMap ?om . " + 279 | "?om rr:constant ?fn . " + 280 | "?fv rr:predicateObjectMap ?pom1 . " + // we don't use multiple ?pom's coz we don't know how 281 | "?pom1 rr:predicate ?param . " + // many params we have, eg. toUpperCase only 1 param. 282 | "?pom1 rr:objectMap ?om1 . " + // so, 1st ref is the attribute, rest are fnt params 283 | "?om1 rr:reference ?ref . 
" + 284 | "}" 285 | 286 | // val id = soln1.get("id").toString.stripSuffix("}").split("\\{")(1) // get 'id' from 'url{id}' 287 | 288 | val query2 = QueryFactory.create(queryString) 289 | val qe2 = QueryExecutionFactory.create(query2, model) 290 | val results2 = qe2.execSelect() 291 | while (results2.hasNext) { 292 | val soln2 = results2.nextSolution() 293 | 294 | fn = soln2.get("fn").toString 295 | attr = soln2.get("ref").toString // Used also for pred_attr.put() 296 | 297 | trans += fn 298 | trans += attr 299 | } 300 | trans = trans.distinct // to omit duplicates, in this case the function URI e.g. _:greaterThan 301 | 302 | logger.info(s"Transformations for subject/ID ($attr): $trans") 303 | predicate_transformations.put("ID", (trans.mkString(" "), true)) 304 | } 305 | } 306 | 307 | returnedSources.add((predicate_attribute, src, srcType, predicate_transformations)) 308 | } 309 | 310 | qe.close() // Important: free up resources used running the query 311 | 312 | returnedSources 313 | } 314 | } 315 | -------------------------------------------------------------------------------- /sansa-datalake/sansa-datalake-spark/src/main/scala/net/sansa_stack/datalake/spark/Planner.scala: -------------------------------------------------------------------------------- 1 | package net.sansa_stack.datalake.spark 2 | 3 | import java.util 4 | 5 | import scala.collection.JavaConverters._ 6 | import scala.collection.immutable.ListMap 7 | import scala.collection.mutable 8 | import scala.collection.mutable.ListBuffer 9 | 10 | import com.google.common.collect.ArrayListMultimap 11 | import com.typesafe.scalalogging.Logger 12 | import play.api.libs.functional.syntax._ 13 | import play.api.libs.json.{Json, Reads, __} 14 | 15 | import net.sansa_stack.datalake.spark.utils.Helpers 16 | import net.sansa_stack.datalake.spark.utils.Helpers._ 17 | 18 | 19 | class Planner(stars: mutable.HashMap[String, mutable.Set[(String, String)]] with mutable.MultiMap[String, (String, String)]) { 20 | 21 | val logger = Logger("SANSA-DataLake") 22 | 23 | def getNeededPredicates(star_predicate_var: mutable.HashMap[(String, String), String], 24 | joins: ArrayListMultimap[String, (String, String)], 25 | select_vars: util.List[String], 26 | groupBys: (ListBuffer[String], mutable.Set[(String, String)]), 27 | prefixes: Map[String, String]) : (mutable.Set[String], mutable.Set[(String, String)]) = { 28 | 29 | logger.info("star_predicate_var: " + star_predicate_var) 30 | val predicates : mutable.Set[String] = mutable.Set.empty 31 | val predicatesForSelect : mutable.Set[(String, String)] = mutable.Set.empty 32 | 33 | val join_left_vars = joins.keySet() 34 | val join_right_vars = joins.values().asScala.map(x => x._1).toSet // asScala, converts Java Collection to Scala Collection 35 | 36 | val join_left_right_vars = join_right_vars.union(join_left_vars.asScala) 37 | 38 | logger.info("--> All (left & right) join operands: " + join_left_right_vars) 39 | 40 | for (t <- star_predicate_var) { 41 | val s_p = t._1 42 | val o = t._2 43 | 44 | val occurrences = star_predicate_var groupBy ( _._2 ) mapValues ( _.size ) // To capture variables (objects) used in more than one predicate 45 | 46 | if (select_vars.contains(o.replace("?", "")) || join_left_vars.contains(o) || join_right_vars.contains(o) || occurrences(o) > 1) { 47 | predicates.add(s_p._2) 48 | } 49 | 50 | if (select_vars.contains(o.replace("?", ""))) { 51 | predicatesForSelect.add(s_p) 52 | } 53 | if (groupBys != null ) { 54 | // Forming e.g. 
"failure_isFailureOf_fsmt" 55 | val groupByPredicate = s_p._1.replace("?", "") + "_" + omitNamespace(s_p._2) + "_" + prefixes(get_NS_predicate(s_p._2)._1) 56 | 57 | if (groupBys._2.map(_._1).contains(groupByPredicate)) { // map to get only cols eg failure_isFailureOf from Set((failure_isFailureOf_fsmt,count)) 58 | predicates.add(s_p._2) 59 | } 60 | } 61 | } 62 | 63 | (predicates, predicatesForSelect) 64 | } 65 | 66 | def generateJoinPlan: (ArrayListMultimap[String, (String, String)], mutable.Set[String], mutable.Set[String], Map[(String, String), String]) = { 67 | 68 | val keys = stars.keySet.toSeq 69 | logger.info("Stars: " + keys.toString()) 70 | val joins : ArrayListMultimap[String, (String, String)] = ArrayListMultimap.create[String, (String, String)]() 71 | var joinPairs : Map[(String, String), String] = Map.empty 72 | 73 | val joinedToFlag : mutable.Set[String] = mutable.Set() 74 | val joinedFromFlag : mutable.Set[String] = mutable.Set() 75 | 76 | for (i <- keys.indices) { 77 | val currentSubject = keys(i) 78 | val valueSet = stars(currentSubject) 79 | for(p_o <- valueSet) { 80 | val o = p_o._2 81 | if (keys.contains(o)) { // A previous star of o 82 | val p = p_o._1 83 | joins.put(currentSubject, (o, p)) 84 | joinPairs += (omitQuestionMark(currentSubject), omitQuestionMark(o)) -> p 85 | joinedToFlag.add(o) 86 | joinedFromFlag.add(currentSubject) 87 | } 88 | } 89 | } 90 | 91 | (joins, joinedToFlag, joinedFromFlag, joinPairs) 92 | } 93 | 94 | def reorder(joins: ArrayListMultimap[String, (String, String)], starDataTypesMap: Map[String, mutable.Set[String]], 95 | starNbrFilters: Map[String, Integer], starWeights: Map[String, Double], configFile: String): ListMap[(String, String), Double] = { 96 | 97 | logger.info("...REORDERING JOINS, if needed...") 98 | 99 | var joinsToReorder : ListBuffer[(String, String)] = ListBuffer() 100 | 101 | for (j <- joins.entries.asScala) { 102 | joinsToReorder += ((j.getKey, j.getValue._1)) 103 | } 104 | 105 | val scoredJoins = getScoredJoins(joins, starWeights) 106 | 107 | val sortedScoredJoins = ListMap(scoredJoins.toSeq.sortWith(_._2 > _._2): _*) 108 | 109 | sortedScoredJoins 110 | } 111 | 112 | def getScoredJoins(joins : ArrayListMultimap[String, (String, String)], scores: Map[String, Double]): Map[(String, String), Double] = { 113 | var scoredJoins : Map[(String, String), Double] = Map() 114 | 115 | for (j <- joins.entries.asScala) 116 | scoredJoins += (j.getKey, j.getValue._1) -> (scores(j.getKey) + scores(j.getValue._1)) 117 | 118 | scoredJoins 119 | } 120 | 121 | def sortStarsByWeight(starDataTypesMap: Map[String, mutable.Set[String]], filters: Map[String, Integer], configFile: String): Map[String, Double] = { 122 | val configJSON = Helpers.readFileFromPath(configFile) 123 | 124 | case class ConfigObject(datasource: String, weight: Double) 125 | 126 | implicit val userReads: Reads[ConfigObject] = ( 127 | (__ \ 'datasource).read[String] and 128 | (__ \ 'weight).read[Double] 129 | )(ConfigObject) 130 | 131 | val weights = (Json.parse(configJSON) \ "weights").as[Seq[ConfigObject]] 132 | 133 | var scoresByDatasource : Map[String, Double] = Map() 134 | for (w <- weights) { 135 | scoresByDatasource += w.datasource -> w.weight 136 | } 137 | 138 | logger.info(s"- We use the following scores of the data source types: $scoresByDatasource \n") 139 | 140 | val scores = starScores(starDataTypesMap, scoresByDatasource, filters) 141 | 142 | scores 143 | } 144 | 145 | def starScores(starDataTypesMap: Map[String, mutable.Set[String]], weightsByDatasource: Map[String, 
Double], filters: Map[String, Integer]): Map[String, Double] = { 146 | var scores : Map[String, Double] = Map() 147 | 148 | var datasourceTypeWeight = 0.0 // Coucou! 149 | 150 | for (s <- starDataTypesMap) { 151 | val star = s._1 // eg. ?r 152 | val datasourceTypeURI_s = s._2 // eg. http://purl.org/db/nosql#cassandra 153 | 154 | val nbrFilters = filters(star).toInt 155 | 156 | if (datasourceTypeURI_s.size == 1) { // only one relevant datasource 157 | val datasourceType = datasourceTypeURI_s.head.split("#")(1) // eg. cassandra 158 | 159 | if (nbrFilters > 0) { 160 | datasourceTypeWeight = weightsByDatasource(datasourceType) + 1 161 | } else { 162 | datasourceTypeWeight = weightsByDatasource(datasourceType) 163 | } 164 | // Add up the number of filters to the score of the star 165 | } 166 | // else, we keep 0, as we are assuming if there are more than 1 data sources, queryig & union-ing them would be expensive 167 | scores += (star -> datasourceTypeWeight) 168 | } 169 | 170 | scores 171 | } 172 | } 173 | -------------------------------------------------------------------------------- /sansa-datalake/sansa-datalake-spark/src/main/scala/net/sansa_stack/datalake/spark/QueryAnalyser.scala: -------------------------------------------------------------------------------- 1 | package net.sansa_stack.datalake.spark 2 | 3 | import java.util 4 | 5 | import com.google.common.collect.ArrayListMultimap 6 | import com.typesafe.scalalogging.Logger 7 | import net.sansa_stack.datalake.spark.utils.Helpers._ 8 | import org.apache.jena.query.QueryFactory 9 | import org.apache.jena.sparql.syntax.{ElementFilter, ElementVisitorBase, ElementWalker} 10 | import scala.collection.JavaConverters._ 11 | import scala.collection.mutable 12 | import scala.collection.mutable.ListBuffer 13 | 14 | 15 | class QueryAnalyser(query: String) { 16 | 17 | val logger = Logger("SANSA-DataLake") 18 | 19 | def getPrefixes : Map[String, String] = { 20 | val q = QueryFactory.create(query) 21 | val prolog = q.getPrologue.getPrefixMapping.getNsPrefixMap 22 | 23 | val prefix: Map[String, String] = invertMap(prolog) 24 | 25 | logger.info("\n- Prefixes: " + prefix) 26 | 27 | prefix 28 | } 29 | 30 | def getProject : (util.List[String], Boolean) = { 31 | val q = QueryFactory.create(query) 32 | val project = q.getResultVars 33 | 34 | logger.info(s"\n- Projected vars: $project") 35 | 36 | (project, q.isDistinct) 37 | } 38 | 39 | def getFilters : ArrayListMultimap[String, (String, String)] = { 40 | val q = QueryFactory.create(query) 41 | val filters : ArrayListMultimap[String, (String, String)] = ArrayListMultimap.create[String, (String, String)]() 42 | 43 | ElementWalker.walk(q.getQueryPattern, new ElementVisitorBase() { // ...when it's a block of triples... 44 | override def visit(ef: ElementFilter): Unit = { // ...go through all the triples... 
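// Hypothetical example of what the split below is meant to produce: a filter written as
// FILTER (?price <= 500) should yield leftOperand "?price", operation "<=", rightOperand "500";
// the limit of 3 keeps a quoted right operand that contains spaces (e.g. "Red Wine") in one piece.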
45 | val bits = ef.getExpr.toString.replace("(", "").replace(")", "").split(" ", 3) // 3 not to split when the right operand is a string with possible white spaces 46 | val operation = bits(1) 47 | val leftOperand = bits(0) 48 | val rightOperand = bits(2) 49 | 50 | logger.info(s"Filter: $operation,($leftOperand,$rightOperand)") 51 | filters.put(operation, (leftOperand, rightOperand)) 52 | } 53 | }) 54 | 55 | filters 56 | } 57 | 58 | def getOrderBy: mutable.Set[(String, String)] = { 59 | val q = QueryFactory.create(query) 60 | var orderBys : mutable.Set[(String, String)] = mutable.Set() 61 | 62 | if (q.hasOrderBy) { 63 | val orderBy = q.getOrderBy.iterator() 64 | 65 | while(orderBy.hasNext) { 66 | val it = orderBy.next() 67 | 68 | orderBys += ((it.direction.toString, it.expression.toString)) 69 | } 70 | } else { 71 | orderBys = null 72 | } 73 | 74 | orderBys 75 | } 76 | 77 | def getGroupBy(variablePredicateStar: Map[String, (String, String)], prefixes: Map[String, String]): (ListBuffer[String], mutable.Set[(String, String)]) = { 78 | val q = QueryFactory.create(query) 79 | val groupByCols : ListBuffer[String] = ListBuffer() 80 | var aggregationFunctions : mutable.Set[(String, String)] = mutable.Set() 81 | 82 | if (q.hasGroupBy) { 83 | val groupByVars = q.getGroupBy.getVars.asScala.toList 84 | for (gbv <- groupByVars) { 85 | val str = variablePredicateStar(gbv.toString())._1 86 | val vr = variablePredicateStar(gbv.toString())._2 87 | val ns_p = get_NS_predicate(vr) 88 | val column = omitQuestionMark(str) + "_" + ns_p._2 + "_" + prefixes(ns_p._1) 89 | 90 | groupByCols.asJava.add(column) 91 | } 92 | 93 | val agg = q.getAggregators.asScala 94 | logger.info("agg: " + agg) 95 | for(ag <- agg) { // toPrefixString returns (aggregate_function aggregate_var) eg. (sum ?price) 96 | val bits = ag.getAggregator.toPrefixString.split(" ") 97 | 98 | val aggCol = "?" + bits(1).dropRight(1).substring(1) // ? 
added eg ?price in variablePredicateStar 99 | val str = variablePredicateStar(aggCol)._1 100 | val vr = variablePredicateStar(aggCol)._2 101 | val ns_p = get_NS_predicate(vr) 102 | val column = omitQuestionMark(str) + "_" + ns_p._2 + "_" + prefixes(ns_p._1) 103 | 104 | aggregationFunctions += ((column, bits(0).substring(1))) // o_price_cbo -> sum 105 | } 106 | 107 | (groupByCols, aggregationFunctions) 108 | 109 | } else { 110 | null 111 | } 112 | 113 | } 114 | 115 | def getStars : (mutable.HashMap[String, mutable.Set[(String, String)]] with mutable.MultiMap[String, (String, String)], mutable.HashMap[(String, String), String]) = { 116 | 117 | val q = QueryFactory.create(query) 118 | val originalBGP = q.getQueryPattern.toString 119 | 120 | val bgp = originalBGP.replaceAll("\n", "").replaceAll("\\s+", " ").replace("{", " ").replace("}", " ") // See example below + replace breaklines + remove extra white spaces 121 | val triples = bgp.split("\\.(?![^\\<\\[]*[\\]\\>])") 122 | 123 | logger.info("\n- The BGP of the input query: " + originalBGP) 124 | logger.info("\n- Number of triple-stars detected: " + triples.length) 125 | 126 | val stars = new mutable.HashMap[String, mutable.Set[(String, String)]] with mutable.MultiMap[String, (String, String)] 127 | // Multi-map to add/append elements to the value 128 | 129 | // Save [star]_[predicate] 130 | val star_pred_var : mutable.HashMap[(String, String), String] = mutable.HashMap() 131 | 132 | for (i <- triples.indices) { // i <- 0 until triples.length 133 | val triple = triples(i).trim 134 | 135 | logger.info(s"Triple: $triple") 136 | 137 | if (!triple.contains(';')) { // only one predicate attached to the subject 138 | val tripleBits = triple.split(" ") 139 | stars.addBinding(tripleBits(0), (tripleBits(1), tripleBits(2))) 140 | // addBinding` because standard methods like `+` will overwrite the complete key-value pair instead of adding the value to the existing key 141 | 142 | star_pred_var.put((tripleBits(0), tripleBits(1)), tripleBits(2)) 143 | } else { 144 | val triples = triple.split(";") 145 | val firsTriple = triples(0) 146 | val firsTripleBits = firsTriple.split(" ") 147 | val sbj = firsTripleBits(0) // get the first triple which has s p o - rest will be only p o ; 148 | stars.addBinding(sbj, (firsTripleBits(1), firsTripleBits(2))) // add that first triple 149 | star_pred_var.put((sbj, firsTripleBits(1)), firsTripleBits(2)) 150 | 151 | for (i <- 1 until triples.length) { 152 | val t = triples(i).trim.split(" ") 153 | stars.addBinding(sbj, (t(0), t(1))) 154 | 155 | star_pred_var.put((sbj, t(0)), t(1)) 156 | } 157 | } 158 | } 159 | 160 | (stars, star_pred_var) 161 | } 162 | 163 | def getTransformations (trans: String): (Map[String, (String, Array[String])], Map[String, Array[String]]) = { 164 | // Transformations 165 | val transformations = trans.trim().substring(1).split("&&") // E.g. [?k?a.l.+60, ?a?l.r.toInt] 166 | var transmap_left : Map[String, (String, Array[String])] = Map.empty 167 | var transmap_right : Map[String, Array[String]] = Map.empty 168 | for (t <- transformations) { // E.g. ?a?l.r.toInt.scl[61] 169 | val tbits = t.trim.split("\\.", 2) // E.g.[?a?l, r.toInt.scl(_+61)] 170 | val vars = tbits(0).substring(1).split("\\?") // [a, l] 171 | val operation = tbits(1) // E.g. r.toInt.scl(_+60) 172 | val temp = operation.split("\\.", 2) // E.g. [r, toInt.scl(_+61)] 173 | val lORr = temp(0) // E.g. r 174 | val functions = temp(1).split("\\.") // E.g. 
[toInt, scl(_+61)] 175 | if (lORr == "l") { 176 | transmap_left += (vars(0) -> (vars(1), functions)) 177 | } else { 178 | transmap_right += (vars(1) -> functions) 179 | } 180 | 181 | } 182 | (transmap_left, transmap_right) 183 | } 184 | 185 | def hasLimit: Boolean = QueryFactory.create(query).hasLimit 186 | 187 | def getLimit: Int = QueryFactory.create(query).getLimit.toInt 188 | 189 | } 190 | -------------------------------------------------------------------------------- /sansa-datalake/sansa-datalake-spark/src/main/scala/net/sansa_stack/datalake/spark/QueryExecutor.scala: -------------------------------------------------------------------------------- 1 | package net.sansa_stack.datalake.spark 2 | 3 | import java.util 4 | 5 | import com.google.common.collect.ArrayListMultimap 6 | import com.typesafe.scalalogging.Logger 7 | import scala.collection.mutable 8 | import scala.collection.mutable.ListBuffer 9 | 10 | 11 | trait QueryExecutor[T] { // T is a ParSet (Parallel dataSet) 12 | 13 | val logger = Logger("SANSA-DataLake") 14 | 15 | /* Generates a ParSet with the number of filters (on predicates) in the star */ 16 | def query(sources : mutable.Set[(mutable.HashMap[String, String], String, String, mutable.HashMap[String, (String, Boolean)])], 17 | optionsMap: mutable.HashMap[String, (Map[String, String], String)], 18 | toJoinWith: Boolean, 19 | star: String, 20 | prefixes: Map[String, String], 21 | select: util.List[String], 22 | star_predicate_var: mutable.HashMap[(String, String), String], 23 | neededPredicates: mutable.Set[String], 24 | filters: ArrayListMultimap[String, (String, String)], 25 | leftJoinTransformations: (String, Array[String]), 26 | rightJoinTransformations: Array[String], 27 | joinPairs: Map[(String, String), String] 28 | ) : (T, Integer, String) 29 | 30 | /* Transforms a ParSet to another ParSet based on the SPARQL TRANSFORM clause */ 31 | def transform(ps: Any, column: String, transformationsArray : Array[String]): Any 32 | 33 | /* Print the schema of the ParSet */ 34 | def join(joins: ArrayListMultimap[String, (String, String)], prefixes: Map[String, String], star_df: Map[String, T]): T 35 | 36 | /* Generates a new ParSet projecting out one or more attributes */ 37 | def project(jDF: Any, columnNames: Seq[String], distinct: Boolean): T 38 | 39 | /* Counts the number of tuples of a ParSet */ 40 | def count(joinPS: T): Long 41 | 42 | /* Sort tuples of a ParSet based on an attribute variable */ 43 | def orderBy(joinPS: Any, direction: String, variable: String): T 44 | 45 | /* Group attributes based on aggregates function(s) */ 46 | def groupBy(joinPS: Any, groupBys: (ListBuffer[String], mutable.Set[(String, String)])): T 47 | 48 | /* Return the first 'limitValue' values of the ParSet */ 49 | def limit(joinPS: Any, limitValue: Int) : T 50 | 51 | /* Show some results */ 52 | def show(PS: Any) 53 | 54 | /* Compute the results */ 55 | def run(jDF: Any) 56 | } 57 | -------------------------------------------------------------------------------- /sansa-datalake/sansa-datalake-spark/src/main/scala/net/sansa_stack/datalake/spark/Run.scala: -------------------------------------------------------------------------------- 1 | package net.sansa_stack.datalake.spark 2 | 3 | import java.io.FileNotFoundException 4 | 5 | import scala.collection.JavaConverters._ 6 | import scala.collection.mutable 7 | 8 | import com.typesafe.scalalogging.Logger 9 | import org.apache.spark.sql.DataFrame 10 | 11 | import net.sansa_stack.datalake.spark.utils.Helpers 12 | import 
net.sansa_stack.datalake.spark.utils.Helpers._ 13 | 14 | 15 | class Run[A](executor: QueryExecutor[A]) { 16 | 17 | private var finalDataSet: A = _ 18 | 19 | def application(queryFile: String, mappingsFile: String, configFile: String): DataFrame = { 20 | 21 | val logger = Logger("SANSA-DataLake") 22 | 23 | // 1. Read SPARQL query 24 | logger.info("QUERY ANALYSIS startigng...") 25 | 26 | try { 27 | var query = Helpers.readFileFromPath(queryFile) 28 | 29 | println(s"Going to execute the query:\n$query") 30 | 31 | // Transformations 32 | var transformExist = false 33 | var transformationsInLine = "" 34 | if (query.contains("TRANSFORM")) { 35 | transformationsInLine = query.substring(query.indexOf("TRANSFORM") + 9, query.lastIndexOf(")")) // E.g. ?k?a.toInt && ?a?l.r.toInt.scl(_+61) 36 | query = query.replace("TRANSFORM" + transformationsInLine + ")", "") // TRANSFORM is not defined in Jena, so remove 37 | transformExist = true 38 | } 39 | 40 | // 2. Extract star-shaped BGPs 41 | val qa = new QueryAnalyser(query) 42 | 43 | 44 | val stars = qa.getStars 45 | val starsNbr = stars._1.size 46 | 47 | // Create a map between the variable and its star and predicate URL [variable -> (star,predicate)] 48 | // Need e.g. to create the column to 'SQL ORDER BY' from 'SPARQL ORDER BY' 49 | var variablePredicateStar: Map[String, (String, String)] = Map() 50 | for (v <- stars._1) { 51 | val star = v._1 52 | val predicate_variable_set = v._2 53 | for (pv <- predicate_variable_set) { 54 | val predicate = pv._1 55 | val variable = pv._2 56 | 57 | variablePredicateStar += (variable -> (star, predicate)) 58 | } 59 | } 60 | 61 | logger.info(s"Predicate Star: $variablePredicateStar") 62 | 63 | val prefixes = qa.getPrefixes 64 | val (select, distinct) = qa.getProject 65 | val filters = qa.getFilters 66 | val orderBys = qa.getOrderBy 67 | val groupBys = qa.getGroupBy(variablePredicateStar, prefixes) 68 | 69 | var limit: Int = 0 70 | if (qa.hasLimit) limit = qa.getLimit 71 | 72 | logger.info("- Predicates per star:") 73 | 74 | val star_predicate_var = stars._2 // TODO: assuming no (star,predicate) with two vars? 75 | logger.info("star_predicate_var: " + star_predicate_var) 76 | 77 | // 3. Generate plan of joins 78 | logger.info("PLAN GENERATION & MAPPINGS") 79 | val pl = new Planner(stars._1) 80 | val pln = pl.generateJoinPlan 81 | val joins = pln._1 82 | val joinedToFlag = pln._2 83 | val joinedFromFlag = pln._3 84 | val joinPairs = pln._4 85 | 86 | // 4. 
Check mapping file 87 | logger.info("---> MAPPING CONSULTATION") 88 | 89 | val mappers = new Mapper(mappingsFile) 90 | val results = mappers.findDataSources(stars._1, configFile) 91 | 92 | val neededPredicates = pl.getNeededPredicates(star_predicate_var, joins, select, groupBys, prefixes) 93 | val neededPredicatesAll = neededPredicates._1 // all predicates used 94 | val neededPredicatesSelect = neededPredicates._2 // only projected out predicates 95 | 96 | logger.info("--> Needed predicates all: " + neededPredicatesAll) 97 | 98 | var star_df: Map[String, A] = Map.empty 99 | var starNbrFilters: Map[String, Integer] = Map() 100 | 101 | var starDataTypesMap: Map[String, mutable.Set[String]] = Map() 102 | val parsetIDs: Map[String, String] = Map() // Used when subject variables are projected out 103 | 104 | logger.info("---> GOING NOW TO COLLECT DATA") 105 | 106 | for (s <- results) { 107 | val star = s._1 108 | logger.info("star: " + star) 109 | val dataSources = s._2 110 | val options = s._3 111 | 112 | val dataTypes = dataSources.map(d => d._3) 113 | 114 | // 'Mappings' transformations 115 | for (ds <- dataSources) { 116 | val transformations = ds._4 117 | 118 | if (transformations.nonEmpty) { 119 | transformExist = true 120 | } 121 | 122 | for (t <- transformations) { 123 | logger.info("Visiting transformation related to predicate: " + t._1 + " = " + t._2) 124 | val fncParamBits = t._2._1.split(" ") 125 | val fncName = fncParamBits(0) 126 | var fncParam = "" 127 | 128 | if (fncParamBits.size > 2) { // E.g., skip 2 producerID 129 | fncParam = fncParamBits(1) 130 | } // otherwise, it's 1 parameter, e.g., toInt producerID 131 | 132 | val IDorNot = t._2._2 133 | var lOrR = "" 134 | lOrR = if (IDorNot) "l" else "r" 135 | 136 | // Construct the in-line transformation declarations (like 'SPARQL' transformations) 137 | joinPairs.keys.foreach( 138 | x => if (omitQuestionMark(star) == x._1 && joinPairs(x) == t._1) { // Case of predicate transformations 139 | if (transformationsInLine != "") { 140 | transformationsInLine += " && " 141 | } 142 | transformationsInLine += s"?${x._1}?${x._2}.$lOrR.${getFunctionFromURI(fncName)}" 143 | if (fncParam != "") { 144 | transformationsInLine += s"($fncParam)" 145 | } 146 | } else if (omitQuestionMark(star) == x._2) { // Case of ID transformations 147 | if (transformationsInLine != "") { 148 | transformationsInLine += " && " 149 | } 150 | transformationsInLine += s"?${x._1}?${x._2}.$lOrR.${getFunctionFromURI(fncName)}" 151 | if (fncParam != "") { 152 | transformationsInLine += s"($fncParam)" 153 | } 154 | } 155 | ) 156 | } 157 | } 158 | 159 | if (transformationsInLine != "") { 160 | logger.info(s"Transformations found (inline): $transformationsInLine") 161 | } 162 | 163 | starDataTypesMap += (star -> dataTypes) 164 | var parsetIDs : Map[String, String] = Map() 165 | 166 | logger.info("Getting DF relevant to the star: " + star) 167 | 168 | // Transformations 169 | var leftJoinTransformations: (String, Array[String]) = null 170 | var rightJoinTransformations: Array[String] = null 171 | if (transformExist) { 172 | val (transmap_left, transmap_right) = qa.getTransformations(transformationsInLine) 173 | 174 | val str = omitQuestionMark(star) 175 | if (transmap_left.keySet.contains(str)) { 176 | // Get with whom there is a join 177 | val rightOperand = transmap_left(str)._1 178 | val ops = transmap_left(str)._2 179 | 180 | // Get the predicate of the join 181 | val joinLeftPredicate = joinPairs((str, rightOperand)) 182 | leftJoinTransformations = (joinLeftPredicate, 
ops) 183 | logger.info("Transform (left) on predicate " + joinLeftPredicate + " using " + ops.mkString("_")) 184 | } 185 | 186 | if (transmap_right.keySet.contains(str)) { 187 | rightJoinTransformations = transmap_right(str) 188 | logger.info("Transform (right) ID using " + rightJoinTransformations.mkString("...")) 189 | } 190 | } 191 | 192 | if (joinedToFlag.contains(star) || joinedFromFlag.contains(star)) { 193 | val (ds, numberOfFiltersOfThisStar, parsetID) = executor.query(dataSources, options, toJoinWith = true, star, prefixes, 194 | select, star_predicate_var, neededPredicatesAll, filters, leftJoinTransformations, rightJoinTransformations, 195 | joinPairs) 196 | 197 | if (parsetID != "") { 198 | parsetIDs += (star -> parsetID) 199 | } 200 | 201 | star_df += (star -> ds) // DataFrame representing a star 202 | 203 | starNbrFilters += star -> numberOfFiltersOfThisStar 204 | 205 | logger.info("join...with ParSet schema: " + ds) 206 | } else if (!joinedToFlag.contains(star) && !joinedFromFlag.contains(star)) { 207 | val (ds, numberOfFiltersOfThisStar, parsetID) = executor.query(dataSources, options, toJoinWith = false, star, prefixes, 208 | select, star_predicate_var, neededPredicatesAll, filters, leftJoinTransformations, rightJoinTransformations, 209 | joinPairs) 210 | 211 | // ds.printSchema() // SEE WHAT TO DO HERE TO SHOW BACK THE SCHEMA - MOVE IN SPARKEXECUTOR 212 | 213 | parsetIDs += (star -> parsetID) 214 | star_df += (star -> ds) // DataFrame representing a star 215 | 216 | starNbrFilters += star -> numberOfFiltersOfThisStar 217 | 218 | logger.info("single...with ParSet schema: " + ds) 219 | } 220 | } 221 | 222 | logger.info("QUERY EXECUTION starting...") 223 | logger.info(s"DataFrames: $star_df") 224 | 225 | if (starsNbr > 1) { 226 | logger.info(s"- Here are the (Star, ParSet) pairs:") 227 | logger.info("Join Pairs: " + joinPairs) 228 | 229 | if (starsNbr > 1) logger.info(s"- Here are join pairs: $joins") else logger.info("No join detected.") 230 | logger.info(s"- Number of predicates per star: $starNbrFilters ") 231 | 232 | val starWeights = pl.sortStarsByWeight(starDataTypesMap, starNbrFilters, configFile) 233 | logger.info(s"- Stars weighted (performance + nbr of filters): $starWeights") 234 | 235 | val sortedScoredJoins = pl.reorder(joins, starDataTypesMap, starNbrFilters, starWeights, configFile) 236 | logger.info(s"- Sorted scored joins: $sortedScoredJoins") 237 | val startingJoin = sortedScoredJoins.head 238 | 239 | // Convert starting join to: (leftStar, (rightStar, joinVar)) so we can remove it from $joins 240 | var firstJoin: (String, (String, String)) = null 241 | for (j <- joins.entries.asScala) { 242 | if (j.getKey == startingJoin._1._1 && j.getValue._1 == startingJoin._1._2) { 243 | firstJoin = startingJoin._1._1 -> (startingJoin._1._2, j.getValue._2) 244 | } 245 | } 246 | logger.info(s"- Starting join: $firstJoin") 247 | 248 | finalDataSet = executor.join(joins, prefixes, star_df) 249 | 250 | // finalDataSet.asInstanceOf[DataFrame].printSchema() 251 | 252 | // finalDataSet = executor.joinReordered(joins, prefixes, star_df, firstJoin, starWeights) 253 | } else { 254 | logger.info(s" Single star query") 255 | finalDataSet = star_df.head._2 256 | } 257 | 258 | // Project out columns from the final global join results 259 | var columnNames = Seq[String]() 260 | logger.info(s"--> Needed predicates select: $neededPredicatesSelect") 261 | for (i <- neededPredicatesSelect) { 262 | val star = i._1 263 | val ns_predicate = i._2 264 | val bits = get_NS_predicate(ns_predicate) 
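// Hypothetical example of the flat column naming built just below: star "?product" with predicate
// <http://example.com/vocab#label> and query prefix "ex" becomes the column "product_label_ex",
// matching the aliases generated by Helpers.getSelectColumnsFromSet on the SparkExecutor side.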
265 | 266 | val selected_predicate = omitQuestionMark(star) + "_" + bits._2 + "_" + prefixes(bits._1) 267 | columnNames = columnNames :+ selected_predicate 268 | } 269 | 270 | // Add subjects 271 | for (i <- parsetIDs) { 272 | val star = i._1 273 | val parsetID = i._2 274 | 275 | columnNames = columnNames :+ s"${omitQuestionMark(star)}" 276 | } 277 | 278 | if (groupBys != null) { 279 | logger.info(s"groupBys: $groupBys") 280 | finalDataSet = executor.groupBy(finalDataSet, groupBys) 281 | 282 | // Add aggregation columns to the final project ones 283 | for (gb <- groupBys._2) { 284 | logger.info("-> Add to Project list:" + gb._2) 285 | columnNames = columnNames :+ gb._2 + "(" + gb._1 + ")" 286 | } 287 | } 288 | 289 | // TODO: check the order of PROJECT and ORDER-BY 290 | logger.info(s"SELECTED column names: $columnNames") 291 | 292 | if (orderBys != null) { 293 | logger.info(s"orderBys: $orderBys") 294 | 295 | var orderByList: Set[(String, String)] = Set() 296 | for (o <- orderBys) { 297 | val orderDirection = o._1 298 | val str = variablePredicateStar(o._2)._1 299 | val vr = variablePredicateStar(o._2)._2 300 | val ns_p = get_NS_predicate(vr) 301 | val column = omitQuestionMark(str) + "_" + ns_p._2 + "_" + prefixes(ns_p._1) 302 | orderByList += ((column, orderDirection)) 303 | } 304 | 305 | // TODO: (-1 ASC, -2 DESC) confirm with multiple order-by's 306 | logger.info(s"ORDER BY list: $orderByList (-1 ASC, -2 DESC)") 307 | 308 | for (o <- orderByList) { 309 | val variable = o._1 310 | val direction = o._2 311 | 312 | finalDataSet = executor.orderBy(finalDataSet, direction, variable) 313 | } 314 | } 315 | 316 | logger.info("|__ Has distinct? " + distinct) 317 | finalDataSet = executor.project(finalDataSet, columnNames, distinct) 318 | 319 | if (limit > 0) { 320 | finalDataSet = executor.limit(finalDataSet, limit) 321 | } 322 | 323 | executor.run(finalDataSet) 324 | 325 | finalDataSet.asInstanceOf[DataFrame] 326 | 327 | } catch { 328 | case ex : FileNotFoundException => 329 | println("ERROR: One of input files ins't found (Report it: " + ex + ")") 330 | logger.debug(ex.getStackTrace.toString) 331 | null 332 | 333 | case ex : org.apache.jena.riot.RiotException => 334 | println("ERROR: invalid Mappings. Check syntax. (Report it: " + ex + ")") 335 | logger.debug(ex.getStackTrace.toString) 336 | null 337 | 338 | case ex : org.apache.spark.SparkException => 339 | println("ERROR: invalid Spark Master. (Report it: " + ex + ")") 340 | logger.debug(ex.getStackTrace.toString) 341 | null 342 | 343 | case ex : com.fasterxml.jackson.core.JsonParseException => 344 | println("ERROR: invalid JSON content in config file. (Report it: " + ex + ")") 345 | logger.debug(ex.getStackTrace.toString) 346 | null 347 | 348 | case ex : java.lang.IllegalArgumentException => 349 | println("ERROR: invalid mappings. (Report it: " + ex + ")") 350 | logger.debug(ex.getStackTrace.toString) 351 | null 352 | 353 | case ex : org.apache.jena.query.QueryParseException => 354 | println("ERROR: invalid query. (Report it: " + ex + ")") 355 | logger.debug(ex.getStackTrace.toString) 356 | null 357 | 358 | case ex : com.amazonaws.services.s3.model.AmazonS3Exception => 359 | println(ex.getStackTrace) 360 | println("ERROR: Access to Amazon S3 denied. Check bucket name and key. 
Check you have ~/.aws/credentials file " + 361 | "with the correct content: \n[default]\naws_access_key_id=...\naws_secret_access_key=...") 362 | null 363 | } 364 | } 365 | } 366 | -------------------------------------------------------------------------------- /sansa-datalake/sansa-datalake-spark/src/main/scala/net/sansa_stack/datalake/spark/SparkExecutor.scala: -------------------------------------------------------------------------------- 1 | package net.sansa_stack.datalake.spark 2 | 3 | import java.util 4 | 5 | import com.google.common.collect.ArrayListMultimap 6 | import com.mongodb.spark.config.ReadConfig 7 | import com.typesafe.scalalogging.Logger 8 | import net.sansa_stack.datalake.spark.utils.Helpers._ 9 | import org.apache.spark.sql.{AnalysisException, Column, DataFrame, SparkSession} 10 | import org.apache.spark.sql.functions._ 11 | import org.apache.spark.sql.types.IntegerType 12 | import scala.collection.JavaConverters._ 13 | import scala.collection.immutable.ListMap 14 | import scala.collection.mutable 15 | import scala.collection.mutable.{ArrayBuffer, HashMap, ListBuffer, Set} 16 | 17 | 18 | class SparkExecutor(spark: SparkSession, mappingsFile: String) extends QueryExecutor[DataFrame] { 19 | 20 | override val logger = Logger("SANSA-DataLake") 21 | 22 | def getType: DataFrame = { 23 | val dataframe : DataFrame = null 24 | dataframe 25 | } 26 | 27 | def query(sources : mutable.Set[(mutable.HashMap[String, String], String, String, mutable.HashMap[String, (String, Boolean)])], 28 | optionsMap_entity: mutable.HashMap[String, (Map[String, String], String)], 29 | toJoinWith: Boolean, 30 | star: String, 31 | prefixes: Map[String, String], 32 | select: util.List[String], 33 | star_predicate_var: mutable.HashMap[(String, String), String], 34 | neededPredicates: mutable.Set[String], 35 | filters: ArrayListMultimap[String, (String, String)], 36 | leftJoinTransformations: (String, Array[String]), 37 | rightJoinTransformations: Array[String], 38 | joinPairs: Map[(String, String), String] 39 | ): (DataFrame, Integer, String) = { 40 | 41 | spark.sparkContext.setLogLevel("ERROR") 42 | 43 | var finalDF : DataFrame = null 44 | var dataSource_count = 0 45 | var parSetId = "" // To use when subject (thus ID) is projected out in SELECT 46 | 47 | for (s <- sources) { 48 | logger.info("NEXT SOURCE...") 49 | dataSource_count += 1 // in case of multiple relevant data sources to union 50 | 51 | val attr_predicate = s._1 52 | logger.info("Star: " + star) 53 | logger.info("Attribute_predicate: " + attr_predicate) 54 | val sourcePath = s._2 55 | val sourceType = getTypeFromURI(s._3) 56 | logger.info("sourcePathsourcePath: " + sourcePath) 57 | val options = optionsMap_entity(sourcePath)._1 // entity is not needed here in SparkExecutor 58 | 59 | // TODO: move to another class better 60 | var columns = getSelectColumnsFromSet(attr_predicate, omitQuestionMark(star), prefixes, select, star_predicate_var, neededPredicates, filters) 61 | 62 | val str = omitQuestionMark(star) 63 | 64 | if (select.contains(str)) { 65 | parSetId = getID(sourcePath, mappingsFile) 66 | columns = s"$parSetId AS `$str`, " + columns 67 | } 68 | 69 | logger.info("Relevant source (" + dataSource_count + ") is: [" + sourcePath + "] of type: [" + sourceType + "]") 70 | 71 | logger.info(s"...from which columns ($columns) are going to be projected") 72 | logger.info(s"...with the following configuration options: $options" ) 73 | 74 | if (toJoinWith) { // That kind of table that is the 1st or 2nd operand of a join operation 75 | val id = 
getID(sourcePath, mappingsFile) 76 | logger.info(s"...is to be joined with using the ID: ${str}_id (obtained from subjectMap)") 77 | if (columns == "") { 78 | columns = id + " AS " + str + "_ID" 79 | } else { 80 | columns = columns + "," + id + " AS " + str + "_ID" 81 | } 82 | } 83 | 84 | logger.info("sourceType: " + sourceType) 85 | 86 | var df : DataFrame = null 87 | sourceType match { 88 | case "csv" => df = spark.read.options(options).csv(sourcePath) 89 | case "parquet" => df = spark.read.options(options).parquet(sourcePath) 90 | case "cassandra" => 91 | df = spark.read.format("org.apache.spark.sql.cassandra").options(options).load 92 | // case "elasticsearch" => // Will be enabled again when a Scala 2.12 version is provided. 93 | // df = spark.read.format("org.elasticsearch.spark.sql").options(options).load 94 | case "mongodb" => 95 | // spark.conf.set("spark.mongodb.input.uri", "mongodb://127.0.0.1/test.myCollection") 96 | val values = options.values.toList 97 | val mongoConf = if (values.length == 4) makeMongoURI(values(0), values(1), values(2), values(3)) 98 | else makeMongoURI(values(0), values(1), values(2), null) 99 | val mongoOptions: ReadConfig = ReadConfig(Map("uri" -> mongoConf, "partitioner" -> "MongoPaginateBySizePartitioner")) 100 | df = spark.read.format("com.mongodb.spark.sql").options(mongoOptions.asOptions).load 101 | case "jdbc" => 102 | df = spark.read.format("jdbc").options(options).load() 103 | case "rdf" => 104 | val rdf = new NTtoDF() 105 | df = rdf.options(options.asJava).read(sourcePath, spark).toDF() 106 | case _ => 107 | } 108 | 109 | df.createOrReplaceTempView("table") 110 | try { 111 | val newDF = spark.sql("SELECT " + columns + " FROM table") 112 | 113 | if (dataSource_count == 1) { 114 | finalDF = newDF 115 | } else { 116 | finalDF = finalDF.union(newDF) 117 | } 118 | } catch { 119 | case ae: AnalysisException => val logger = println("ERROR: There is a mismatch between the mappings, query and/or data. " + 120 | "Examples: Check `rr:reference` references a correct attribute, or if you have transformations, " + 121 | "Check `rml:logicalSource` is the same between the TripleMap and the FunctionMap. Check if you are " + 122 | "SELECTing a variable used in the graph patterns. Returned error is:\n" + ae) 123 | System.exit(1) 124 | } 125 | 126 | // Transformations 127 | if (leftJoinTransformations != null && leftJoinTransformations._2 != null) { 128 | val column: String = leftJoinTransformations._1 129 | logger.info("Left Join Transformations: " + column + " - " + leftJoinTransformations._2.mkString(".")) 130 | val ns_pred = get_NS_predicate(column) 131 | val ns = prefixes(ns_pred._1) 132 | val pred = ns_pred._2 133 | val col = str + "_" + pred + "_" + ns 134 | finalDF = transform(finalDF, col, leftJoinTransformations._2) 135 | 136 | } 137 | if (rightJoinTransformations != null && !rightJoinTransformations.isEmpty) { 138 | logger.info("right Join Transformations: " + rightJoinTransformations.mkString("_")) 139 | val col = str + "_ID" 140 | finalDF = transform(finalDF, col, rightJoinTransformations) 141 | } 142 | } 143 | 144 | logger.info("- filters: " + filters + " for star " + star) 145 | 146 | var whereString = "" 147 | 148 | var nbrOfFiltersOfThisStar = 0 149 | 150 | val it = filters.keySet().iterator() 151 | while (it.hasNext) { 152 | val value = it.next() 153 | val predicate = star_predicate_var. 154 | filter(t => t._2 == value). 155 | keys. // To obtain (star, predicate) pairs having as value the FILTER'ed value 156 | filter(t => t._1 == star). 
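// (keeps only this star's predicates whose object variable is the filtered one; e.g., with
//  hypothetical names, a FILTER on ?price for star ?offer resolves to the predicate bound to ?price)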
157 | map(f => f._2).toList 158 | 159 | if (predicate.nonEmpty) { 160 | val ns_p = get_NS_predicate(predicate.head) // Head because only one value is expected to be attached to the same star an same (object) variable 161 | val column = omitQuestionMark(star) + "_" + ns_p._2 + "_" + prefixes(ns_p._1) 162 | logger.info("--- Filter column: " + column) 163 | 164 | nbrOfFiltersOfThisStar = filters.get(value).size() 165 | 166 | val conditions = filters.get(value).iterator() 167 | while (conditions.hasNext) { 168 | val operand_value = conditions.next() 169 | logger.info("--- Operand - Value: " + operand_value) 170 | whereString = column + operand_value._1 + operand_value._2 171 | logger.info("--- WHERE string: " + whereString) 172 | 173 | 174 | if (operand_value._1 != "regex") { 175 | try { 176 | finalDF = finalDF.filter(whereString) 177 | } catch { 178 | case ae: NullPointerException => val logger = println("ERROR: No relevant source detected.") 179 | System.exit(1) 180 | } 181 | } else { 182 | finalDF = finalDF.filter(finalDF(column).like(operand_value._2.replace("\"", ""))) 183 | // regular expression with _ matching an arbitrary character and % matching an arbitrary sequence 184 | } 185 | } 186 | } 187 | } 188 | 189 | logger.info(s"Number of filters of this star is: $nbrOfFiltersOfThisStar") 190 | 191 | (finalDF, nbrOfFiltersOfThisStar, parSetId) 192 | } 193 | 194 | def transform(df: Any, column: String, transformationsArray : Array[String]): DataFrame = { 195 | 196 | var ndf : DataFrame = df.asInstanceOf[DataFrame] 197 | for (t <- transformationsArray) { 198 | logger.info("Transformation next: " + t) 199 | t match { 200 | case "toInt" => 201 | logger.info("TOINT found") 202 | ndf = ndf.withColumn(column, ndf(column).cast(IntegerType)) 203 | // From SO: values not castable will become null 204 | case s if s.contains("scl") => 205 | val scaleValue = s.replace("scl", "").trim.stripPrefix("(").stripSuffix(")") 206 | logger.info("SCL found: " + scaleValue) 207 | val operation = scaleValue.charAt(0) 208 | operation match { 209 | case '+' => ndf = ndf.withColumn(column, ndf(column) + scaleValue.substring(1).toInt) 210 | case '-' => ndf = ndf.withColumn(column, ndf(column) - scaleValue.substring(1).toInt) 211 | case '*' => ndf = ndf.withColumn(column, ndf(column) * scaleValue.substring(1).toInt) 212 | } 213 | case s if s.contains("skp") => 214 | val skipValue = s.replace("skp", "").trim.stripPrefix("(").stripSuffix(")") 215 | logger.info("SKP found: " + skipValue) 216 | ndf = ndf.filter(!ndf(column).equalTo(skipValue)) 217 | case s if s.contains("substit") => 218 | val replaceValues = s.replace("substit", "").trim.stripPrefix("(").stripSuffix(")").split("\\,") 219 | val valToReplace = replaceValues(0) 220 | val valToReplaceWith = replaceValues(1) 221 | logger.info("SUBSTIT found: " + replaceValues.mkString(" -> ")) 222 | ndf = ndf.withColumn(column, when(col(column).equalTo(valToReplace), valToReplaceWith)) 223 | 224 | case s if s.contains("replc") => 225 | val replaceValues = s.replace("replc", "").trim.stripPrefix("(").stripSuffix(")").split("\\,") 226 | val valToReplace = replaceValues(0).replace("\"", "") 227 | val valToReplaceWith = replaceValues(1).replace("\"", "") 228 | logger.info("REPLC found: " + replaceValues.mkString(" -> ") + " on column: " + column) 229 | ndf = ndf.withColumn(column, when(col(column).contains(valToReplace), regexp_replace(ndf(column), valToReplace, valToReplaceWith))) 230 | case s if s.contains("prefix") => 231 | val prefix = s.replace("prfix", 
"").trim.stripPrefix("(").stripSuffix(")") 232 | logger.info("PREFIX found: " + prefix) 233 | ndf = ndf.withColumn(column, concat(lit(prefix), ndf.col(column))) 234 | case s if s.contains("postfix") => 235 | val postfix = s.replace("postfix", "").trim.stripPrefix("(").stripSuffix(")") 236 | logger.info("POSTFIX found: " + postfix) 237 | ndf = ndf.withColumn(column, concat(lit(ndf.col(column), postfix))) 238 | case _ => 239 | } 240 | } 241 | 242 | ndf 243 | } 244 | 245 | def join(joins: ArrayListMultimap[String, (String, String)], prefixes: Map[String, String], star_df: Map[String, DataFrame]): DataFrame = { 246 | import scala.collection.JavaConverters._ 247 | import scala.collection.mutable.ListBuffer 248 | 249 | var pendingJoins = mutable.Queue[(String, (String, String))]() 250 | val seenDF : ListBuffer[(String, String)] = ListBuffer() 251 | var firstTime = true 252 | val join = " x " 253 | var jDF : DataFrame = null 254 | 255 | val it = joins.entries.iterator 256 | while ({it.hasNext}) { 257 | val entry = it.next 258 | 259 | val op1 = entry.getKey 260 | val op2 = entry.getValue._1 261 | val jVal = entry.getValue._2 262 | 263 | logger.info(s"-> GOING TO JOIN ($op1 $join $op2) USING $jVal...") 264 | 265 | val njVal = get_NS_predicate(jVal) 266 | val ns = prefixes(njVal._1) 267 | 268 | it.remove() 269 | 270 | val df1 = star_df(op1) 271 | val df2 = star_df(op2) 272 | 273 | if (firstTime) { // First time look for joins in the join hashmap 274 | logger.info("...that's the FIRST JOIN") 275 | seenDF.asJava.add((op1, jVal)) 276 | seenDF.asJava.add((op2, "ID")) 277 | firstTime = false 278 | 279 | // Join level 1 280 | try { 281 | jDF = df1.join(df2, df1.col(omitQuestionMark(op1) + "_" + omitNamespace(jVal) + "_" + ns).equalTo(df2(omitQuestionMark(op2) + "_ID"))) 282 | logger.info("...done") 283 | } catch { 284 | case ae: NullPointerException => val logger = println("ERROR: No relevant source detected.") 285 | System.exit(1) 286 | } 287 | 288 | } else { 289 | val dfs_only = seenDF.map(_._1) 290 | logger.info(s"EVALUATING NEXT JOIN ...checking prev. 
done joins: $dfs_only") 291 | if (dfs_only.contains(op1) && !dfs_only.contains(op2)) { 292 | logger.info("...we can join (this direction >>)") 293 | 294 | val leftJVar = omitQuestionMark(op1) + "_" + omitNamespace(jVal) + "_" + ns 295 | val rightJVar = omitQuestionMark(op2) + "_ID" 296 | jDF = jDF.join(df2, jDF.col(leftJVar).equalTo(df2.col(rightJVar))) 297 | 298 | seenDF.asJava.add((op2, "ID")) 299 | 300 | 301 | } else if (!dfs_only.contains(op1) && dfs_only.contains(op2)) { 302 | logger.info("...we can join (this direction >>)") 303 | 304 | val leftJVar = omitQuestionMark(op1) + "_" + omitNamespace(jVal) + "_" + ns 305 | val rightJVar = omitQuestionMark(op2) + "_ID" 306 | jDF = df1.join(jDF, df1.col(leftJVar).equalTo(jDF.col(rightJVar))) 307 | 308 | seenDF.asJava.add((op1, jVal)) 309 | 310 | } else if (!dfs_only.contains(op1) && !dfs_only.contains(op2)) { 311 | logger.info("...no join possible -> GOING TO THE QUEUE") 312 | pendingJoins.enqueue((op1, (op2, jVal))) 313 | } 314 | } 315 | } 316 | 317 | while (pendingJoins.nonEmpty) { 318 | logger.info("ENTERED QUEUED AREA: " + pendingJoins) 319 | val dfs_only = seenDF.map(_._1) 320 | 321 | val e = pendingJoins.head 322 | 323 | val op1 = e._1 324 | val op2 = e._2._1 325 | val jVal = e._2._2 326 | 327 | val njVal = get_NS_predicate(jVal) 328 | val ns = prefixes(njVal._1) 329 | 330 | logger.info(s"-> Joining ($op1 $join $op2) using $jVal...") 331 | 332 | val df1 = star_df(op1) 333 | val df2 = star_df(op2) 334 | 335 | if (dfs_only.contains(op1) && !dfs_only.contains(op2)) { 336 | val leftJVar = omitQuestionMark(op1) + "_" + omitNamespace(jVal) + "_" + ns 337 | val rightJVar = omitQuestionMark(op2) + "_ID" 338 | jDF = jDF.join(df2, jDF.col(leftJVar).equalTo(df2.col(rightJVar))) // deep-left 339 | 340 | seenDF.asJava.add((op2, "ID")) 341 | } else if (!dfs_only.contains(op1) && dfs_only.contains(op2)) { 342 | val leftJVar = omitQuestionMark(op1) + "_" + omitNamespace(jVal) + "_" + ns 343 | val rightJVar = omitQuestionMark(op2) + "_ID" 344 | jDF = jDF.join(df1, df1.col(leftJVar).equalTo(jDF.col(rightJVar))) // deep-left 345 | 346 | seenDF.asJava.add((op1, jVal)) 347 | } else if (!dfs_only.contains(op1) && !dfs_only.contains(op2)) { 348 | pendingJoins.enqueue((op1, (op2, jVal))) 349 | } 350 | 351 | pendingJoins = pendingJoins.tail 352 | } 353 | 354 | jDF 355 | } 356 | 357 | def joinReordered(joins: ArrayListMultimap[String, (String, String)], prefixes: Map[String, String], star_df: Map[String, DataFrame], 358 | startingJoin: (String, (String, String)), starWeights: Map[String, Double]): DataFrame = { 359 | import scala.collection.JavaConverters._ 360 | import scala.collection.mutable.ListBuffer 361 | 362 | val seenDF : ListBuffer[(String, String)] = ListBuffer() 363 | val joinSymbol = " x " 364 | var jDF : DataFrame = null 365 | 366 | val op1 = startingJoin._1 367 | val op2 = startingJoin._2._1 368 | val jVal = startingJoin._2._2 369 | val njVal = get_NS_predicate(jVal) 370 | val ns = prefixes(njVal._1) 371 | val df1 = star_df(op1) 372 | val df2 = star_df(op2) 373 | 374 | logger.info(s"-> DOING FIRST JOIN ($op1 $joinSymbol $op2) USING $jVal (namespace: $ns)") 375 | 376 | seenDF.asJava.add((op1, jVal)) 377 | seenDF.asJava.add((op2, "ID")) // TODO: implement join var in the right side too 378 | 379 | // Join level 1 380 | val leftJVar = omitQuestionMark(op1) + "_" + omitNamespace(jVal) + "_" + ns 381 | val rightJVar = omitQuestionMark(op2) + "_ID" 382 | jDF = df1.join(df2, df1.col(leftJVar).equalTo(df2(rightJVar))) 383 | 384 | 
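// After the starting join, the remaining joins are drained iteratively: each pass collects the
// joins that touch an already-joined star, weighs them by the weight of the not-yet-joined side
// (starWeights), and performs them in descending weight order.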
joins.remove(startingJoin._1, (startingJoin._2._1, startingJoin._2._2)) 385 | 386 | logger.info("...done!") 387 | 388 | var joinsMap : Map[(String, String), String] = Map() 389 | for (jj <- joins.entries().asScala) { 390 | joinsMap += (jj.getKey, jj.getValue._1) -> jj.getValue._2 391 | } 392 | val seenDF1 : mutable.Set[(String, String)] = mutable.Set() 393 | for (s <- seenDF) { 394 | seenDF1 += s 395 | } 396 | 397 | logger.info("joinsMap: " + joinsMap) 398 | while(joinsMap.asJava.size() > 0) { 399 | 400 | val dfs_only = seenDF.map(_._1) 401 | logger.info(s"-> Looking for join(s) that join(s) with: $dfs_only") 402 | 403 | var joinable : Map[(String, String), String] = Map.empty // or Map() 404 | 405 | val j = joinsMap.iterator 406 | while ({j.hasNext}) { 407 | val entry = j.next 408 | 409 | val op1 = entry._1._1 410 | val op2 = entry._1._2 411 | val jVal = entry._2 412 | 413 | val njVal = get_NS_predicate(jVal) 414 | val ns = prefixes(njVal._1) 415 | 416 | if (dfs_only.contains(op1) || dfs_only.contains(op2)) { 417 | joinable += ((op1, op2) -> jVal) 418 | joinsMap -= ((op1, op2)) 419 | } 420 | } 421 | 422 | logger.info("Found those: " + joinable) 423 | 424 | var weighedJoins : Map[(String, (String, String), String), Double] = Map() 425 | for(jj <- joinable) { 426 | val op1 = jj._1._1 427 | val op2 = jj._1._2 428 | val jVal = jj._2 429 | 430 | if (dfs_only.contains(op1) && !dfs_only.contains(op2)) { 431 | logger.info(s"...getting weight of join variable $op2") 432 | 433 | weighedJoins += (op1, (op2, jVal), "op2") -> starWeights(op2) 434 | 435 | } else if (!dfs_only.contains(op1) && dfs_only.contains(op2)) { 436 | logger.info(s"...getting weight of join variable $op1") 437 | 438 | weighedJoins += (op1, (op2, jVal), "op1") -> starWeights(op1) 439 | } 440 | } 441 | 442 | // Sort joins by their weight on the joining side 443 | logger.info(s"weighedJoins: $weighedJoins") 444 | 445 | val sortedWeighedJoins = ListMap(weighedJoins.toSeq.sortWith(_._2 > _._2) : _*) 446 | 447 | logger.info(s"sortedWeighedJoins: $sortedWeighedJoins") 448 | 449 | for(s <- sortedWeighedJoins) { 450 | val op1 = s._1._1 451 | val op2 = s._1._2._1 452 | val jVal = s._1._2._2 453 | val njVal = get_NS_predicate(jVal) 454 | val ns = prefixes(njVal._1) 455 | val joinSide = s._1._3 456 | 457 | val df1 = star_df(op1) 458 | val df2 = star_df(op2) 459 | 460 | logger.info(s"---- $op1 -- $op2 -- $joinSide -- $jVal") 461 | 462 | if (joinSide.equals("op2")) { 463 | logger.info("...we can join (this direction >>) ") 464 | 465 | val leftJVar = omitQuestionMark(op1) + "_" + omitNamespace(jVal) + "_" + ns 466 | val rightJVar = omitQuestionMark(op2) + "_ID" 467 | 468 | logger.info(s"$leftJVar XXX $rightJVar") 469 | jDF = jDF.join(df2, jDF.col(leftJVar).equalTo(df2.col(rightJVar))) 470 | 471 | seenDF.asJava.add((op2, "ID")) 472 | } else if (joinSide.equals("op1")) { 473 | logger.info("...we can join (this direction <<) ") 474 | 475 | val leftJVar = omitQuestionMark(op1) + "_" + omitNamespace(jVal) + "_" + ns 476 | val rightJVar = omitQuestionMark(op2) + "_ID" 477 | jDF = df1.join(jDF, df1.col(leftJVar).equalTo(jDF.col(rightJVar))) 478 | 479 | seenDF.asJava.add((op1, jVal)) 480 | } 481 | } 482 | logger.info(s"-> Fully joined: $seenDF \n") 483 | } 484 | 485 | jDF 486 | } 487 | 488 | def project(jDF: Any, columnNames: Seq[String], distinct: Boolean): DataFrame = { 489 | if (!distinct) { 490 | jDF.asInstanceOf[DataFrame].select(columnNames.head, columnNames.tail : _*) 491 | } else { 492 | jDF.asInstanceOf[DataFrame].select(columnNames.head, 
columnNames.tail : _*).distinct() 493 | } 494 | } 495 | 496 | def schemaOf(jDF: DataFrame): Unit = { 497 | jDF.printSchema() 498 | } 499 | 500 | def count(jDF: DataFrame): Long = { 501 | jDF.count() 502 | } 503 | 504 | def orderBy(jDF: Any, direction: String, variable: String): DataFrame = { 505 | logger.info("ORDERING...") 506 | 507 | if (direction == "-1") { 508 | jDF.asInstanceOf[DataFrame].orderBy(asc(variable)) 509 | } else { // TODO: assuming the other case is automatically -1 IFNOT change to "else if (direction == "-2") {" 510 | jDF.asInstanceOf[DataFrame].orderBy(desc(variable)) 511 | } 512 | } 513 | 514 | def groupBy(jDF: Any, groupBys: (ListBuffer[String], mutable.Set[(String, String)])): DataFrame = { 515 | 516 | val groupByVars = groupBys._1 517 | val aggregationFunctions = groupBys._2 518 | 519 | val cols : ListBuffer[Column] = ListBuffer() 520 | for (gbv <- groupByVars) { 521 | cols += col(gbv) 522 | } 523 | logger.info("aggregationFunctions: " + aggregationFunctions) 524 | 525 | var aggSet : mutable.Set[(String, String)] = mutable.Set() 526 | for (af <- aggregationFunctions) { 527 | aggSet += ((af._1, af._2)) 528 | } 529 | val aa = aggSet.toList 530 | val newJDF : DataFrame = jDF.asInstanceOf[DataFrame].groupBy(cols: _*).agg(aa.head, aa.tail : _*) 531 | 532 | newJDF.printSchema() 533 | 534 | newJDF 535 | } 536 | 537 | def limit(jDF: Any, limitValue: Int) : DataFrame = jDF.asInstanceOf[DataFrame].limit(limitValue) 538 | 539 | def show(jDF: Any): Unit = { 540 | val columns = ArrayBuffer[String]() 541 | // jDF.asInstanceOf[DataFrame].show 542 | val df = jDF.asInstanceOf[DataFrame] 543 | // df.printSchema() 544 | val schema = df.schema 545 | for (col <- schema) 546 | columns += col.name 547 | 548 | println(columns.mkString(",")) 549 | df.take(20).foreach(x => println(x)) 550 | 551 | println(s"Number of results: ${jDF.asInstanceOf[DataFrame].count()}") 552 | } 553 | 554 | def run(jDF: Any): Unit = { 555 | this.show(jDF) 556 | } 557 | } 558 | -------------------------------------------------------------------------------- /sansa-datalake/sansa-datalake-spark/src/main/scala/net/sansa_stack/datalake/spark/utils/Helpers.scala: -------------------------------------------------------------------------------- 1 | package net.sansa_stack.datalake.spark.utils 2 | 3 | import java.io.ByteArrayInputStream 4 | import java.net.URI 5 | import java.util 6 | 7 | import scala.collection.mutable 8 | 9 | import com.amazonaws.services.s3.AmazonS3ClientBuilder 10 | import com.google.common.collect.ArrayListMultimap 11 | import com.typesafe.scalalogging.Logger 12 | import org.apache.jena.query.{QueryExecutionFactory, QueryFactory} 13 | import org.apache.jena.rdf.model.ModelFactory 14 | 15 | /** 16 | * Created by mmami on 26.07.17. 
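 *
 * String/URI utilities shared across the query engine (Planner, QueryAnalyser, Run, SparkExecutor).
 * Hypothetical examples: omitNamespace("<http://example.com/vocab#label>") returns "label", and
 * get_NS_predicate("<http://example.com/vocab#label>") returns ("http://example.com/vocab#", "label").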
17 | */ 18 | object Helpers { 19 | 20 | val logger: Logger = Logger("SANSA-DataLake") 21 | 22 | def invertMap(prolog: util.Map[String, String]): Map[String, String] = { 23 | var star_df : Map[String, String] = Map.empty 24 | 25 | val keys = prolog.keySet() 26 | val it = keys.iterator() 27 | while(it.hasNext) { 28 | val key : String = it.next() 29 | star_df += (prolog.get(key) -> key) 30 | } 31 | 32 | star_df 33 | } 34 | 35 | def omitQuestionMark(str: String): String = str.replace("?", "") 36 | 37 | 38 | def omitNamespace(URI: String): String = { 39 | val URIBits = URI.replace("<", "").replace(">", "").replace("#", "/").split("/") 40 | URIBits(URIBits.length-1) 41 | } 42 | 43 | def getNamespaceFromURI(URI: String): String = { 44 | "" // TODO: create 45 | } 46 | 47 | def get_NS_predicate(predicateURI: String): (String, String) = { 48 | 49 | val url = predicateURI.replace("<", "").replace(">", "") 50 | val URIBits = url.split("/") 51 | 52 | var pred = "" 53 | if(predicateURI.contains("#")) { 54 | pred = URIBits(URIBits.length-1).split("#")(1) // like: http://www.w3.org/2000/01/[rdf-schema#label] 55 | } else { 56 | pred = URIBits(URIBits.length-1) 57 | } 58 | 59 | val ns = url.replace(pred, "") 60 | 61 | (ns, pred) 62 | } 63 | 64 | def getTypeFromURI(typeURI: String) : String = { 65 | val dataType = typeURI.split("#") // from nosql ns 66 | 67 | val rtrn = dataType(dataType.length-1) 68 | 69 | rtrn 70 | } 71 | 72 | def getSelectColumnsFromSet(pred_attr: mutable.HashMap[String, String], 73 | star: String, 74 | prefixes: Map[String, String], 75 | select: util.List[String], 76 | star_predicate_var: mutable.HashMap[(String, String), String], 77 | neededPredicates: mutable.Set[String], 78 | filters: ArrayListMultimap[String, (String, String)] 79 | ): String = { 80 | 81 | var columns = "" 82 | var i = 0 83 | 84 | for (v <- pred_attr) { 85 | val attr = v._2 86 | val ns_predicate = Helpers.get_NS_predicate(v._1) 87 | 88 | val ns_predicate_bits = ns_predicate 89 | val NS = ns_predicate_bits._1 90 | val predicate = ns_predicate_bits._2 91 | 92 | val objVar = star_predicate_var(("?" + star, "<" + NS + predicate + ">")) 93 | 94 | logger.info("-> Variable: " + objVar + " exists in WHERE, is it in SELECT? " + select.contains(objVar.replace("?", ""))) 95 | 96 | if (neededPredicates.contains(v._1)) { 97 | val c = " `" + attr + "` AS `" + star + "_" + predicate + "_" + prefixes(NS) + "`" 98 | 99 | if (i == 0) columns += c else columns += "," + c 100 | i += 1 101 | } 102 | 103 | if (filters.keySet().contains(objVar)) { 104 | val c = " `" + attr + "` AS `" + star + "_" + predicate + "_" + prefixes(NS) + "`" 105 | 106 | if (!columns.contains(c)) { // if the column has already been added from the SELECT predicates 107 | if (i == 0) columns += c else columns += "," + c 108 | i += 1 109 | } 110 | 111 | } 112 | } 113 | 114 | columns 115 | } 116 | 117 | def getSelectColumnsFromSet(pred_attr: mutable.HashMap[String, String], 118 | star: String, 119 | prefixes: Map[String, String], 120 | select: util.List[String], 121 | star_predicate_var: mutable.HashMap[(String, String), String], 122 | neededPredicates: mutable.Set[String] 123 | ): String = { 124 | 125 | var columns = "" 126 | var i = 0 127 | 128 | for (v <- pred_attr) { 129 | val attr = v._2 130 | val ns_predicate = Helpers.get_NS_predicate(v._1) 131 | 132 | val ns_predicate_bits = ns_predicate 133 | val NS = ns_predicate_bits._1 134 | val predicate = ns_predicate_bits._2 135 | 136 | val objVar = star_predicate_var(("?" 
+ star, "<" + NS + predicate + ">")) 137 | 138 | logger.info("-> Variable: " + objVar + " exists in WHERE, is it in SELECT? " + select.contains(objVar.replace("?", ""))) 139 | 140 | if (neededPredicates.contains(v._1)) { 141 | val c = attr + " AS `" + star + "_" + predicate + "_" + prefixes(NS) + "`" 142 | if (i == 0) columns += c else columns += "," + c 143 | i += 1 144 | } 145 | } 146 | 147 | columns 148 | } 149 | 150 | def getID(sourcePath: String, mappingsFile: String): String = { 151 | 152 | val queryStr = "PREFIX rml: " + 153 | "PREFIX rr: " + 154 | "PREFIX foaf: " + 155 | "SELECT ?t WHERE {" + 156 | "?mp rml:logicalSource ?ls . " + 157 | "?ls rml:source \"" + sourcePath + "\" . " + 158 | "?mp rr:subjectMap ?sm . " + 159 | "?sm rr:template ?t " + 160 | "}" 161 | 162 | val mappingsString = readFileFromPath(mappingsFile) 163 | 164 | val in = new ByteArrayInputStream(mappingsString.getBytes) 165 | 166 | val model = ModelFactory.createDefaultModel() 167 | model.read(in, null, "TURTLE") 168 | 169 | var id = "" 170 | 171 | val query = QueryFactory.create(queryStr) 172 | val qe = QueryExecutionFactory.create(query, model) 173 | val rs = qe.execSelect() 174 | if (rs.hasNext) { 175 | val qs = rs.next() 176 | val template = qs.get("t").toString 177 | 178 | val templateBits = template.split("/") 179 | id = templateBits(templateBits.length-1).replace("{", "").replace("}", "") 180 | } 181 | qe.close() 182 | model.close() 183 | 184 | id 185 | } 186 | 187 | def makeMongoURI(uri: String, database: String, collection: String, options: String): String = { 188 | if (options == null) { 189 | s"mongodb://$uri/$database.$collection" 190 | } else { 191 | s"mongodb://$uri/$database.$collection?$options" 192 | } 193 | // mongodb://db1.example.net,db2.example.net:27002,db3.example.net:27003/?db_name&replicaSet=YourReplicaSetName 194 | // mongodb://172.18.160.16,172.18.160.17,172.18.160.18/db.offer?replicaSet=mongo-rs 195 | } 196 | 197 | def getFunctionFromURI(URI: String): String = { 198 | val functionName = URI match { 199 | case "http://users.ugent.be/~bjdmeest/function/grel.ttl#scale" => "scl" 200 | case "http://users.ugent.be/~bjdmeest/function/grel.ttl#substitute" => "substit" 201 | case "http://users.ugent.be/~bjdmeest/function/grel.ttl#skip" => "skp" 202 | case "http://users.ugent.be/~bjdmeest/function/grel.ttl#replace" => "replc" 203 | case "http://users.ugent.be/~bjdmeest/function/grel.ttl#prefix" => "prefix" 204 | case "http://users.ugent.be/~bjdmeest/function/grel.ttl#postfix" => "postfix" 205 | case "http://users.ugent.be/~bjdmeest/function/grel.ttl#toInt" => "toInt" 206 | case _ => "" 207 | } 208 | 209 | functionName 210 | } 211 | 212 | /** 213 | * Reads file from path and returns its content as string. 
214 |    * Supported protocols: hdfs, s3, file
215 |    *
216 |    * @param path the path to the file
217 |    * @return the content as a string
218 |    */
219 |   def readFileFromPath(path: String): String = {
220 |     val uri = URI.create(path)
221 | 
222 |     val scheme = uri.getScheme
223 | 
224 |     val source = scheme match {
225 |       case "hdfs" =>
226 |         val hdfs = org.apache.hadoop.fs.FileSystem.get(uri, new org.apache.hadoop.conf.Configuration())
227 |         val hdfsPath = new org.apache.hadoop.fs.Path(uri)
228 | 
229 |         scala.io.Source.fromInputStream(hdfs.open(hdfsPath))
230 |       case "s3" =>
231 |         val bucket_key = path.replace("s3://", "").split("/")
232 |         val bucket = bucket_key.apply(0) // .apply(0) is the same as (0): the bucket name
233 |         val key = if (bucket_key.length > 2) bucket_key.slice(1, bucket_key.length).mkString("/") else bucket_key(1) // the object key may span several path segments (folder case)
234 | 
235 |         import com.amazonaws.services.s3.model.GetObjectRequest
236 | 
237 |         val s3 = AmazonS3ClientBuilder.standard()
238 |           .withRegion("us-east-1")
239 |           .withForceGlobalBucketAccessEnabled(true)
240 |           .build()
241 | 
242 |         val s3object = s3.getObject(new GetObjectRequest(bucket, key))
243 | 
244 |         scala.io.Source.fromInputStream(s3object.getObjectContent)
245 |       case "file" | null => // from local file system
246 |         scala.io.Source.fromFile(path)
247 |       case _ => throw new IllegalArgumentException(s"unsupported path (only s3, hdfs, and file are currently supported): $path")
248 |     }
249 | 
250 |     val content = source.mkString
251 |     source.close()
252 | 
253 |     content
254 |   }
255 | 
256 |   def main(args: Array[String]): Unit = {
257 |     println(Helpers.readFileFromPath("/tmp/flight.owl"))
258 |     println(Helpers.readFileFromPath("s3://sansa-datalake/Q1.sparql"))
259 |     println(Helpers.readFileFromPath("hdfs://localhost:8080/tmp/foo.bar"))
260 |   }
261 | 
262 | }
263 | 
--------------------------------------------------------------------------------
/sansa-datalake/scalastyle-config.xml:
--------------------------------------------------------------------------------
[Scalastyle standard configuration — the XML rule markup of this file is not recoverable here. The surviving rule text covers whitespace around tokens (ARROW, EQUALS, COMMA, COLON, ...), banned constructs (Runtime.getRuntime.addShutdownHook, mutable.SynchronizedBuffer, Class.forName, Await.result, scala.collection.JavaConversions, org.apache.commons.lang, .extractOpt), and the import-order groups java, scala, 3rdParty, sansa.]
--------------------------------------------------------------------------------
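
For orientation, here is a minimal usage sketch (not part of the repository) of the Helpers utilities shown above. It assumes the package net.sansa_stack.datalake.spark.utils implied by the source tree, and the literal URIs and connection strings are hypothetical examples.

import net.sansa_stack.datalake.spark.utils.Helpers

object HelpersUsageSketch {
  def main(args: Array[String]): Unit = {
    // Drop the namespace part of a URI -> prints "label"
    println(Helpers.omitNamespace("<http://www.w3.org/2000/01/rdf-schema#label>"))

    // Split a predicate URI into (namespace, local name)
    // -> prints (http://xmlns.com/foaf/0.1/,name)
    println(Helpers.get_NS_predicate("<http://xmlns.com/foaf/0.1/name>"))

    // Build MongoDB connection URIs, without and with extra options
    // -> mongodb://localhost:27017/shop.offer
    println(Helpers.makeMongoURI("localhost:27017", "shop", "offer", null))
    // -> mongodb://localhost:27017/shop.offer?replicaSet=mongo-rs
    println(Helpers.makeMongoURI("localhost:27017", "shop", "offer", "replicaSet=mongo-rs"))
  }
}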