└── sansa-datalake
    ├── .gitignore
    ├── .travis.yml
    ├── LICENSE
    ├── README.md
    ├── pom.xml
    ├── sansa-datalake-spark
    │   ├── .gitignore
    │   ├── pom.xml
    │   └── src
    │       └── main
    │           ├── java
    │           │   └── net
    │           │       └── sansa_stack
    │           │           └── datalake
    │           │               └── spark
    │           │                   ├── NTtoDF.java
    │           │                   └── model
    │           │                       └── Triple.java
    │           ├── resources
    │           │   └── log4j.properties
    │           └── scala
    │               └── net
    │                   └── sansa_stack
    │                       └── datalake
    │                           └── spark
    │                               ├── Config.scala
    │                               ├── Main.scala
    │                               ├── Mapper.scala
    │                               ├── Planner.scala
    │                               ├── QueryAnalyser.scala
    │                               ├── QueryExecutor.scala
    │                               ├── Run.scala
    │                               ├── SparkExecutor.scala
    │                               └── utils
    │                                   └── Helpers.scala
    └── scalastyle-config.xml
/sansa-datalake/.gitignore:
--------------------------------------------------------------------------------
1 | target/
2 | pom.xml.tag
3 | pom.xml.releaseBackup
4 | pom.xml.versionsBackup
5 | pom.xml.next
6 | release.properties
7 | dependency-reduced-pom.xml
8 | buildNumber.properties
9 | .mvn/timing.properties
10 | # eclipse conf file
11 | .settings
12 | .classpath
13 | .project
14 | .manager
15 | .scala_dependencies
16 | .cashe
17 | .cache-main
18 | .cache-tests
19 | .classpath
20 | #.coveralls.yml
21 | deptree.txt
22 | # IntelliJ config
23 | *.iml
24 | .idea
25 | /bin
26 |
27 | # filename i use to store output of mvn dependency:tree ~Claus
28 | deptree.txt
29 | # local project specific tmp folder
30 | tmp
31 |
32 | scalastyle-output.xml
33 |
--------------------------------------------------------------------------------
/sansa-datalake/.travis.yml:
--------------------------------------------------------------------------------
1 | language: scala
2 | sudo: false
3 | cache:
4 | directories:
5 | - $HOME/.m2
6 | scala:
7 | - 2.12.11
8 | jdk:
9 | - openjdk8
10 | script:
11 | - mvn scalastyle:check
12 | - mvn test
13 |
--------------------------------------------------------------------------------
/sansa-datalake/LICENSE:
--------------------------------------------------------------------------------
1 | Apache License
2 | Version 2.0, January 2004
3 | http://www.apache.org/licenses/
4 |
5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
6 |
7 | 1. Definitions.
8 |
9 | "License" shall mean the terms and conditions for use, reproduction,
10 | and distribution as defined by Sections 1 through 9 of this document.
11 |
12 | "Licensor" shall mean the copyright owner or entity authorized by
13 | the copyright owner that is granting the License.
14 |
15 | "Legal Entity" shall mean the union of the acting entity and all
16 | other entities that control, are controlled by, or are under common
17 | control with that entity. For the purposes of this definition,
18 | "control" means (i) the power, direct or indirect, to cause the
19 | direction or management of such entity, whether by contract or
20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the
21 | outstanding shares, or (iii) beneficial ownership of such entity.
22 |
23 | "You" (or "Your") shall mean an individual or Legal Entity
24 | exercising permissions granted by this License.
25 |
26 | "Source" form shall mean the preferred form for making modifications,
27 | including but not limited to software source code, documentation
28 | source, and configuration files.
29 |
30 | "Object" form shall mean any form resulting from mechanical
31 | transformation or translation of a Source form, including but
32 | not limited to compiled object code, generated documentation,
33 | and conversions to other media types.
34 |
35 | "Work" shall mean the work of authorship, whether in Source or
36 | Object form, made available under the License, as indicated by a
37 | copyright notice that is included in or attached to the work
38 | (an example is provided in the Appendix below).
39 |
40 | "Derivative Works" shall mean any work, whether in Source or Object
41 | form, that is based on (or derived from) the Work and for which the
42 | editorial revisions, annotations, elaborations, or other modifications
43 | represent, as a whole, an original work of authorship. For the purposes
44 | of this License, Derivative Works shall not include works that remain
45 | separable from, or merely link (or bind by name) to the interfaces of,
46 | the Work and Derivative Works thereof.
47 |
48 | "Contribution" shall mean any work of authorship, including
49 | the original version of the Work and any modifications or additions
50 | to that Work or Derivative Works thereof, that is intentionally
51 | submitted to Licensor for inclusion in the Work by the copyright owner
52 | or by an individual or Legal Entity authorized to submit on behalf of
53 | the copyright owner. For the purposes of this definition, "submitted"
54 | means any form of electronic, verbal, or written communication sent
55 | to the Licensor or its representatives, including but not limited to
56 | communication on electronic mailing lists, source code control systems,
57 | and issue tracking systems that are managed by, or on behalf of, the
58 | Licensor for the purpose of discussing and improving the Work, but
59 | excluding communication that is conspicuously marked or otherwise
60 | designated in writing by the copyright owner as "Not a Contribution."
61 |
62 | "Contributor" shall mean Licensor and any individual or Legal Entity
63 | on behalf of whom a Contribution has been received by Licensor and
64 | subsequently incorporated within the Work.
65 |
66 | 2. Grant of Copyright License. Subject to the terms and conditions of
67 | this License, each Contributor hereby grants to You a perpetual,
68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable
69 | copyright license to reproduce, prepare Derivative Works of,
70 | publicly display, publicly perform, sublicense, and distribute the
71 | Work and such Derivative Works in Source or Object form.
72 |
73 | 3. Grant of Patent License. Subject to the terms and conditions of
74 | this License, each Contributor hereby grants to You a perpetual,
75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable
76 | (except as stated in this section) patent license to make, have made,
77 | use, offer to sell, sell, import, and otherwise transfer the Work,
78 | where such license applies only to those patent claims licensable
79 | by such Contributor that are necessarily infringed by their
80 | Contribution(s) alone or by combination of their Contribution(s)
81 | with the Work to which such Contribution(s) was submitted. If You
82 | institute patent litigation against any entity (including a
83 | cross-claim or counterclaim in a lawsuit) alleging that the Work
84 | or a Contribution incorporated within the Work constitutes direct
85 | or contributory patent infringement, then any patent licenses
86 | granted to You under this License for that Work shall terminate
87 | as of the date such litigation is filed.
88 |
89 | 4. Redistribution. You may reproduce and distribute copies of the
90 | Work or Derivative Works thereof in any medium, with or without
91 | modifications, and in Source or Object form, provided that You
92 | meet the following conditions:
93 |
94 | (a) You must give any other recipients of the Work or
95 | Derivative Works a copy of this License; and
96 |
97 | (b) You must cause any modified files to carry prominent notices
98 | stating that You changed the files; and
99 |
100 | (c) You must retain, in the Source form of any Derivative Works
101 | that You distribute, all copyright, patent, trademark, and
102 | attribution notices from the Source form of the Work,
103 | excluding those notices that do not pertain to any part of
104 | the Derivative Works; and
105 |
106 | (d) If the Work includes a "NOTICE" text file as part of its
107 | distribution, then any Derivative Works that You distribute must
108 | include a readable copy of the attribution notices contained
109 | within such NOTICE file, excluding those notices that do not
110 | pertain to any part of the Derivative Works, in at least one
111 | of the following places: within a NOTICE text file distributed
112 | as part of the Derivative Works; within the Source form or
113 | documentation, if provided along with the Derivative Works; or,
114 | within a display generated by the Derivative Works, if and
115 | wherever such third-party notices normally appear. The contents
116 | of the NOTICE file are for informational purposes only and
117 | do not modify the License. You may add Your own attribution
118 | notices within Derivative Works that You distribute, alongside
119 | or as an addendum to the NOTICE text from the Work, provided
120 | that such additional attribution notices cannot be construed
121 | as modifying the License.
122 |
123 | You may add Your own copyright statement to Your modifications and
124 | may provide additional or different license terms and conditions
125 | for use, reproduction, or distribution of Your modifications, or
126 | for any such Derivative Works as a whole, provided Your use,
127 | reproduction, and distribution of the Work otherwise complies with
128 | the conditions stated in this License.
129 |
130 | 5. Submission of Contributions. Unless You explicitly state otherwise,
131 | any Contribution intentionally submitted for inclusion in the Work
132 | by You to the Licensor shall be under the terms and conditions of
133 | this License, without any additional terms or conditions.
134 | Notwithstanding the above, nothing herein shall supersede or modify
135 | the terms of any separate license agreement you may have executed
136 | with Licensor regarding such Contributions.
137 |
138 | 6. Trademarks. This License does not grant permission to use the trade
139 | names, trademarks, service marks, or product names of the Licensor,
140 | except as required for reasonable and customary use in describing the
141 | origin of the Work and reproducing the content of the NOTICE file.
142 |
143 | 7. Disclaimer of Warranty. Unless required by applicable law or
144 | agreed to in writing, Licensor provides the Work (and each
145 | Contributor provides its Contributions) on an "AS IS" BASIS,
146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
147 | implied, including, without limitation, any warranties or conditions
148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
149 | PARTICULAR PURPOSE. You are solely responsible for determining the
150 | appropriateness of using or redistributing the Work and assume any
151 | risks associated with Your exercise of permissions under this License.
152 |
153 | 8. Limitation of Liability. In no event and under no legal theory,
154 | whether in tort (including negligence), contract, or otherwise,
155 | unless required by applicable law (such as deliberate and grossly
156 | negligent acts) or agreed to in writing, shall any Contributor be
157 | liable to You for damages, including any direct, indirect, special,
158 | incidental, or consequential damages of any character arising as a
159 | result of this License or out of the use or inability to use the
160 | Work (including but not limited to damages for loss of goodwill,
161 | work stoppage, computer failure or malfunction, or any and all
162 | other commercial damages or losses), even if such Contributor
163 | has been advised of the possibility of such damages.
164 |
165 | 9. Accepting Warranty or Additional Liability. While redistributing
166 | the Work or Derivative Works thereof, You may choose to offer,
167 | and charge a fee for, acceptance of support, warranty, indemnity,
168 | or other liability obligations and/or rights consistent with this
169 | License. However, in accepting such obligations, You may act only
170 | on Your own behalf and on Your sole responsibility, not on behalf
171 | of any other Contributor, and only if You agree to indemnify,
172 | defend, and hold each Contributor harmless for any liability
173 | incurred by, or claims asserted against, such Contributor by reason
174 | of your accepting any such warranty or additional liability.
175 |
176 | END OF TERMS AND CONDITIONS
177 |
178 | APPENDIX: How to apply the Apache License to your work.
179 |
180 | To apply the Apache License to your work, attach the following
181 | boilerplate notice, with the fields enclosed by brackets "[]"
182 | replaced with your own identifying information. (Don't include
183 | the brackets!) The text should be enclosed in the appropriate
184 | comment syntax for the file format. We also recommend that a
185 | file or class name and description of purpose be included on the
186 | same "printed page" as the copyright notice for easier
187 | identification within third-party archives.
188 |
189 | Copyright [yyyy] [name of copyright owner]
190 |
191 | Licensed under the Apache License, Version 2.0 (the "License");
192 | you may not use this file except in compliance with the License.
193 | You may obtain a copy of the License at
194 |
195 | http://www.apache.org/licenses/LICENSE-2.0
196 |
197 | Unless required by applicable law or agreed to in writing, software
198 | distributed under the License is distributed on an "AS IS" BASIS,
199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
200 | See the License for the specific language governing permissions and
201 | limitations under the License.
202 |
--------------------------------------------------------------------------------
/sansa-datalake/README.md:
--------------------------------------------------------------------------------
1 | # DataLake
2 | [Maven Central](https://maven-badges.herokuapp.com/maven-central/net.sansa-stack/sansa-datalake-parent_2.11)
3 | [Build Status](https://ci.aksw.org/jenkins/job/SANSA-DataLake//job/master/)
4 | [License: Apache 2.0](https://opensource.org/licenses/Apache-2.0)
5 | [Twitter](https://twitter.com/SANSA_Stack)
6 |
7 | A library to query heterogeneous data sources uniformly using SPARQL.
8 |
9 | ## Description
10 | ### Data Lake
11 | The term Data Lake denotes a schema-less repository of data residing in its original format and form. As such, there is not a single point of entry to the Data Lake, as data in its diversity has various schemata, query interfaces and languages.
12 |
13 | ### _Semantic_ Data Lake
14 | The _Semantic Data Lake_ is an effort to enable querying this wealth of heterogeneous data using Semantic Web principles: a mapping language and the SPARQL query language. This supplies the Data Lake with a schema and provides a single entry point, a SPARQL query, to the various heterogeneous data. Before a data source can be queried, it first needs to be connected to.
15 |
16 | That said, to query the Data Lake using the _Semantic Data Lake_ approach, users need to provide three inputs: (1) a mappings file, (2) a config file, and (3) a SPARQL query, described in the next three sections.
17 |
18 | ### 1. Mapping Language and Data Lake Schema
19 | A virtual schema is added to the Data Lake by _mapping_ data elements, e.g., tables and attributes, to ontology concepts, e.g., classes and predicates. We use [RML](http://rml.io/) mappings to express those schema mapping links.
20 |
21 | An example of such mappings is given below. It maps a collection named _Offer_ (`rml:source "//Offer"`) in a MongoDB database to the ontology class `schema:Offer` (`rr:class schema:Offer`), meaning that every document in the Offer collection is of type `schema:Offer`. The mappings also link the MongoDB collection fields `validTo`, `publisher` and `producer` to the ontology predicates `bsbm:validTo`, `dc:publisher` and `bsbm:producer`, respectively. The `_id` field in the `rr:subjectMap rr:template "http://example.com/{_id}"` triple points to the primary key of the MongoDB collection.
22 |
23 | ```
24 | <#OfferMapping>
25 | rml:logicalSource [
26 | rml:source "//Offer";
27 | nosql:store nosql:Mongodb
28 | ];
29 | rr:subjectMap [
30 | rr:template "http://example.com/{_id}";
31 | rr:class schema:Offer
32 | ];
33 |
34 | rr:predicateObjectMap [
35 | rr:predicate bsbm:validTo;
36 | rr:objectMap [rml:reference "validTo"]
37 | ];
38 |
39 | rr:predicateObjectMap [
40 | rr:predicate dc:publisher;
41 | rr:objectMap [rml:reference "publisher"]
42 | ];
43 |
44 | rr:predicateObjectMap [
45 | rr:predicate bsbm:producer;
46 | rr:objectMap [rml:reference "producer"]
47 | ];
48 | ```
49 |
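For reference, the snippet above assumes prefix declarations along the following lines; the namespace IRIs shown here are the commonly used ones and may differ in your mappings file.

```
@prefix rml:    <http://semweb.mmlab.be/ns/rml#> .
@prefix rr:     <http://www.w3.org/ns/r2rml#> .
@prefix nosql:  <http://purl.org/db/nosql#> .
@prefix dc:     <http://purl.org/dc/elements/1.1/> .
@prefix schema: <http://schema.org/> .
@prefix bsbm:   <http://www4.wiwiss.fu-berlin.de/bizer/bsbm/v01/vocabulary/> .
```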
50 | Note the triple `nosql:store nosql:Mongodb`: it is an addition to RML mappings from the [NoSQL ontology](http://purl.org/db/nosql#) that allows stating what type of data source is being mapped.
51 |
52 | _The mappings file can be created either manually or with the following graphical utility: [Squerall-GUI](https://github.com/EIS-Bonn/Squerall-GUI)_.
53 |
54 | ### 2. Data Connection Configurations
55 | In order to connect to a data source, users need to provide a set of config parameters in JSON format. These differ from one data source to another; for example, for a MongoDB collection the config parameters could be: database host URL, database name, collection name, and replica set name.
56 |
57 | ```JSON
58 | {
59 | "type": "mongodb",
60 | "options": {
61 | "url": "127.0.0.1",
62 | "database": "bsbm",
63 | "collection": "offer",
64 | "options": "replicaSet=mongo-rs"
65 | },
66 | "source": "//Offer",
67 | "entity": "Offer"
68 | }
69 | ```
70 |
71 | It is necessary to link the configured source (`"source": "//Offer"`) to the mapped source (`rml:logicalSource rml:source "//Offer"`, see the Mapping section above).
72 |
73 | _The config file can be created either manually or with the following graphical utility: [Squerall-GUI](https://github.com/EIS-Bonn/Squerall-GUI)_.
74 |
75 | ### 3. SPARQL Query Interface
76 | SPARQL queries are expressed using the ontology terms the data was previously mapped to. Queries should conform to the currently supported SPARQL fragment:
77 |
78 | ```SPARQL
79 | Query := Prefix* SELECT Distinguish WHERE { Clauses } Modifiers?
80 | Prefix := PREFIX "string:" IRI
81 | Distinguish := DISTINCT? ("*"|(Var|Aggregate)+)
82 | Aggregate := (AggOpe(Var) AS Var)
83 | AggOpe := SUM|MIN|MAX|AVG|COUNT
84 | Clauses := TP* Filter?
85 | Filter := FILTER (Var FiltOpe Literal)
86 |          | FILTER regex(Var, "%string%")
87 | FiltOpe := =|!=|<|<=|>|>=
88 | TP := Var IRI Var . | Var rdf:type IRI .
89 | Var := "?string"
90 | Modifiers := (LIMIT k)? (ORDER BY (ASC|DESC)? Var)? (GROUP BY Var+)?
91 | ```
92 |
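For illustration, a query over the `Offer` mapping shown earlier could look as follows. The query is indicative only; the prefix IRIs must match the ones used in your mappings.

```SPARQL
PREFIX rdf:    <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
PREFIX dc:     <http://purl.org/dc/elements/1.1/>
PREFIX schema: <http://schema.org/>
PREFIX bsbm:   <http://www4.wiwiss.fu-berlin.de/bizer/bsbm/v01/vocabulary/>

SELECT DISTINCT ?offer ?publisher ?validTo
WHERE {
    ?offer rdf:type schema:Offer .
    ?offer dc:publisher ?publisher .
    ?offer bsbm:validTo ?validTo .
}
LIMIT 10
```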
93 | ### File Storage format
94 | The previous three files can be stored locally, in HDFS, or in an AWS S3 bucket. For the latter, make sure to have your credentials ([see](https://docs.aws.amazon.com/sdk-for-java/v1/developer-guide/setup-credentials.html)) stored in ~/.aws/credentials (C:\Users\USERNAME\.aws\credentials on Windows), in the following form:
95 | ```
96 | [default]
97 | aws_access_key_id=...
98 | aws_secret_access_key=...
99 | ```
100 |
101 | ## Usage
102 | The usage of the Semantic Data Lake is documented under the respective SANSA-Query [datalake component](https://github.com/SANSA-Stack/SANSA-Query/tree/develop/sansa-query-spark/src/main/scala/net/sansa_stack/query/spark/datalake).
103 |
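For orientation, a minimal sketch of issuing a query from Spark is shown below. The import path and the `sparqlDL` method are assumptions based on the linked SANSA-Query documentation and may differ between SANSA versions, so treat the linked datalake component as the authoritative reference.

```scala
import org.apache.spark.sql.{DataFrame, SparkSession}

// Assumption: SANSA-Query exposes a sparqlDL extension method on SparkSession via this import;
// check the linked datalake component for the exact API of your SANSA version.
import net.sansa_stack.query.spark.query._

object SemanticDataLakeExample {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().appName("Semantic Data Lake example").getOrCreate()

    // Placeholder paths: the mappings file (Section 1) and config file (Section 2)
    val mappings = "hdfs://namenode:8020/datalake/mappings.ttl"
    val config   = "hdfs://namenode:8020/datalake/config.json"

    // A query conforming to the supported SPARQL fragment (Section 3)
    val query =
      "SELECT ?offer ?publisher WHERE { ?offer <http://purl.org/dc/elements/1.1/publisher> ?publisher . } LIMIT 10"

    // Hypothetical call returning the result as a DataFrame (see note above)
    val result: DataFrame = spark.sparqlDL(query, mappings, config)
    result.show()
  }
}
```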
104 | ## How to Contribute
105 | We always welcome new contributors to the project! Please see [our contribution guide](http://sansa-stack.net/contributing-to-sansa/) for more details on how to get started contributing to SANSA.
106 |
--------------------------------------------------------------------------------
/sansa-datalake/pom.xml:
--------------------------------------------------------------------------------
1 |
2 |
5 | 4.0.0
6 |
7 | sansa-datalake-parent_2.12
8 |
9 |
10 | net.sansa-stack
11 | sansa-parent_2.12
12 | 0.7.2-SNAPSHOT
13 |
14 |
15 |
16 | pom
17 |
18 | SANSA Stack - DataLake Layer - Parent
19 | A library to query heterogeneous data sources uniformly using SPARQL
20 | https://github.com/SANSA-Stack/SANSA-DataLake
21 | 2015
22 |
23 |
24 | Smart Data Analytics (SDA) research group
25 | http://sda.tech
26 |
27 |
28 |
29 | https://github.com/SANSA-Stack/SANSA-DataLake
30 | scm:git:git://github.com/SANSA-Stack/SANSA-DataLake.git
31 | scm:git:git@github.com:SANSA-Stack/SANSA-DataLake.git
32 | HEAD
33 |
34 |
35 |
36 | https://github.com/SANSA-Stack/SANSA-DataLake/issues
37 | GitHub
38 |
39 |
40 |
41 |
42 | Apache License 2.0
43 | http://www.apache.org/licenses/LICENSE-2.0.html
44 | repo
45 |
46 |
47 |
48 |
49 |
50 | Mohamed Nadjib MAMI
51 | https://github.com/mnmami
52 | SDA
53 | http://sda.tech
54 |
55 | contributor
56 |
57 |
58 |
59 |
60 |
61 |
62 |
63 | maven-project-info-reports-plugin
64 | 2.9
65 |
66 |
67 | net.alchim31.maven
68 | scala-maven-plugin
69 |
70 |
71 |
72 |
73 |
74 |
75 |
76 |
77 | org.apache.spark
78 | spark-hive_${scala.binary.version}
79 | ${spark.version}
80 | test
81 |
82 |
83 |
84 | de.javakaffee
85 | kryo-serializers
86 |
87 |
88 |
89 | io.gatling
90 | jsonpath_${scala.binary.version}
91 | 0.6.10
92 |
93 |
94 |
95 | com.typesafe.play
96 | play_${scala.binary.version}
97 | 2.6.2
98 |
99 |
100 |
101 |
102 | com.datastax.spark
103 | spark-cassandra-connector_${scala.binary.version}
104 | 2.4.2
105 |
106 |
107 |
108 | org.mongodb.spark
109 | mongo-spark-connector_${scala.binary.version}
110 | 2.4.0
111 |
112 |
113 |
114 | com.couchbase.client
115 | spark-connector_${scala.binary.version}
116 | 2.4.0
117 |
118 |
119 |
120 |
125 |
126 |
127 | mysql
128 | mysql-connector-java
129 | 8.0.16
130 |
131 |
132 |
133 |
134 | com.amazonaws
135 | aws-java-sdk-s3
136 | 1.11.791
137 |
138 |
139 |
140 |
141 |
142 |
143 | org.scalatest
144 | scalatest_${scala.binary.version}
145 |
146 |
147 |
148 |
149 |
150 |
151 | org.apache.maven.plugins
152 | maven-compiler-plugin
153 |
154 |
155 |
156 | org.apache.maven.plugins
157 | maven-surefire-plugin
158 |
159 |
160 |
161 | org.apache.maven.plugins
162 | maven-source-plugin
163 |
164 |
165 |
166 | org.apache.maven.plugins
167 | maven-javadoc-plugin
168 |
169 |
170 |
171 | net.alchim31.maven
172 | scala-maven-plugin
173 |
174 |
175 |
176 | org.apache.maven.plugins
177 | maven-site-plugin
178 |
179 |
180 |
181 | com.amashchenko.maven.plugin
182 | gitflow-maven-plugin
183 |
184 |
185 |
186 | org.scalatest
187 | scalatest-maven-plugin
188 |
189 | ${project.build.directory}/surefire-reports
190 | .
191 | SANSA-DataLake-Tests.txt
192 |
193 |
194 |
195 | test
196 |
197 | test
198 |
199 |
200 |
201 |
202 |
203 |
204 |
205 |
206 |
207 |
208 | maven.aksw.internal
209 | AKSW Release Repository
210 | http://maven.aksw.org/archiva/repository/internal
211 |
212 |
213 | maven.aksw.snapshots
214 | AKSW Snapshot Repository
215 | http://maven.aksw.org/archiva/repository/snapshots
216 |
217 |
218 |
219 |
220 |
221 | root-dir
222 |
223 |
224 | ${project.basedir}/../../scalastyle-config.xml
225 |
226 |
227 |
228 | ${project.basedir}/../scalastyle-config.xml
229 |
230 |
231 |
232 | doclint-java8-disable
233 |
234 | [1.8,)
235 |
236 |
237 |
238 |
239 |
240 | org.apache.maven.plugins
241 | maven-javadoc-plugin
242 |
243 |
244 | attach-javadocs
245 |
246 | jar
247 |
248 |
249 | false
250 |
251 |
252 |
253 |
254 | none
255 |
256 |
257 |
258 |
259 |
260 |
261 |
262 | release
263 |
264 |
265 | performRelease
266 | true
267 |
268 |
269 |
270 |
271 |
272 |
273 |
274 | org.apache.maven.plugins
275 | maven-gpg-plugin
276 |
277 |
278 |
279 | org.sonatype.plugins
280 | nexus-staging-maven-plugin
281 |
282 |
283 |
284 |
285 |
286 |
287 |
288 |
289 |
290 | oss-sonatype
291 | oss-sonatype
292 | https://oss.sonatype.org/content/repositories/snapshots/
293 |
294 | true
295 |
296 |
297 |
298 | apache-snapshot
299 | Apache repository (snapshots)
300 | https://repository.apache.org/content/repositories/snapshots/
301 |
302 | true
303 |
304 |
305 |
306 | maven.aksw.internal
307 | AKSW Release Repository
308 | http://maven.aksw.org/archiva/repository/internal
309 |
310 | true
311 |
312 |
313 | false
314 |
315 |
316 |
317 | maven.aksw.snapshots
318 | AKSW Snapshot Repository
319 | http://maven.aksw.org/archiva/repository/snapshots
320 |
321 | false
322 |
323 |
324 | true
325 |
326 |
327 |
328 |
329 |
330 | sansa-datalake-spark
331 |
332 |
333 |
--------------------------------------------------------------------------------
/sansa-datalake/sansa-datalake-spark/.gitignore:
--------------------------------------------------------------------------------
1 | /target/
2 |
--------------------------------------------------------------------------------
/sansa-datalake/sansa-datalake-spark/pom.xml:
--------------------------------------------------------------------------------
1 |
3 | 4.0.0
4 |
5 | sansa-datalake-spark_2.12
6 |
7 |
8 | net.sansa-stack
9 | sansa-datalake-parent_2.12
10 | 0.7.2-SNAPSHOT
11 |
12 |
13 |
14 |
15 | org.apache.spark
16 | spark-core_${scala.binary.version}
17 |
18 |
19 |
20 | org.apache.spark
21 | spark-sql_${scala.binary.version}
22 |
23 |
24 |
25 | org.apache.spark
26 | spark-hive_${scala.binary.version}
27 | compile
28 |
29 |
30 |
31 |
32 | org.scala-lang
33 | scala-library
34 |
35 |
36 |
37 |
38 | org.apache.jena
39 | jena-core
40 |
41 |
42 |
43 | org.apache.jena
44 | jena-arq
45 |
46 |
47 |
48 | io.gatling
49 | jsonpath_${scala.binary.version}
50 |
51 |
52 |
53 | com.typesafe.play
54 | play_${scala.binary.version}
55 |
56 |
57 |
58 |
59 | com.datastax.spark
60 | spark-cassandra-connector_${scala.binary.version}
61 |
62 |
63 |
64 | org.mongodb.spark
65 | mongo-spark-connector_${scala.binary.version}
66 |
67 |
68 |
69 | com.couchbase.client
70 | spark-connector_${scala.binary.version}
71 |
72 |
73 |
74 |
78 |
79 |
80 | mysql
81 | mysql-connector-java
82 |
83 |
84 |
85 | com.amazonaws
86 | aws-java-sdk-s3
87 |
88 |
89 |
90 |
91 | com.typesafe.scala-logging
92 | scala-logging_${scala.binary.version}
93 |
94 |
95 |
96 | ch.qos.logback
97 | logback-classic
98 | 1.2.3
99 | test
100 |
101 |
102 |
103 |
104 |
105 |
106 | org.scalastyle
107 | scalastyle-maven-plugin
108 |
109 |
110 | net.alchim31.maven
111 | scala-maven-plugin
112 |
113 |
114 |
115 |
116 |
--------------------------------------------------------------------------------
/sansa-datalake/sansa-datalake-spark/src/main/java/net/sansa_stack/datalake/spark/NTtoDF.java:
--------------------------------------------------------------------------------
1 | /**
2 | * Created by mmami on 10.10.16.
3 | */
4 | package net.sansa_stack.datalake.spark;
5 |
6 | import org.apache.spark.sql.*;
7 | import org.apache.spark.api.java.JavaPairRDD;
8 | import org.apache.spark.api.java.JavaRDD;
9 | import org.apache.spark.api.java.function.FlatMapFunction;
10 | import org.apache.spark.api.java.function.Function;
11 | import org.apache.spark.api.java.function.PairFunction;
12 | import org.apache.spark.sql.types.DataTypes;
13 | import org.apache.spark.sql.types.StructField;
14 | import org.apache.spark.sql.types.StructType;
15 | import net.sansa_stack.datalake.spark.model.Triple;
16 | import scala.Tuple2;
17 |
18 | import java.sql.Timestamp;
19 | import java.util.*;
20 | import java.util.regex.Matcher;
21 | import java.util.regex.Pattern;
22 | import java.io.Serializable;
23 |
24 | public class NTtoDF implements Serializable {
25 |
26 | private String className;
27 |
28 | public NTtoDF() { }
29 |
30 | public NTtoDF options(Map<String, String> options) {
31 | className = options.get("class");
32 |
33 | return this;
34 | }
35 |
36 | //@SuppressWarnings("unchecked")
37 | public Dataset<Row> read(String input_path, SparkSession spark) {
38 |
39 | try {
40 |
41 |
42 | // 1. Read text file
43 | JavaRDD<String> lines = spark.read().textFile(input_path).toJavaRDD();
44 | //JavaRDD lines = spark.read().textFile(input_path);
45 |
46 | // 2. Map lines to Triple objects
47 | JavaRDD<Triple> triples = lines.map((Function<String, Triple>) line -> {
48 |
49 | //String[] parts = line.split(" ");
50 |
51 | List<String> parts = new ArrayList<>();
52 | Matcher m = Pattern.compile("([^\"]\\S*|\".+?\")\\s*").matcher(line);
53 | while (m.find())
54 | parts.add(m.group(1));
55 |
56 | Triple triple;
57 |
58 | if (parts.get(1).equals(""))
59 | triple = new Triple(replaceInValue(removeTagSymbol(parts.get(0))), null, replaceInValue(removeTagSymbol(parts.get(2))));
60 | else {
61 | String subject = replaceInValue(removeTagSymbol(parts.get(0))); // MEASURE removeTagSymbol() time
62 | String property = replaceInColumn(removeTagSymbol(parts.get(1)));
63 | String object = replaceInValue(removeTagSymbol(parts.get(2)));
64 | String type = replaceInValue(removeTagSymbol(parts.get(3))); // Either there is a type (xslt) or not (.)
65 |
66 | String objectAndType = (parts.size() == 5) ? (object + type) : object;
67 | objectAndType = reverse(objectAndType);
68 |
69 | triple = new Triple(subject, property, objectAndType);
70 | }
71 |
72 | return triple;
73 | });
74 |
75 |
76 | // 3. Map Triple objects to pairs (Triple.subject,[Triple.property, Triple.object])
77 | //@SuppressWarnings({ "rawtypes" })
78 | JavaPairRDD<String, Tuple2<String, String>> subject_property = triples.mapToPair((
79 | PairFunction<Triple, String, Tuple2<String, String>>) trpl ->
80 | new Tuple2(trpl.getSubject(), new Tuple2(trpl.getProperty(), trpl.getObject()))
81 | );
82 |
83 | // 4. Group pairs by subject => s,(p,o)[]
84 | JavaPairRDD<String, Iterable<Tuple2<String, String>>> groupBySubject = subject_property.groupByKey();
85 |
86 | // 5. Map to pairs (Type,(s,(p,o)[]))
87 | //@SuppressWarnings({ "serial" })
88 | JavaPairRDD>>> type_s_po = groupBySubject.mapToPair((
89 | PairFunction>>, String, Tuple2>>>) list -> {
90 |
91 | List<Tuple2<String, String>> p_o = new ArrayList<>();
92 | List<String> types = new ArrayList<>();
93 | String property;
94 | String object;
95 | Tuple2<String, String> tt;
96 | Tuple2<String, String> t2;
97 |
98 | String subject = list._1();
99 | for (Tuple2<String, String> stringStringTuple2 : list._2()) {
100 | tt = stringStringTuple2;
101 | property = tt._1();
102 | object = tt._2();
103 | if (property == null) {
104 | p_o.add(new Tuple2<>("type_" + object, "1"));
105 | types.add(object);
106 | } else {
107 | // Form Tuple2(P,O)
108 | t2 = new Tuple2<>(property, object);
109 | p_o.add(t2);
110 | }
111 | }
112 |
113 | Collections.sort(types); // order types lexicographically then select the last one => similar instances end up in same table
114 |
115 | //String chosen_type = lastType; // The last type is generally the most specific, but this is definitely not a rule.
116 | String chosen_type = types.get(types.size()-1);
117 |
118 | // We might use a hierarchy of classes from the schema if provided in future
119 | p_o.remove(new Tuple2("type_" + chosen_type, "1"));
120 |
121 | Tuple2 s_po = new Tuple2(subject, p_o);
122 | return new Tuple2>>>(chosen_type, s_po);
123 | });
124 |
125 | // 6. Group by type => (type, It(s, It(p, o)))
126 | JavaPairRDD>>>> groupByType = type_s_po.groupByKey();
127 |
128 | // 7. Get all the types
129 | //groupByType: >>>>
130 | // THIS CAN BE SUB-OPTIMAL WITH LARGE DATA.
131 | List<String> keys = groupByType.keys().distinct().collect();
132 |
133 | System.out.println("Types found: " + keys);
134 | // 8. Iterate through all types
135 | //int t = 0;
136 | //for (String key : keys) {
137 | //t++;
138 | //if (t < 20) { // To remove later
139 | //if(key.contains("HistoricTower")){
140 |
141 | // 8.1 Get RDD of the type
142 | //@SuppressWarnings("unused")
143 | JavaRDD>>> rddByKey = getRddByKey(groupByType, className);
144 |
145 | // 8.2 Map the type RDD => Return type columns
146 | //JavaRDD> cols = rddByKey.map(i -> {
147 | JavaRDD cols = rddByKey.flatMap((FlatMapFunction>>, String>) i -> {
148 | LinkedHashMap po = new LinkedHashMap<>(); // a hashamp (that keeps order) to store all type's columns
149 |
150 | // 8.2.1 Iterate through all (p,o) and collect the columns (update incrementally the hashmap)
151 |
152 | for (Tuple2 temp : i._2) {
153 | String property = temp._1();
154 | String object = reverse(temp._2());
155 |
156 | if (object.contains("XMLSchema#double")) {
157 | if (!po.containsKey(property + "--TD") && !po.containsKey(property + "--TAD"))
158 | property = property + "--TD";
159 | else if (!po.containsKey(property + "--TAD")) {
160 | po.remove(property + "--TD");
161 | property = property + "--TAD";
162 | }
163 |
164 | } else if (object.contains("XMLSchema#int")) {
165 | property = property + "--TI";
166 |
167 | if (po.containsKey(property))
168 | property = property.replace("--TI", "--TAI");
169 |
170 | } else if (object.contains("XMLSchema#boolean")) {
171 | property = property + "--TB";
172 | } else if (object.contains("XMLSchema#dateTime")) {
173 | property = property + "--TTS";
174 | }
175 |
176 | if (po.containsKey(property) && !po.containsKey(property + "**")) {
177 | po.remove(property);
178 | property = property + "**";
179 | //System.out.println("Property: " + property);
180 | } else if (po.containsKey(property + "**")) {
181 | property = property + "**";
182 | }
183 |
184 | po.put(property, ""); // CAUTION: overwriting previous columns
185 | }
186 |
187 | // 8.2.2 At last, add the id column
188 | po.put("id", "");
189 |
190 | return po.keySet().iterator();
191 | //return (Iterator) po.keySet();
192 | });
193 |
194 | // 8.- Vars
195 | LinkedHashMap type_columns = new LinkedHashMap(); // a hashamp (that keeps order) to store all type's columns
196 | String col;
197 |
198 | // 8.3 Read columns and construct a hashmap
199 | final List readColumns = cols.distinct().collect();
200 |
201 | for (String j : readColumns) type_columns.put(j,""); // Overwrite original columns (collect() may return columns in different order than collected firstly)
202 |
203 | // 8.4 Generate the Parquet table schema from the collected columns
204 | List table_columns = new ArrayList<>();
205 | HashMap toSaveToDB = new HashMap<>();
206 |
207 |
208 | for (String s : readColumns) {
209 | if(s.contains("--TD")) {
210 | if(!readColumns.contains(s.split("--")[0] + "--TAD")) {
211 | col = s.split("--")[0];
212 | table_columns.add(DataTypes.createStructField(col, DataTypes.DoubleType, true));
213 | //toSaveToDB.put(col, "double");
214 | }
215 | } else if(s.contains("--TI")) {
216 | col = s.split("--")[0];
217 | table_columns.add(DataTypes.createStructField(col, DataTypes.IntegerType, true));
218 | //toSaveToDB.put(col, "int");
219 | } else if(s.contains("--TB")) {
220 | col = s.split("--")[0];
221 | table_columns.add(DataTypes.createStructField(col, DataTypes.BooleanType, true));
222 | //toSaveToDB.put(col, "boolean");
223 | } else if(s.contains("--TTS")) {
224 | col = s.split("--")[0];
225 | table_columns.add(DataTypes.createStructField(col, DataTypes.TimestampType, true));
226 | //toSaveToDB.put(col, "timeDate");
227 | } else if(s.contains("--TAD")) {
228 | col = s.split("--")[0];
229 | table_columns.add(DataTypes.createStructField(col, DataTypes.createArrayType(DataTypes.DoubleType, true), true));
230 | //toSaveToDB.put(col, "arrayDouble");
231 | } else if(s.contains("--TAI")) {
232 | col = s.split("--")[0];
233 | table_columns.add(DataTypes.createStructField(col, DataTypes.createArrayType(DataTypes.IntegerType, true), true));
234 | //toSaveToDB.put(col, "arrayInt");
235 | } else if(s.contains("**")) {
236 | col = s.replace("**", "");
237 | table_columns.add(DataTypes.createStructField(col, DataTypes.createArrayType(DataTypes.StringType, true), true));
238 | } else {
239 | table_columns.add(DataTypes.createStructField(s, DataTypes.StringType, true));
240 | //toSaveToDB.put(s, "string");
241 | }
242 | }
243 |
244 | // 8.5 Save columns to database
245 | //saveToMongoDB(replaceInType(key), toSaveToDB, dsName, dsIRI);
246 |
247 | StructType schema = DataTypes.createStructType(table_columns);
248 |
249 | // 8.6. Map RDD of (subject, Iter(property, object)) to an RDD of Row
250 | JavaRDD returnValues = rddByKey.map((Function>>, Row>) i -> {
251 |
252 | Row values_list;
253 | LinkedHashMap po = new LinkedHashMap<>();
254 |
255 | // 8.6.1 Initialize the hashmap values with null (they're previously initialized with a String "", so if a certain value is an int => a cast error)
256 | for (String j : readColumns) { // TO INHENCE
257 | if(j.contains("--TI"))
258 | po.put(j.replace("--TI", ""),null);
259 | else if(j.contains("--TD") && !readColumns.contains(j + "--TAD"))
260 | po.put(j.replace("--TD", ""),null);
261 | else if(j.contains("--TB"))
262 | po.put(j.replace("--TB", ""),null);
263 | else if(j.contains("--TTS"))
264 | po.put(j.replace("--TTS", ""),null);
265 | else if(j.contains("--TAI"))
266 | po.put(j.replace("--TAI", ""),null);
267 | else if(j.contains("--TAD"))
268 | po.put(j.replace("--TAD", ""),null);
269 | else if(j.contains("**"))
270 | po.put(j.replace("**", ""),null);
271 | else
272 | po.put(j,null);
273 | }
274 |
275 | // 8.6.2 Iterate through all the (property, object) pairs to save data in the collected columns
276 | String subject = i._1;
277 |
278 | for(Tuple2 temp : i._2) {
279 | String property = temp._1();
280 | String object = reverse(temp._2());
281 | Object newobject = null;
282 |
283 | if (readColumns.contains(property + "--TD") && !readColumns.contains(property + "--TAD")) {
284 | newobject = Double.parseDouble(object.replace("^^www.w3.org/2001/XMLSchema#double", "").replace("\"", ""));
285 | po.put(property, newobject);
286 | } else if (readColumns.contains(property + "--TI")) {
287 | newobject = Integer.parseInt(object.replace("^^www.w3.org/2001/XMLSchema#integer", "").replace("^^www.w3.org/2001/XMLSchema#int", "").replace("\"", ""));
288 | po.put(property, newobject);
289 | } else if (readColumns.contains(property + "--TB")) {
290 | newobject = Boolean.parseBoolean(object.replace("^^www.w3.org/2001/XMLSchema#boolean", "").replace("\"", ""));
291 | po.put(property, newobject);
292 | } else if (readColumns.contains(property + "--TTS")) {
293 | //SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
294 | newobject = Timestamp.valueOf(object.replace("^^www.w3.org/2001/XMLSchema#dateTime", "").replace("\"", "").replace("T", " "));
295 | po.put(property, newobject);
296 | } else if (readColumns.contains(property + "--TAD")) {
297 | ArrayList arr;
298 | newobject = Double.parseDouble(object.replace("^^www.w3.org/2001/XMLSchema#double", "").replace("\"", ""));
299 | if (po.get(property) != null) {
300 | //System.out.println("TYPE (" + po.get(property) + "): ");
301 | arr = (ArrayList) po.get(property);
302 | arr.add((Double) newobject);
303 | } else {
304 | //System.out.println("TYPE (" + po.get(property) + ")");
305 | arr = new ArrayList<>();
306 | arr.add((Double) newobject);
307 | }
308 | po.put(property, arr);
309 | } else if (readColumns.contains(property + "--TAI")) {
310 | ArrayList arr = new ArrayList<>();
311 | if (po.containsKey(property)) {
312 | arr = (ArrayList) po.get(property);
313 | arr.add((Integer) newobject);
314 | } else {
315 | arr.add((Integer) newobject);
316 | }
317 | po.put(property, arr);
318 | } else if (readColumns.contains(property + "**")) {
319 | //ArrayList arr = new ArrayList();
320 | ArrayList temparr; // In new Parquet, ArrayString type saves only String[]s not ArrayLists, so needs to change back and forth from String to ArrayList String
321 | String[] arr;
322 | newobject = object.replace("**", "").replace("\"", "");
323 | if (po.get(property) != null) {
324 | //System.out.println("TYPE (" + po.get(property) + "): ");
325 |
326 | arr = (String[]) po.get(property);
327 | temparr = new ArrayList<>(Arrays.asList(arr));
328 | //arr = (ArrayList) po.get(property);
329 | // create arraylist
330 | temparr.add((String) newobject);
331 |
332 | arr = temparr.toArray(new String[0]);
333 | } else {
334 | arr = new String[]{(String) newobject};
335 | //arr = new ArrayList();
336 | //arr.add((String) newobject);
337 | }
338 | //String[] ary = new String[arr.size()];
339 | //ary = arr.toArray(ary);
340 | po.put(property, arr);
341 | } else
342 | po.put(property, object);
343 | }
344 |
345 | // 8.6.3 Add the subject finally as the ID to the hashmap
346 | po.put("id", subject);
347 |
348 | //System.out.println("Values to be inserted under this schema: " + po.keySet());
349 |
350 | // 8.6.4 Create the row from the hashmap values
351 | List