├── .gitattributes
├── .gitignore
├── LICENSE
├── README.md
├── azure-pipelines.yml
├── before-build.sh
├── databricks-spline
│   └── pom.xml
├── notebooks
│   ├── TPC-H.scala
│   └── sample-spark-job.scala
├── pom.xml
├── provision-databricks.sh
└── provision-webapp.sh
/.gitattributes:
--------------------------------------------------------------------------------
1 | # Auto detect text files and perform LF normalization
2 | * text=auto
3 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | # Compiled class file
2 | *.class
3 |
4 | # Log file
5 | *.log
6 |
7 | # BlueJ files
8 | *.ctxt
9 |
10 | # Mobile Tools for Java (J2ME)
11 | .mtj.tmp/
12 |
13 | # Package Files #
14 | *.jar
15 | *.war
16 | *.nar
17 | *.ear
18 | *.zip
19 | *.tar.gz
20 | *.rar
21 |
22 | # virtual machine crash logs, see http://www.java.com/en/download/help/error_hotspot.xml
23 | hs_err_pid*
24 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 |
3 | Copyright (c) 2019 algattik
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # databricks-lineage-tutorial
2 |
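3 | Sample Azure DevOps pipeline that builds the [Spline](https://github.com/AbsaOSS/spline) lineage tracking library and web UI, provisions an Azure Cosmos DB database, an Azure Databricks cluster and an Azure App Service web app, and runs the sample notebooks in `notebooks/` with Spline lineage capture enabled.
4 | 
5 | See `azure-pipelines.yml` for the build and deployment steps, `before-build.sh` and the `provision-*.sh` scripts for the Azure resource provisioning details.
6 | 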
--------------------------------------------------------------------------------
/azure-pipelines.yml:
--------------------------------------------------------------------------------
1 | trigger:
2 | - master
3 | 
4 | variables:
5 |   RESOURCE_GROUP: lineagetutorial
6 |   RESOURCE_NAME_PREFIX: lineagetutorial
7 |   SPARK_VERSION: 2.4
8 |   GIT_BRANCH: release/0.3
9 |   DATABRICKS_HOST: https://northeurope.azuredatabricks.net/
10 |   DATABRICKS_TOKEN: dapi00000000000000000000000000000000
11 | 
12 | pool:
13 |   vmImage: 'Ubuntu-16.04'
14 | 
15 | steps:
16 | 
17 | - task: AzureCLI@1
18 |   displayName: Create Azure resources
19 |   inputs:
20 |     azureSubscription: ARMConnection
21 |     scriptPath: before-build.sh
22 | 
23 | - task: Maven@3
24 |   displayName: Build Spline Library and Web UI
25 |   inputs:
26 |     mavenOptions: '-Xmx3072m'
27 |     javaHomeOption: 'JDKVersion'
28 |     jdkVersionOption: '1.8'
29 |     jdkArchitectureOption: 'x64'
30 |     publishJUnitResults: false
31 |     goals: package --batch-mode --activate-profiles spark-2.4 --projects databricks-spline,spline/web --also-make --define skipTests
32 | 
33 | - bash: $(System.DefaultWorkingDirectory)/provision-databricks.sh
34 |   displayName: Provision Databricks
35 | 
36 | # - task: Maven@3
37 | #   displayName: Run Spline tests
38 | #   inputs:
39 | #     mavenPomFile: 'spline/pom.xml'
40 | #     mavenOptions: '-Xmx3072m'
41 | #     javaHomeOption: 'JDKVersion'
42 | #     jdkVersionOption: '1.8'
43 | #     jdkArchitectureOption: 'x64'
44 | #     goals: test --batch-mode --activate-profiles spark-2.4 --projects web --also-make --define test.spline.mongodb.url="$(COSMOSDB_CONN_STRING)"
45 | 
46 | - task: AzureCLI@1
47 |   displayName: Provision Webapp
48 |   inputs:
49 |     azureSubscription: ARMConnection
50 |     scriptPath: provision-webapp.sh
51 | 
52 | - task: AzureRmWebAppDeployment@3
53 |   displayName: 'Deploy Spline UI WAR to Azure App Service'
54 |   inputs:
55 |     azureSubscription: ARMConnection
56 |     WebAppName: $(WEBAPP_NAME)
57 |     Package: '$(System.DefaultWorkingDirectory)/ROOT.war'
58 | 
59 | - bash: for i in {1..40}; do curl --silent --show-error --fail $WEBAPP_URL && break || sleep 10; done
60 |   displayName: Prewarm Webapp
61 | 
62 | - bash: echo "$WEBAPP_URL"
63 |   displayName: Display Webapp URL
64 | 
--------------------------------------------------------------------------------
/before-build.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | # Strict mode, fail on any error
4 | set -euo pipefail
5 |
6 | # Clone the repository to be built
7 | git clone --single-branch --branch $GIT_BRANCH https://github.com/AbsaOSS/spline.git
8 |
9 | # The name of the Cosmos DB instance to be deployed. Generate a unique name.
10 | COSMOSDB_INSTANCE="$RESOURCE_NAME_PREFIX$BUILD_BUILDID"
11 |
12 | # Create a Cosmos DB database. This command has no effect if the database already exists.
13 | az cosmosdb create -g $RESOURCE_GROUP -n $COSMOSDB_INSTANCE --kind MongoDB --capabilities EnableAggregationPipeline -o table
14 |
15 | # Get the connection string (in mongodb:// format) to the Cosmos DB account.
16 | # The connection string contains the account key.
17 | # Example connection string:
18 | # mongodb://mycosmosdb:kmRux...XBQ==@mycosmosdb.documents.azure.com:10255/?ssl=true&replicaSet=globaldb
19 | cosmosdb_conn_string=$(az cosmosdb list-connection-strings -g $RESOURCE_GROUP -n $COSMOSDB_INSTANCE --query connectionStrings[0].connectionString -o tsv)
20 |
21 | # Add the database name ("spline") within the connection string (before the '?' delimiter).
22 | COSMOSDB_CONN_STRING=${cosmosdb_conn_string/\?/spline?}
23 |
24 | # Set job variables from script
25 | echo "##vso[task.setvariable variable=COSMOSDB_CONN_STRING]$COSMOSDB_CONN_STRING"
26 |
--------------------------------------------------------------------------------
/databricks-spline/pom.xml:
--------------------------------------------------------------------------------
1 | <?xml version="1.0" encoding="UTF-8"?>
2 | <project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
3 |   <modelVersion>4.0.0</modelVersion>
4 |   <artifactId>databricks-spline</artifactId>
5 |   <packaging>jar</packaging>
6 |   <parent>
7 |     <groupId>com.cloudarchitected.spline</groupId>
8 |     <artifactId>databricks-spline-parent</artifactId>
9 |     <version>0.3.7-SNAPSHOT</version>
10 |   </parent>
11 |   <properties>
12 |     <spline.version>${project.version}</spline.version>
13 |     <spark.version>2.4</spark.version>
14 |   </properties>
15 |   <dependencies>
16 |     <dependency>
17 |       <groupId>za.co.absa.spline</groupId>
18 |       <artifactId>spline-core</artifactId>
19 |       <version>${spline.version}</version>
20 |     </dependency>
21 |     <dependency>
22 |       <groupId>za.co.absa.spline</groupId>
23 |       <artifactId>spline-core-spark-adapter-${spark.version}</artifactId>
24 |       <version>${spline.version}</version>
25 |       <exclusions>
26 |         <exclusion>
27 |           <groupId>org.apache.spark</groupId>
28 |           <artifactId>*</artifactId>
29 |         </exclusion>
30 |       </exclusions>
31 |     </dependency>
32 |     <dependency>
33 |       <groupId>za.co.absa.spline</groupId>
34 |       <artifactId>spline-persistence-mongo</artifactId>
35 |       <version>${spline.version}</version>
36 |     </dependency>
37 |   </dependencies>
38 |   <build>
39 |     <plugins>
40 |       <plugin>
41 |         <groupId>org.apache.maven.plugins</groupId>
42 |         <artifactId>maven-shade-plugin</artifactId>
43 |         <version>3.2.1</version>
44 |         <executions>
45 |           <execution>
46 |             <phase>package</phase>
47 |             <goals>
48 |               <goal>shade</goal>
49 |             </goals>
50 |           </execution>
51 |         </executions>
52 |       </plugin>
53 |     </plugins>
54 |   </build>
55 | </project>
--------------------------------------------------------------------------------
/notebooks/TPC-H.scala:
--------------------------------------------------------------------------------
1 | // Databricks notebook source
2 | System.setProperty("spline.mode", "REQUIRED")
3 | System.setProperty("spline.persistence.factory", "za.co.absa.spline.persistence.mongo.MongoPersistenceFactory")
4 | System.setProperty("spline.mongodb.url", dbutils.secrets.get("spline", "spline.mongodb.url"))
5 | import za.co.absa.spline.core.SparkLineageInitializer._
6 | spark.enableLineageTracking()
7 |
8 | // COMMAND ----------
9 |
10 | val dbname = "lineage_tutorial_" + java.util.UUID.randomUUID.toString.replaceAll("-","")
11 | spark.sql(s"CREATE DATABASE $dbname")
12 | spark.sql(s"USE $dbname")
13 |
14 | // COMMAND ----------
15 |
16 | // MAGIC %sql
17 | // MAGIC CREATE TEMPORARY TABLE raw_nation (n_nationkey int, n_name string, n_regionkey int, n_comment string)
18 | // MAGIC USING com.databricks.spark.csv
19 | // MAGIC OPTIONS (path "/databricks-datasets/tpch/data-001/nation/nation.tbl", header "false", delimiter "|")
20 |
21 | // COMMAND ----------
22 |
23 | // MAGIC %sql
24 | // MAGIC CREATE TEMPORARY TABLE raw_region (r_regionkey int, r_name string, r_comment string)
25 | // MAGIC USING com.databricks.spark.csv
26 | // MAGIC OPTIONS (path "/databricks-datasets/tpch/data-001/region/region.tbl", header "false", delimiter "|")
27 |
28 | // COMMAND ----------
29 |
30 | // MAGIC %sql
31 | // MAGIC CREATE TEMPORARY TABLE raw_orders (o_orderkey int, o_custkey int, o_orderstatus string, o_totalprice double, o_orderdate string, o_orderpriority string, o_clerk string, o_shippriority int, o_comment string)
32 | // MAGIC USING com.databricks.spark.csv
33 | // MAGIC OPTIONS (path "/databricks-datasets/tpch/data-001/orders/orders.tbl", header "false", delimiter "|")
34 |
35 | // COMMAND ----------
36 |
37 | // MAGIC %sql
38 | // MAGIC CREATE TEMPORARY TABLE raw_lineitem (l_orderkey int, l_partkey int, l_suppkey int, l_linenumber int, l_quantity double, l_extendedprice double, l_discount double, l_tax double, l_returnflag string, l_linestatus string, l_shipdate string, l_commitdate string, l_receiptdate string, l_shipinstruct string, l_shipmode string, l_comment string)
39 | // MAGIC USING com.databricks.spark.csv
40 | // MAGIC OPTIONS (path "/databricks-datasets/tpch/data-001/lineitem/lineitem.tbl", header "false", delimiter "|")
41 |
42 | // COMMAND ----------
43 |
44 | // MAGIC %sql
45 | // MAGIC CREATE TEMPORARY TABLE raw_customer (c_custkey int, c_name string, c_address string, c_nationkey int, c_phone string, c_acctbal double, c_mktsegment string , c_comment string)
46 | // MAGIC USING com.databricks.spark.csv
47 | // MAGIC OPTIONS (path "/databricks-datasets/tpch/data-001/customer/customer.tbl", header "false", delimiter "|")
48 |
49 | // COMMAND ----------
50 |
51 | // MAGIC %sql
52 | // MAGIC CREATE TEMPORARY TABLE raw_part (p_partkey int, p_name string, p_mfgr string, p_brand string, p_type string, p_size int, p_container string, p_retailprice double, p_comment string)
53 | // MAGIC USING com.databricks.spark.csv
54 | // MAGIC OPTIONS (path "/databricks-datasets/tpch/data-001/part/part.tbl", header "false", delimiter "|")
55 |
56 | // COMMAND ----------
57 |
58 | // MAGIC %sql
59 | // MAGIC CREATE TEMPORARY TABLE raw_supplier (s_suppkey int, s_name string, s_address string, s_nationkey int, s_phone string, s_acctbal double, s_comment string)
60 | // MAGIC USING com.databricks.spark.csv
61 | // MAGIC OPTIONS (path "/databricks-datasets/tpch/data-001/supplier/supplier.tbl", header "false", delimiter "|")
62 |
63 | // COMMAND ----------
64 |
65 | // MAGIC %sql
66 | // MAGIC CREATE TEMPORARY TABLE raw_partsupp (ps_partkey int, ps_suppkey int, ps_availqty int, ps_supplycost decimal, ps_comment string)
67 | // MAGIC USING com.databricks.spark.csv
68 | // MAGIC OPTIONS (path "/databricks-datasets/tpch/data-001/partsupp/partsupp.tbl", header "false", delimiter "|")
69 |
70 | // COMMAND ----------
71 |
72 | // MAGIC %sql
73 | // MAGIC CREATE TABLE nation AS SELECT * FROM raw_nation
74 |
75 | // COMMAND ----------
76 |
77 | // MAGIC %sql
78 | // MAGIC CREATE TABLE region AS SELECT * FROM raw_region
79 |
80 | // COMMAND ----------
81 |
82 | // MAGIC %sql
83 | // MAGIC CREATE TABLE orders AS SELECT * FROM raw_orders
84 |
85 | // COMMAND ----------
86 |
87 | // MAGIC %sql
88 | // MAGIC CREATE TABLE lineitem AS SELECT * FROM raw_lineitem
89 |
90 | // COMMAND ----------
91 |
92 | // MAGIC %sql
93 | // MAGIC CREATE TABLE customer AS SELECT * FROM raw_customer
94 |
95 | // COMMAND ----------
96 |
97 | // MAGIC %sql
98 | // MAGIC CREATE TABLE part AS SELECT * FROM raw_part
99 |
100 | // COMMAND ----------
101 |
102 | // MAGIC %sql
103 | // MAGIC CREATE TABLE supplier AS SELECT * FROM raw_supplier
104 |
105 | // COMMAND ----------
106 |
107 | // MAGIC %sql
108 | // MAGIC CREATE TABLE partsupp AS SELECT * FROM raw_partsupp
109 |
110 | // COMMAND ----------
111 |
112 | spark.sql("""
113 | --
114 | -- TPC-H/TPC-R Pricing Summary Report Query (Q1)
115 | -- Functional Query Definition
116 | -- Approved February 1998
117 |
118 |
119 | select
120 | l_returnflag,
121 | l_linestatus,
122 | sum(l_quantity) as sum_qty,
123 | sum(l_extendedprice) as sum_base_price,
124 | sum(l_extendedprice * (1 - l_discount)) as sum_disc_price,
125 | sum(l_extendedprice * (1 - l_discount) * (1 + l_tax)) as sum_charge,
126 | avg(l_quantity) as avg_qty,
127 | avg(l_extendedprice) as avg_price,
128 | avg(l_discount) as avg_disc,
129 | count(*) as count_order
130 | from
131 | lineitem
132 | where
133 | l_shipdate <= date '1998-12-01' - interval '90' day
134 | group by
135 | l_returnflag,
136 | l_linestatus
137 | order by
138 | l_returnflag,
139 | l_linestatus
140 |
141 |
142 | """).write.mode("overwrite").parquet("/tmp/tpch/TPCDH1.parquet")
143 |
144 | // COMMAND ----------
145 |
146 | spark.sql("""
147 | --
148 | -- TPC-H/TPC-R Minimum Cost Supplier Query (Q2)
149 | -- Functional Query Definition
150 | -- Approved February 1998
151 |
152 |
153 | select
154 | s_acctbal,
155 | s_name,
156 | n_name,
157 | p_partkey,
158 | p_mfgr,
159 | s_address,
160 | s_phone,
161 | s_comment
162 | from
163 | part,
164 | supplier,
165 | partsupp,
166 | nation,
167 | region
168 | where
169 | p_partkey = ps_partkey
170 | and s_suppkey = ps_suppkey
171 | and p_size = 15
172 | and p_type like '%BRASS'
173 | and s_nationkey = n_nationkey
174 | and n_regionkey = r_regionkey
175 | and r_name = 'EUROPE'
176 | and ps_supplycost = (
177 | select
178 | min(ps_supplycost)
179 | from
180 | partsupp,
181 | supplier,
182 | nation,
183 | region
184 | where
185 | p_partkey = ps_partkey
186 | and s_suppkey = ps_suppkey
187 | and s_nationkey = n_nationkey
188 | and n_regionkey = r_regionkey
189 | and r_name = 'EUROPE'
190 | )
191 | order by
192 | s_acctbal desc,
193 | n_name,
194 | s_name,
195 | p_partkey
196 | LIMIT 100
197 |
198 | """).write.mode("overwrite").parquet("/tmp/tpch/TPCDH2.parquet")
199 |
200 | // COMMAND ----------
201 |
202 | spark.sql("""
203 | --
204 | -- TPC-H/TPC-R Shipping Priority Query (Q3)
205 | -- Functional Query Definition
206 | -- Approved February 1998
207 |
208 |
209 | select
210 | l_orderkey,
211 | sum(l_extendedprice * (1 - l_discount)) as revenue,
212 | o_orderdate,
213 | o_shippriority
214 | from
215 | customer,
216 | orders,
217 | lineitem
218 | where
219 | c_mktsegment = 'BUILDING'
220 | and c_custkey = o_custkey
221 | and l_orderkey = o_orderkey
222 | and o_orderdate < date '1995-03-15'
223 | and l_shipdate > date '1995-03-15'
224 | group by
225 | l_orderkey,
226 | o_orderdate,
227 | o_shippriority
228 | order by
229 | revenue desc,
230 | o_orderdate
231 | LIMIT 10
232 |
233 | """).write.mode("overwrite").parquet("/tmp/tpch/TPCDH3.parquet")
234 |
235 | // COMMAND ----------
236 |
237 | spark.sql("""
238 | --
239 | -- TPC-H/TPC-R Order Priority Checking Query (Q4)
240 | -- Functional Query Definition
241 | -- Approved February 1998
242 |
243 |
244 | select
245 | o_orderpriority,
246 | count(*) as order_count
247 | from
248 | orders
249 | where
250 | o_orderdate >= date '1993-07-01'
251 | and o_orderdate < date '1993-07-01' + interval '3' month
252 | and exists (
253 | select
254 | *
255 | from
256 | lineitem
257 | where
258 | l_orderkey = o_orderkey
259 | and l_commitdate < l_receiptdate
260 | )
261 | group by
262 | o_orderpriority
263 | order by
264 | o_orderpriority
265 |
266 |
267 | """).write.mode("overwrite").parquet("/tmp/tpch/TPCDH4.parquet")
268 |
269 | // COMMAND ----------
270 |
271 | spark.sql("""
272 | --
273 | -- TPC-H/TPC-R Local Supplier Volume Query (Q5)
274 | -- Functional Query Definition
275 | -- Approved February 1998
276 |
277 |
278 | select
279 | n_name,
280 | sum(l_extendedprice * (1 - l_discount)) as revenue
281 | from
282 | customer,
283 | orders,
284 | lineitem,
285 | supplier,
286 | nation,
287 | region
288 | where
289 | c_custkey = o_custkey
290 | and l_orderkey = o_orderkey
291 | and l_suppkey = s_suppkey
292 | and c_nationkey = s_nationkey
293 | and s_nationkey = n_nationkey
294 | and n_regionkey = r_regionkey
295 | and r_name = 'ASIA'
296 | and o_orderdate >= date '1994-01-01'
297 | and o_orderdate < date '1994-01-01' + interval '1' year
298 | group by
299 | n_name
300 | order by
301 | revenue desc
302 |
303 |
304 | """).write.mode("overwrite").parquet("/tmp/tpch/TPCDH5.parquet")
305 |
306 | // COMMAND ----------
307 |
308 | spark.sql("""
309 | --
310 | -- TPC-H/TPC-R Forecasting Revenue Change Query (Q6)
311 | -- Functional Query Definition
312 | -- Approved February 1998
313 |
314 |
315 | select
316 | sum(l_extendedprice * l_discount) as revenue
317 | from
318 | lineitem
319 | where
320 | l_shipdate >= date '1994-01-01'
321 | and l_shipdate < date '1994-01-01' + interval '1' year
322 | and l_discount between .06 - 0.01 and .06 + 0.01
323 | and l_quantity < 24
324 |
325 |
326 | """).write.mode("overwrite").parquet("/tmp/tpch/TPCDH6.parquet")
327 |
328 | // COMMAND ----------
329 |
330 | spark.sql("""
331 | --
332 | -- TPC-H/TPC-R Volume Shipping Query (Q7)
333 | -- Functional Query Definition
334 | -- Approved February 1998
335 |
336 |
337 | select
338 | supp_nation,
339 | cust_nation,
340 | l_year,
341 | sum(volume) as revenue
342 | from
343 | (
344 | select
345 | n1.n_name as supp_nation,
346 | n2.n_name as cust_nation,
347 | extract(year from l_shipdate) as l_year,
348 | l_extendedprice * (1 - l_discount) as volume
349 | from
350 | supplier,
351 | lineitem,
352 | orders,
353 | customer,
354 | nation n1,
355 | nation n2
356 | where
357 | s_suppkey = l_suppkey
358 | and o_orderkey = l_orderkey
359 | and c_custkey = o_custkey
360 | and s_nationkey = n1.n_nationkey
361 | and c_nationkey = n2.n_nationkey
362 | and (
363 | (n1.n_name = 'FRANCE' and n2.n_name = 'GERMANY')
364 | or (n1.n_name = 'GERMANY' and n2.n_name = 'FRANCE')
365 | )
366 | and l_shipdate between date '1995-01-01' and date '1996-12-31'
367 | ) as shipping
368 | group by
369 | supp_nation,
370 | cust_nation,
371 | l_year
372 | order by
373 | supp_nation,
374 | cust_nation,
375 | l_year
376 |
377 |
378 | """).write.mode("overwrite").parquet("/tmp/tpch/TPCDH7.parquet")
379 |
380 | // COMMAND ----------
381 |
382 | spark.sql("""
383 | --
384 | -- TPC-H/TPC-R National Market Share Query (Q8)
385 | -- Functional Query Definition
386 | -- Approved February 1998
387 |
388 |
389 | select
390 | o_year,
391 | sum(case
392 | when nation = 'BRAZIL' then volume
393 | else 0
394 | end) / sum(volume) as mkt_share
395 | from
396 | (
397 | select
398 | extract(year from o_orderdate) as o_year,
399 | l_extendedprice * (1 - l_discount) as volume,
400 | n2.n_name as nation
401 | from
402 | part,
403 | supplier,
404 | lineitem,
405 | orders,
406 | customer,
407 | nation n1,
408 | nation n2,
409 | region
410 | where
411 | p_partkey = l_partkey
412 | and s_suppkey = l_suppkey
413 | and l_orderkey = o_orderkey
414 | and o_custkey = c_custkey
415 | and c_nationkey = n1.n_nationkey
416 | and n1.n_regionkey = r_regionkey
417 | and r_name = 'AMERICA'
418 | and s_nationkey = n2.n_nationkey
419 | and o_orderdate between date '1995-01-01' and date '1996-12-31'
420 | and p_type = 'ECONOMY ANODIZED STEEL'
421 | ) as all_nations
422 | group by
423 | o_year
424 | order by
425 | o_year
426 |
427 |
428 | """).write.mode("overwrite").parquet("/tmp/tpch/TPCDH8.parquet")
429 |
430 | // COMMAND ----------
431 |
432 | spark.sql("""
433 | --
434 | -- TPC-H/TPC-R Product Type Profit Measure Query (Q9)
435 | -- Functional Query Definition
436 | -- Approved February 1998
437 |
438 |
439 | select
440 | nation,
441 | o_year,
442 | sum(amount) as sum_profit
443 | from
444 | (
445 | select
446 | n_name as nation,
447 | extract(year from o_orderdate) as o_year,
448 | l_extendedprice * (1 - l_discount) - ps_supplycost * l_quantity as amount
449 | from
450 | part,
451 | supplier,
452 | lineitem,
453 | partsupp,
454 | orders,
455 | nation
456 | where
457 | s_suppkey = l_suppkey
458 | and ps_suppkey = l_suppkey
459 | and ps_partkey = l_partkey
460 | and p_partkey = l_partkey
461 | and o_orderkey = l_orderkey
462 | and s_nationkey = n_nationkey
463 | and p_name like '%green%'
464 | ) as profit
465 | group by
466 | nation,
467 | o_year
468 | order by
469 | nation,
470 | o_year desc
471 |
472 |
473 | """).write.mode("overwrite").parquet("/tmp/tpch/TPCDH9.parquet")
474 |
475 | // COMMAND ----------
476 |
477 | spark.sql("""
478 | --
479 | -- TPC-H/TPC-R Returned Item Reporting Query (Q10)
480 | -- Functional Query Definition
481 | -- Approved February 1998
482 |
483 |
484 | select
485 | c_custkey,
486 | c_name,
487 | sum(l_extendedprice * (1 - l_discount)) as revenue,
488 | c_acctbal,
489 | n_name,
490 | c_address,
491 | c_phone,
492 | c_comment
493 | from
494 | customer,
495 | orders,
496 | lineitem,
497 | nation
498 | where
499 | c_custkey = o_custkey
500 | and l_orderkey = o_orderkey
501 | and o_orderdate >= date '1993-10-01'
502 | and o_orderdate < date '1993-10-01' + interval '3' month
503 | and l_returnflag = 'R'
504 | and c_nationkey = n_nationkey
505 | group by
506 | c_custkey,
507 | c_name,
508 | c_acctbal,
509 | c_phone,
510 | n_name,
511 | c_address,
512 | c_comment
513 | order by
514 | revenue desc
515 | LIMIT 20
516 |
517 | """).write.mode("overwrite").parquet("/tmp/tpch/TPCDH10.parquet")
518 |
519 | // COMMAND ----------
520 |
521 | spark.sql("""
522 | --
523 | -- TPC-H/TPC-R Important Stock Identification Query (Q11)
524 | -- Functional Query Definition
525 | -- Approved February 1998
526 |
527 |
528 | select
529 | ps_partkey,
530 | sum(ps_supplycost * ps_availqty) as value
531 | from
532 | partsupp,
533 | supplier,
534 | nation
535 | where
536 | ps_suppkey = s_suppkey
537 | and s_nationkey = n_nationkey
538 | and n_name = 'GERMANY'
539 | group by
540 | ps_partkey having
541 | sum(ps_supplycost * ps_availqty) > (
542 | select
543 | sum(ps_supplycost * ps_availqty) * 0.0001
544 | from
545 | partsupp,
546 | supplier,
547 | nation
548 | where
549 | ps_suppkey = s_suppkey
550 | and s_nationkey = n_nationkey
551 | and n_name = 'GERMANY'
552 | )
553 | order by
554 | value desc
555 |
556 |
557 | """).write.mode("overwrite").parquet("/tmp/tpch/TPCDH11.parquet")
558 |
559 | // COMMAND ----------
560 |
561 | spark.sql("""
562 | --
563 | -- TPC-H/TPC-R Shipping Modes and Order Priority Query (Q12)
564 | -- Functional Query Definition
565 | -- Approved February 1998
566 |
567 |
568 | select
569 | l_shipmode,
570 | sum(case
571 | when o_orderpriority = '1-URGENT'
572 | or o_orderpriority = '2-HIGH'
573 | then 1
574 | else 0
575 | end) as high_line_count,
576 | sum(case
577 | when o_orderpriority <> '1-URGENT'
578 | and o_orderpriority <> '2-HIGH'
579 | then 1
580 | else 0
581 | end) as low_line_count
582 | from
583 | orders,
584 | lineitem
585 | where
586 | o_orderkey = l_orderkey
587 | and l_shipmode in ('MAIL', 'SHIP')
588 | and l_commitdate < l_receiptdate
589 | and l_shipdate < l_commitdate
590 | and l_receiptdate >= date '1994-01-01'
591 | and l_receiptdate < date '1994-01-01' + interval '1' year
592 | group by
593 | l_shipmode
594 | order by
595 | l_shipmode
596 |
597 |
598 | """).write.mode("overwrite").parquet("/tmp/tpch/TPCDH12.parquet")
599 |
600 | // COMMAND ----------
601 |
602 | spark.sql("""
603 | --
604 | -- TPC-H/TPC-R Customer Distribution Query (Q13)
605 | -- Functional Query Definition
606 | -- Approved February 1998
607 |
608 |
609 | select
610 | c_count,
611 | count(*) as custdist
612 | from
613 | (
614 | select
615 | c_custkey,
616 | count(o_orderkey)
617 | from
618 | customer left outer join orders on
619 | c_custkey = o_custkey
620 | and o_comment not like '%special%requests%'
621 | group by
622 | c_custkey
623 | ) as c_orders (c_custkey, c_count)
624 | group by
625 | c_count
626 | order by
627 | custdist desc,
628 | c_count desc
629 |
630 |
631 | """).write.mode("overwrite").parquet("/tmp/tpch/TPCDH13.parquet")
632 |
633 | // COMMAND ----------
634 |
635 | spark.sql("""
636 | --
637 | -- TPC-H/TPC-R Promotion Effect Query (Q14)
638 | -- Functional Query Definition
639 | -- Approved February 1998
640 |
641 |
642 | select
643 | 100.00 * sum(case
644 | when p_type like 'PROMO%'
645 | then l_extendedprice * (1 - l_discount)
646 | else 0
647 | end) / sum(l_extendedprice * (1 - l_discount)) as promo_revenue
648 | from
649 | lineitem,
650 | part
651 | where
652 | l_partkey = p_partkey
653 | and l_shipdate >= date '1995-09-01'
654 | and l_shipdate < date '1995-09-01' + interval '1' month
655 |
656 |
657 | """).write.mode("overwrite").parquet("/tmp/tpch/TPCDH14.parquet")
658 |
659 | // COMMAND ----------
660 |
661 | spark.sql("""
662 | --
663 | -- TPC-H/TPC-R Parts/Supplier Relationship Query (Q16)
664 | -- Functional Query Definition
665 | -- Approved February 1998
666 |
667 |
668 | select
669 | p_brand,
670 | p_type,
671 | p_size,
672 | count(distinct ps_suppkey) as supplier_cnt
673 | from
674 | partsupp,
675 | part
676 | where
677 | p_partkey = ps_partkey
678 | and p_brand <> 'Brand#45'
679 | and p_type not like 'MEDIUM POLISHED%'
680 | and p_size in (49, 14, 23, 45, 19, 3, 36, 9)
681 | and ps_suppkey not in (
682 | select
683 | s_suppkey
684 | from
685 | supplier
686 | where
687 | s_comment like '%Customer%Complaints%'
688 | )
689 | group by
690 | p_brand,
691 | p_type,
692 | p_size
693 | order by
694 | supplier_cnt desc,
695 | p_brand,
696 | p_type,
697 | p_size
698 |
699 |
700 | """).write.mode("overwrite").parquet("/tmp/tpch/TPCDH16.parquet")
701 |
702 | // COMMAND ----------
703 |
704 | spark.sql("""
705 | --
706 | -- TPC-H/TPC-R Small-Quantity-Order Revenue Query (Q17)
707 | -- Functional Query Definition
708 | -- Approved February 1998
709 |
710 |
711 | select
712 | sum(l_extendedprice) / 7.0 as avg_yearly
713 | from
714 | lineitem,
715 | part
716 | where
717 | p_partkey = l_partkey
718 | and p_brand = 'Brand#23'
719 | and p_container = 'MED BOX'
720 | and l_quantity < (
721 | select
722 | 0.2 * avg(l_quantity)
723 | from
724 | lineitem
725 | where
726 | l_partkey = p_partkey
727 | )
728 |
729 |
730 | """).write.mode("overwrite").parquet("/tmp/tpch/TPCDH17.parquet")
731 |
732 | // COMMAND ----------
733 |
734 | spark.sql("""
735 | --
736 | -- TPC-H/TPC-R Large Volume Customer Query (Q18)
737 | -- Function Query Definition
738 | -- Approved February 1998
739 |
740 |
741 | select
742 | c_name,
743 | c_custkey,
744 | o_orderkey,
745 | o_orderdate,
746 | o_totalprice,
747 | sum(l_quantity) AS sum_l_quantity
748 | from
749 | customer,
750 | orders,
751 | lineitem
752 | where
753 | o_orderkey in (
754 | select
755 | l_orderkey
756 | from
757 | lineitem
758 | group by
759 | l_orderkey having
760 | sum(l_quantity) > 300
761 | )
762 | and c_custkey = o_custkey
763 | and o_orderkey = l_orderkey
764 | group by
765 | c_name,
766 | c_custkey,
767 | o_orderkey,
768 | o_orderdate,
769 | o_totalprice
770 | order by
771 | o_totalprice desc,
772 | o_orderdate
773 | LIMIT 100
774 |
775 | """).write.mode("overwrite").parquet("/tmp/tpch/TPCDH18.parquet")
776 |
777 | // COMMAND ----------
778 |
779 | spark.sql("""
780 | --
781 | -- TPC-H/TPC-R Discounted Revenue Query (Q19)
782 | -- Functional Query Definition
783 | -- Approved February 1998
784 |
785 |
786 | select
787 | sum(l_extendedprice* (1 - l_discount)) as revenue
788 | from
789 | lineitem,
790 | part
791 | where
792 | (
793 | p_partkey = l_partkey
794 | and p_brand = 'Brand#12'
795 | and p_container in ('SM CASE', 'SM BOX', 'SM PACK', 'SM PKG')
796 | and l_quantity >= 1 and l_quantity <= 1 + 10
797 | and p_size between 1 and 5
798 | and l_shipmode in ('AIR', 'AIR REG')
799 | and l_shipinstruct = 'DELIVER IN PERSON'
800 | )
801 | or
802 | (
803 | p_partkey = l_partkey
804 | and p_brand = 'Brand#23'
805 | and p_container in ('MED BAG', 'MED BOX', 'MED PKG', 'MED PACK')
806 | and l_quantity >= 10 and l_quantity <= 10 + 10
807 | and p_size between 1 and 10
808 | and l_shipmode in ('AIR', 'AIR REG')
809 | and l_shipinstruct = 'DELIVER IN PERSON'
810 | )
811 | or
812 | (
813 | p_partkey = l_partkey
814 | and p_brand = 'Brand#34'
815 | and p_container in ('LG CASE', 'LG BOX', 'LG PACK', 'LG PKG')
816 | and l_quantity >= 20 and l_quantity <= 20 + 10
817 | and p_size between 1 and 15
818 | and l_shipmode in ('AIR', 'AIR REG')
819 | and l_shipinstruct = 'DELIVER IN PERSON'
820 | )
821 |
822 |
823 | """).write.mode("overwrite").parquet("/tmp/tpch/TPCDH19.parquet")
824 |
825 | // COMMAND ----------
826 |
827 | spark.sql("""
828 | --
829 | -- TPC-H/TPC-R Potential Part Promotion Query (Q20)
830 | -- Function Query Definition
831 | -- Approved February 1998
832 |
833 |
834 | select
835 | s_name,
836 | s_address
837 | from
838 | supplier,
839 | nation
840 | where
841 | s_suppkey in (
842 | select
843 | ps_suppkey
844 | from
845 | partsupp
846 | where
847 | ps_partkey in (
848 | select
849 | p_partkey
850 | from
851 | part
852 | where
853 | p_name like 'forest%'
854 | )
855 | and ps_availqty > (
856 | select
857 | 0.5 * sum(l_quantity)
858 | from
859 | lineitem
860 | where
861 | l_partkey = ps_partkey
862 | and l_suppkey = ps_suppkey
863 | and l_shipdate >= date '1994-01-01'
864 | and l_shipdate < date '1994-01-01' + interval '1' year
865 | )
866 | )
867 | and s_nationkey = n_nationkey
868 | and n_name = 'CANADA'
869 | order by
870 | s_name
871 |
872 |
873 | """).write.mode("overwrite").parquet("/tmp/tpch/TPCDH20.parquet")
874 |
875 | // COMMAND ----------
876 |
877 | spark.sql("""
878 | --
879 | -- TPC-H/TPC-R Suppliers Who Kept Orders Waiting Query (Q21)
880 | -- Functional Query Definition
881 | -- Approved February 1998
882 |
883 |
884 | select
885 | s_name,
886 | count(*) as numwait
887 | from
888 | supplier,
889 | lineitem l1,
890 | orders,
891 | nation
892 | where
893 | s_suppkey = l1.l_suppkey
894 | and o_orderkey = l1.l_orderkey
895 | and o_orderstatus = 'F'
896 | and l1.l_receiptdate > l1.l_commitdate
897 | and exists (
898 | select
899 | *
900 | from
901 | lineitem l2
902 | where
903 | l2.l_orderkey = l1.l_orderkey
904 | and l2.l_suppkey <> l1.l_suppkey
905 | )
906 | and not exists (
907 | select
908 | *
909 | from
910 | lineitem l3
911 | where
912 | l3.l_orderkey = l1.l_orderkey
913 | and l3.l_suppkey <> l1.l_suppkey
914 | and l3.l_receiptdate > l3.l_commitdate
915 | )
916 | and s_nationkey = n_nationkey
917 | and n_name = 'SAUDI ARABIA'
918 | group by
919 | s_name
920 | order by
921 | numwait desc,
922 | s_name
923 | LIMIT 100
924 |
925 | """).write.mode("overwrite").parquet("/tmp/tpch/TPCDH21.parquet")
926 |
927 | // COMMAND ----------
928 |
929 | spark.sql("""
930 | --
931 | -- TPC-H/TPC-R Global Sales Opportunity Query (Q22)
932 | -- Functional Query Definition
933 | -- Approved February 1998
934 |
935 |
936 | select
937 | cntrycode,
938 | count(*) as numcust,
939 | sum(c_acctbal) as totacctbal
940 | from
941 | (
942 | select
943 | substring(c_phone, 0, 2) as cntrycode,
944 | c_acctbal
945 | from
946 | customer
947 | where
948 | substring(c_phone, 0, 2) in
949 | ('13', '31', '23', '29', '30', '18', '17')
950 | and c_acctbal > (
951 | select
952 | avg(c_acctbal)
953 | from
954 | customer
955 | where
956 | c_acctbal > 0.00
957 | and substring(c_phone, 0, 2) in
958 | ('13', '31', '23', '29', '30', '18', '17')
959 | )
960 | and not exists (
961 | select
962 | *
963 | from
964 | orders
965 | where
966 | o_custkey = c_custkey
967 | )
968 | ) as custsale
969 | group by
970 | cntrycode
971 | order by
972 | cntrycode
973 |
974 |
975 | """).write.mode("overwrite").parquet("/tmp/tpch/TPCDH22.parquet")
976 |
--------------------------------------------------------------------------------
/notebooks/sample-spark-job.scala:
--------------------------------------------------------------------------------
1 | // Databricks notebook source
2 | System.setProperty("spline.mode", "REQUIRED") // fail the job if lineage tracking cannot be initialized
3 | System.setProperty("spline.persistence.factory", "za.co.absa.spline.persistence.mongo.MongoPersistenceFactory") // persist lineage to MongoDB (here, Cosmos DB via its MongoDB API)
4 | System.setProperty("spline.mongodb.url", dbutils.secrets.get("spline", "spline.mongodb.url")) // connection string stored in the "spline" Databricks secret scope
5 | import za.co.absa.spline.core.SparkLineageInitializer._
6 | spark.enableLineageTracking()
7 |
8 | // COMMAND ----------
9 |
10 | // MAGIC %python
11 | // MAGIC rawData = spark.read.option("inferSchema", "true").json("/databricks-datasets/structured-streaming/events/")
12 | // MAGIC rawData.createOrReplaceTempView("rawData")
13 | // MAGIC sql("select r1.action, count(*) as actionCount from rawData as r1 join rawData as r2 on r1.action = r2.action group by r1.action").write.mode('overwrite').csv("/tmp/pyaggaction.csv")
14 |
15 | // COMMAND ----------
16 |
17 |
18 |
--------------------------------------------------------------------------------
/pom.xml:
--------------------------------------------------------------------------------
1 | <?xml version="1.0" encoding="UTF-8"?>
2 | <project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
3 |   <modelVersion>4.0.0</modelVersion>
4 |   <groupId>com.cloudarchitected.spline</groupId>
5 |   <artifactId>databricks-spline-parent</artifactId>
6 |   <packaging>pom</packaging>
7 |   <parent>
8 |     <groupId>za.co.absa.spline</groupId>
9 |     <artifactId>spline</artifactId>
10 |     <version>0.3.7-SNAPSHOT</version>
11 |     <relativePath>spline</relativePath>
12 |   </parent>
13 |   <modules>
14 |     <module>spline</module>
15 |     <module>databricks-spline</module>
16 |   </modules>
17 | </project>
18 | 
--------------------------------------------------------------------------------
/provision-databricks.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | # Strict mode, fail on any error
4 | set -euo pipefail
5 |
6 | echo "Installing Databricks CLI"
7 | sudo apt-get install -y python3-setuptools
8 | pip3 install wheel
9 | pip3 install databricks-cli
10 | sudo ln -s /home/vsts/.local/bin/* /usr/local/bin/
11 |
12 | # Databricks cluster to be created
13 | cluster_name="$RESOURCE_NAME_PREFIX$BUILD_BUILDID"
14 |
15 | echo "Creating Databricks cluster"
16 | cluster=$(databricks clusters create --json "$(cat << JSON
17 | {
18 | "cluster_name": "$cluster_name",
19 | "spark_version": "5.3.x-scala2.11",
20 | "node_type_id": "Standard_DS3_v2",
21 | "autoscale": {
22 | "min_workers": 1,
23 | "max_workers": 3
24 | },
25 | "autotermination_minutes": 120
26 | }
27 | JSON
28 | )"
29 | )
30 |
31 | cluster_id=$(echo $cluster | jq -r .cluster_id)
32 | sleep 10 #avoid race conditions
33 |
34 | echo "Installing Spline libraries"
35 | databricks_spline=$(ls databricks-spline/target/databricks-spline-*.jar | head -1)
36 | echo "Installing library $databricks_spline"
37 | databricks_spline_base=$(basename "$databricks_spline")
38 | databricks_spline_dbfs="dbfs:/lib/spline/$databricks_spline_base"
39 | databricks fs cp "$databricks_spline" "$databricks_spline_dbfs" --overwrite
40 | databricks libraries install --cluster-id $cluster_id --jar "$databricks_spline_dbfs"
41 |
42 | echo "Provisioning Spline connection string as Databricks secret"
43 | if ! databricks secrets list-scopes --output JSON | jq -e '.scopes[] | select (.name == "spline")'; then
44 | databricks secrets create-scope --scope spline --initial-manage-principal "users"
45 | fi
46 | databricks secrets put --scope spline --key spline.mongodb.url --string-value "$COSMOSDB_CONN_STRING"
47 |
48 |
49 | # Copy and run sample notebooks
50 |
51 | echo "Copying sample notebooks"
52 | databricks workspace import_dir notebooks /Shared/lineage-tutorial --overwrite
53 |
54 | for notebook in notebooks/*.scala; do
55 |
56 | notebook_name=$(basename $notebook .scala)
57 | notebook_path="/Shared/lineage-tutorial/$notebook_name"
58 | echo "Running notebook $notebook_path"
59 | run=$(databricks runs submit --json "$(cat << JSON
60 | {
61 | "name": "SampleRun",
62 | "existing_cluster_id": "$cluster_id",
63 | "timeout_seconds": 1200,
64 | "notebook_task": {
65 | "notebook_path": "$notebook_path"
66 | }
67 | }
68 | JSON
69 | )")
70 |
71 | # Echo job web page URL to task output to facilitate debugging
72 | run_id=$(echo $run | jq .run_id)
73 | databricks runs get --run-id "$run_id" | jq -r .run_page_url
74 |
75 |
76 | done
77 |
78 |
--------------------------------------------------------------------------------
/provision-webapp.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | # Strict mode, fail on any error
4 | set -euo pipefail
5 |
6 | mv spline/web/target/spline-ui*.war ROOT.war
7 |
8 | WEBAPP_NAME="$RESOURCE_NAME_PREFIX$BUILD_BUILDID"
9 |
10 | az appservice plan create -g $RESOURCE_GROUP -n $WEBAPP_NAME -o table
11 |
12 | az webapp create -g $RESOURCE_GROUP -n $WEBAPP_NAME --plan $WEBAPP_NAME -o table
13 |
14 | az webapp config set -g $RESOURCE_GROUP -n $WEBAPP_NAME --java-container TOMCAT --java-container-version 7.0.62 --java-version 1.8 -o table
15 |
16 | az webapp config appsettings set -g $RESOURCE_GROUP -n $WEBAPP_NAME --settings "spline.mongodb.url=$COSMOSDB_CONN_STRING" -o table
17 |
18 | az webapp config set -g $RESOURCE_GROUP -n $WEBAPP_NAME --always-on true -o table
19 |
20 | WEBAPP_URL="https://$(az webapp show -g $RESOURCE_GROUP -n $WEBAPP_NAME | jq -r .defaultHostName)"
21 |
22 | # Set job variables from script
23 | echo "##vso[task.setvariable variable=WEBAPP_NAME]$WEBAPP_NAME"
24 | echo "##vso[task.setvariable variable=WEBAPP_URL]$WEBAPP_URL"
25 |
--------------------------------------------------------------------------------