├── .gitattributes ├── .gitignore ├── LICENSE ├── README.md ├── azure-pipelines.yml ├── before-build.sh ├── databricks-spline │   └── pom.xml ├── notebooks │   ├── TPC-H.scala │   └── sample-spark-job.scala ├── pom.xml ├── provision-databricks.sh └── provision-webapp.sh /.gitattributes: -------------------------------------------------------------------------------- 1 | # Auto detect text files and perform LF normalization 2 | * text=auto 3 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Compiled class file 2 | *.class 3 | 4 | # Log file 5 | *.log 6 | 7 | # BlueJ files 8 | *.ctxt 9 | 10 | # Mobile Tools for Java (J2ME) 11 | .mtj.tmp/ 12 | 13 | # Package Files # 14 | *.jar 15 | *.war 16 | *.nar 17 | *.ear 18 | *.zip 19 | *.tar.gz 20 | *.rar 21 | 22 | # virtual machine crash logs, see http://www.java.com/en/download/help/error_hotspot.xml 23 | hs_err_pid* 24 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2019 algattik 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE.
-------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # databricks-lineage-tutorial 2 | 3 | Sample Azure DevOps pipeline that provisions Azure Databricks and Azure Cosmos DB, runs the sample notebooks with the Spline lineage tracking library, and deploys the Spline web UI to Azure App Service. 4 | -------------------------------------------------------------------------------- /azure-pipelines.yml: -------------------------------------------------------------------------------- 1 | trigger: 2 | - master 3 | 4 | variables: 5 | RESOURCE_GROUP: lineagetutorial 6 | RESOURCE_NAME_PREFIX: lineagetutorial 7 | SPARK_VERSION: 2.4 8 | GIT_BRANCH: release/0.3 9 | DATABRICKS_HOST: https://northeurope.azuredatabricks.net/ 10 | DATABRICKS_TOKEN: dapi00000000000000000000000000000000 11 | 12 | pool: 13 | vmImage: 'Ubuntu-16.04' 14 | 15 | steps: 16 | 17 | - task: AzureCLI@1 18 | displayName: Create Azure resources 19 | inputs: 20 | azureSubscription: ARMConnection 21 | scriptPath: before-build.sh 22 | 23 | - task: Maven@3 24 | displayName: Build Spline Library and Web UI 25 | inputs: 26 | mavenOptions: '-Xmx3072m' 27 | javaHomeOption: 'JDKVersion' 28 | jdkVersionOption: '1.8' 29 | jdkArchitectureOption: 'x64' 30 | publishJUnitResults: false 31 | goals: package --batch-mode --activate-profiles spark-2.4 --projects databricks-spline,spline/web --also-make --define skipTests 32 | 33 | - bash: $(System.DefaultWorkingDirectory)/provision-databricks.sh 34 | displayName: Provision Databricks 35 | 36 | # task: Maven@3 37 | # displayName: Run Spline tests 38 | # inputs: 39 | # mavenPomFile: 'spline/pom.xml' 40 | # mavenOptions: '-Xmx3072m' 41 | # javaHomeOption: 'JDKVersion' 42 | # jdkVersionOption: '1.8' 43 | # jdkArchitectureOption: 'x64' 44 | # goals: test --batch-mode --activate-profiles spark-2.4 --projects web --also-make --define test.spline.mongodb.url="$(COSMOSDB_CONN_STRING)" 45 | 46 | - task: AzureCLI@1 47 | displayName: Provision Webapp 48 | inputs: 49 | azureSubscription: ARMConnection 50 | scriptPath: provision-webapp.sh 51 | 52 | - task: AzureRmWebAppDeployment@3 53 | displayName: 'Deploy Spline UI WAR to Azure App Service' 54 | inputs: 55 | azureSubscription: ARMConnection 56 | WebAppName: $(WEBAPP_NAME) 57 | Package: '$(System.DefaultWorkingDirectory)/ROOT.war' 58 | 59 | - bash: for i in {1..40}; do curl --silent --show-error --fail $WEBAPP_URL && break || sleep 10; done 60 | displayName: Prewarm Webapp 61 | 62 | - bash: echo "$WEBAPP_URL" 63 | displayName: Display Webapp URL 64 | -------------------------------------------------------------------------------- /before-build.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Strict mode, fail on any error 4 | set -euo pipefail 5 | 6 | # Clone the repository to be built 7 | git clone --single-branch --branch $GIT_BRANCH https://github.com/AbsaOSS/spline.git 8 | 9 | # The name of the Cosmos DB instance to be deployed. Generate a unique name. 10 | COSMOSDB_INSTANCE="$RESOURCE_NAME_PREFIX$BUILD_BUILDID" 11 | 12 | # Create a Cosmos DB account with the MongoDB API. This command has no effect if the account already exists. 13 | az cosmosdb create -g $RESOURCE_GROUP -n $COSMOSDB_INSTANCE --kind MongoDB --capabilities EnableAggregationPipeline -o table 14 | 15 | # Get the connection string (in mongodb:// format) to the Cosmos DB account. 16 | # The connection string contains the account key.
17 | # Example connection string: 18 | # mongodb://mycosmosdb:kmRux...XBQ==@mycosmosdb.documents.azure.com:10255/?ssl=true&replicaSet=globaldb 19 | cosmosdb_conn_string=$(az cosmosdb list-connection-strings -g $RESOURCE_GROUP -n $COSMOSDB_INSTANCE --query connectionStrings[0].connectionString -o tsv) 20 | 21 | # Add the database name within the connection string (before the '?' delimiter). 22 | COSMOSDB_CONN_STRING=${cosmosdb_conn_string/\?/spline?} 23 | 24 | # Set job variables from script 25 | echo "##vso[task.setvariable variable=COSMOSDB_CONN_STRING]$COSMOSDB_CONN_STRING" 26 | -------------------------------------------------------------------------------- /databricks-spline/pom.xml: -------------------------------------------------------------------------------- 1 | <?xml version="1.0" encoding="UTF-8"?> 2 | <project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd"> 3 | <modelVersion>4.0.0</modelVersion> 4 | <artifactId>databricks-spline</artifactId> 5 | <packaging>jar</packaging> 6 | <parent> 7 | <groupId>com.cloudarchitected.spline</groupId> 8 | <artifactId>databricks-spline-parent</artifactId> 9 | <version>0.3.7-SNAPSHOT</version> 10 | </parent> 11 | <properties> 12 | <spline.version>${project.version}</spline.version> 13 | <spark.version>2.4</spark.version> 14 | </properties> 15 | <dependencies> 16 | <dependency> 17 | <groupId>za.co.absa.spline</groupId> 18 | <artifactId>spline-core</artifactId> 19 | <version>${spline.version}</version> 20 | </dependency> 21 | <dependency> 22 | <groupId>za.co.absa.spline</groupId> 23 | <artifactId>spline-core-spark-adapter-${spark.version}</artifactId> 24 | <version>${spline.version}</version> 25 | <exclusions> 26 | <exclusion> 27 | <groupId>org.apache.spark</groupId> 28 | <artifactId>*</artifactId> 29 | </exclusion> 30 | </exclusions> 31 | </dependency> 32 | <dependency> 33 | <groupId>za.co.absa.spline</groupId> 34 | <artifactId>spline-persistence-mongo</artifactId> 35 | <version>${spline.version}</version> 36 | </dependency> 37 | </dependencies> 38 | <build> 39 | <plugins> 40 | <plugin> 41 | <groupId>org.apache.maven.plugins</groupId> 42 | <artifactId>maven-shade-plugin</artifactId> 43 | <version>3.2.1</version> 44 | <executions> 45 | <execution> 46 | <phase>package</phase> 47 | <goals> 48 | <goal>shade</goal> 49 | </goals> 50 | </execution> 51 | </executions> 52 | </plugin> 53 | </plugins> 54 | </build> 55 | </project> -------------------------------------------------------------------------------- /notebooks/TPC-H.scala: -------------------------------------------------------------------------------- 1 | // Databricks notebook source 2 | System.setProperty("spline.mode", "REQUIRED") 3 | System.setProperty("spline.persistence.factory", "za.co.absa.spline.persistence.mongo.MongoPersistenceFactory") 4 | System.setProperty("spline.mongodb.url", dbutils.secrets.get("spline", "spline.mongodb.url")) 5 | import za.co.absa.spline.core.SparkLineageInitializer._ 6 | spark.enableLineageTracking() 7 | 8 | // COMMAND ---------- 9 | 10 | val dbname = "lineage_tutorial_" + java.util.UUID.randomUUID.toString.replaceAll("-","") 11 | spark.sql(s"CREATE DATABASE $dbname") 12 | spark.sql(s"USE $dbname") 13 | 14 | // COMMAND ---------- 15 | 16 | // MAGIC %sql 17 | // MAGIC CREATE TEMPORARY TABLE raw_nation (n_nationkey int, n_name string, n_regionkey int, n_comment string) 18 | // MAGIC USING com.databricks.spark.csv 19 | // MAGIC OPTIONS (path "/databricks-datasets/tpch/data-001/nation/nation.tbl", header "false", delimiter "|") 20 | 21 | // COMMAND ---------- 22 | 23 | // MAGIC %sql 24 | // MAGIC CREATE TEMPORARY TABLE raw_region (r_regionkey int, r_name string, r_comment string) 25 | // MAGIC USING com.databricks.spark.csv 26 | // MAGIC OPTIONS (path "/databricks-datasets/tpch/data-001/region/region.tbl", header "false", delimiter "|") 27 | 28 | // COMMAND ---------- 29 | 30 | // MAGIC %sql 31 | // MAGIC CREATE TEMPORARY TABLE raw_orders (o_orderkey int, o_custkey int, o_orderstatus string, o_totalprice double, o_orderdate string, o_orderpriority string, o_clerk string, o_shippriority int, o_comment string) 32 | // MAGIC USING com.databricks.spark.csv 33 | // MAGIC OPTIONS (path "/databricks-datasets/tpch/data-001/orders/orders.tbl", header "false", delimiter "|") 34 | 35 | // COMMAND ---------- 36 | 37 | // MAGIC %sql 38 | // MAGIC CREATE TEMPORARY TABLE raw_lineitem (l_orderkey int, l_partkey int, l_suppkey int, l_linenumber int, l_quantity double, l_extendedprice double, l_discount double, l_tax double,
l_returnflag string, l_linestatus string, l_shipdate string, l_commitdate string, l_receiptdate string, l_shipinstruct string, l_shipmode string, l_comment string) 39 | // MAGIC USING com.databricks.spark.csv 40 | // MAGIC OPTIONS (path "/databricks-datasets/tpch/data-001/lineitem/lineitem.tbl", header "false", delimiter "|") 41 | 42 | // COMMAND ---------- 43 | 44 | // MAGIC %sql 45 | // MAGIC CREATE TEMPORARY TABLE raw_customer (c_custkey int, c_name string, c_address string, c_nationkey int, c_phone string, c_acctbal double, c_mktsegment string , c_comment string) 46 | // MAGIC USING com.databricks.spark.csv 47 | // MAGIC OPTIONS (path "/databricks-datasets/tpch/data-001/customer/customer.tbl", header "false", delimiter "|") 48 | 49 | // COMMAND ---------- 50 | 51 | // MAGIC %sql 52 | // MAGIC CREATE TEMPORARY TABLE raw_part (p_partkey int, p_name string, p_mfgr string, p_brand string, p_type string, p_size int, p_container string, p_retailprice double, p_comment string) 53 | // MAGIC USING com.databricks.spark.csv 54 | // MAGIC OPTIONS (path "/databricks-datasets/tpch/data-001/part/part.tbl", header "false", delimiter "|") 55 | 56 | // COMMAND ---------- 57 | 58 | // MAGIC %sql 59 | // MAGIC CREATE TEMPORARY TABLE raw_supplier (s_suppkey int, s_name string, s_address string, s_nationkey int, s_phone string, s_acctbal double, s_comment string) 60 | // MAGIC USING com.databricks.spark.csv 61 | // MAGIC OPTIONS (path "/databricks-datasets/tpch/data-001/supplier/supplier.tbl", header "false", delimiter "|") 62 | 63 | // COMMAND ---------- 64 | 65 | // MAGIC %sql 66 | // MAGIC CREATE TEMPORARY TABLE raw_partsupp (ps_partkey int, ps_suppkey int, ps_availqty int, ps_supplycost decimal, ps_comment string) 67 | // MAGIC USING com.databricks.spark.csv 68 | // MAGIC OPTIONS (path "/databricks-datasets/tpch/data-001/partsupp/partsupp.tbl", header "false", delimiter "|") 69 | 70 | // COMMAND ---------- 71 | 72 | // MAGIC %sql 73 | // MAGIC CREATE TABLE nation AS SELECT * FROM raw_nation 74 | 75 | // COMMAND ---------- 76 | 77 | // MAGIC %sql 78 | // MAGIC CREATE TABLE region AS SELECT * FROM raw_region 79 | 80 | // COMMAND ---------- 81 | 82 | // MAGIC %sql 83 | // MAGIC CREATE TABLE orders AS SELECT * FROM raw_orders 84 | 85 | // COMMAND ---------- 86 | 87 | // MAGIC %sql 88 | // MAGIC CREATE TABLE lineitem AS SELECT * FROM raw_lineitem 89 | 90 | // COMMAND ---------- 91 | 92 | // MAGIC %sql 93 | // MAGIC CREATE TABLE customer AS SELECT * FROM raw_customer 94 | 95 | // COMMAND ---------- 96 | 97 | // MAGIC %sql 98 | // MAGIC CREATE TABLE part AS SELECT * FROM raw_part 99 | 100 | // COMMAND ---------- 101 | 102 | // MAGIC %sql 103 | // MAGIC CREATE TABLE supplier AS SELECT * FROM raw_supplier 104 | 105 | // COMMAND ---------- 106 | 107 | // MAGIC %sql 108 | // MAGIC CREATE TABLE partsupp AS SELECT * FROM raw_partsupp 109 | 110 | // COMMAND ---------- 111 | 112 | spark.sql(""" 113 | -- 114 | -- TPC-H/TPC-R Pricing Summary Report Query (Q1) 115 | -- Functional Query Definition 116 | -- Approved February 1998 117 | 118 | 119 | select 120 | l_returnflag, 121 | l_linestatus, 122 | sum(l_quantity) as sum_qty, 123 | sum(l_extendedprice) as sum_base_price, 124 | sum(l_extendedprice * (1 - l_discount)) as sum_disc_price, 125 | sum(l_extendedprice * (1 - l_discount) * (1 + l_tax)) as sum_charge, 126 | avg(l_quantity) as avg_qty, 127 | avg(l_extendedprice) as avg_price, 128 | avg(l_discount) as avg_disc, 129 | count(*) as count_order 130 | from 131 | lineitem 132 | where 133 | l_shipdate <= date '1998-12-01' 
- interval '90' day 134 | group by 135 | l_returnflag, 136 | l_linestatus 137 | order by 138 | l_returnflag, 139 | l_linestatus 140 | 141 | 142 | """).write.mode("overwrite").parquet("/tmp/tpch/TPCDH1.parquet") 143 | 144 | // COMMAND ---------- 145 | 146 | spark.sql(""" 147 | -- 148 | -- TPC-H/TPC-R Minimum Cost Supplier Query (Q2) 149 | -- Functional Query Definition 150 | -- Approved February 1998 151 | 152 | 153 | select 154 | s_acctbal, 155 | s_name, 156 | n_name, 157 | p_partkey, 158 | p_mfgr, 159 | s_address, 160 | s_phone, 161 | s_comment 162 | from 163 | part, 164 | supplier, 165 | partsupp, 166 | nation, 167 | region 168 | where 169 | p_partkey = ps_partkey 170 | and s_suppkey = ps_suppkey 171 | and p_size = 15 172 | and p_type like '%BRASS' 173 | and s_nationkey = n_nationkey 174 | and n_regionkey = r_regionkey 175 | and r_name = 'EUROPE' 176 | and ps_supplycost = ( 177 | select 178 | min(ps_supplycost) 179 | from 180 | partsupp, 181 | supplier, 182 | nation, 183 | region 184 | where 185 | p_partkey = ps_partkey 186 | and s_suppkey = ps_suppkey 187 | and s_nationkey = n_nationkey 188 | and n_regionkey = r_regionkey 189 | and r_name = 'EUROPE' 190 | ) 191 | order by 192 | s_acctbal desc, 193 | n_name, 194 | s_name, 195 | p_partkey 196 | LIMIT 100 197 | 198 | """).write.mode("overwrite").parquet("/tmp/tpch/TPCDH2.parquet") 199 | 200 | // COMMAND ---------- 201 | 202 | spark.sql(""" 203 | -- 204 | -- TPC-H/TPC-R Shipping Priority Query (Q3) 205 | -- Functional Query Definition 206 | -- Approved February 1998 207 | 208 | 209 | select 210 | l_orderkey, 211 | sum(l_extendedprice * (1 - l_discount)) as revenue, 212 | o_orderdate, 213 | o_shippriority 214 | from 215 | customer, 216 | orders, 217 | lineitem 218 | where 219 | c_mktsegment = 'BUILDING' 220 | and c_custkey = o_custkey 221 | and l_orderkey = o_orderkey 222 | and o_orderdate < date '1995-03-15' 223 | and l_shipdate > date '1995-03-15' 224 | group by 225 | l_orderkey, 226 | o_orderdate, 227 | o_shippriority 228 | order by 229 | revenue desc, 230 | o_orderdate 231 | LIMIT 10 232 | 233 | """).write.mode("overwrite").parquet("/tmp/tpch/TPCDH3.parquet") 234 | 235 | // COMMAND ---------- 236 | 237 | spark.sql(""" 238 | -- 239 | -- TPC-H/TPC-R Order Priority Checking Query (Q4) 240 | -- Functional Query Definition 241 | -- Approved February 1998 242 | 243 | 244 | select 245 | o_orderpriority, 246 | count(*) as order_count 247 | from 248 | orders 249 | where 250 | o_orderdate >= date '1993-07-01' 251 | and o_orderdate < date '1993-07-01' + interval '3' month 252 | and exists ( 253 | select 254 | * 255 | from 256 | lineitem 257 | where 258 | l_orderkey = o_orderkey 259 | and l_commitdate < l_receiptdate 260 | ) 261 | group by 262 | o_orderpriority 263 | order by 264 | o_orderpriority 265 | 266 | 267 | """).write.mode("overwrite").parquet("/tmp/tpch/TPCDH4.parquet") 268 | 269 | // COMMAND ---------- 270 | 271 | spark.sql(""" 272 | -- 273 | -- TPC-H/TPC-R Local Supplier Volume Query (Q5) 274 | -- Functional Query Definition 275 | -- Approved February 1998 276 | 277 | 278 | select 279 | n_name, 280 | sum(l_extendedprice * (1 - l_discount)) as revenue 281 | from 282 | customer, 283 | orders, 284 | lineitem, 285 | supplier, 286 | nation, 287 | region 288 | where 289 | c_custkey = o_custkey 290 | and l_orderkey = o_orderkey 291 | and l_suppkey = s_suppkey 292 | and c_nationkey = s_nationkey 293 | and s_nationkey = n_nationkey 294 | and n_regionkey = r_regionkey 295 | and r_name = 'ASIA' 296 | and o_orderdate >= date '1994-01-01' 297 | and 
o_orderdate < date '1994-01-01' + interval '1' year 298 | group by 299 | n_name 300 | order by 301 | revenue desc 302 | 303 | 304 | """).write.mode("overwrite").parquet("/tmp/tpch/TPCDH5.parquet") 305 | 306 | // COMMAND ---------- 307 | 308 | spark.sql(""" 309 | -- 310 | -- TPC-H/TPC-R Forecasting Revenue Change Query (Q6) 311 | -- Functional Query Definition 312 | -- Approved February 1998 313 | 314 | 315 | select 316 | sum(l_extendedprice * l_discount) as revenue 317 | from 318 | lineitem 319 | where 320 | l_shipdate >= date '1994-01-01' 321 | and l_shipdate < date '1994-01-01' + interval '1' year 322 | and l_discount between .06 - 0.01 and .06 + 0.01 323 | and l_quantity < 24 324 | 325 | 326 | """).write.mode("overwrite").parquet("/tmp/tpch/TPCDH6.parquet") 327 | 328 | // COMMAND ---------- 329 | 330 | spark.sql(""" 331 | -- 332 | -- TPC-H/TPC-R Volume Shipping Query (Q7) 333 | -- Functional Query Definition 334 | -- Approved February 1998 335 | 336 | 337 | select 338 | supp_nation, 339 | cust_nation, 340 | l_year, 341 | sum(volume) as revenue 342 | from 343 | ( 344 | select 345 | n1.n_name as supp_nation, 346 | n2.n_name as cust_nation, 347 | extract(year from l_shipdate) as l_year, 348 | l_extendedprice * (1 - l_discount) as volume 349 | from 350 | supplier, 351 | lineitem, 352 | orders, 353 | customer, 354 | nation n1, 355 | nation n2 356 | where 357 | s_suppkey = l_suppkey 358 | and o_orderkey = l_orderkey 359 | and c_custkey = o_custkey 360 | and s_nationkey = n1.n_nationkey 361 | and c_nationkey = n2.n_nationkey 362 | and ( 363 | (n1.n_name = 'FRANCE' and n2.n_name = 'GERMANY') 364 | or (n1.n_name = 'GERMANY' and n2.n_name = 'FRANCE') 365 | ) 366 | and l_shipdate between date '1995-01-01' and date '1996-12-31' 367 | ) as shipping 368 | group by 369 | supp_nation, 370 | cust_nation, 371 | l_year 372 | order by 373 | supp_nation, 374 | cust_nation, 375 | l_year 376 | 377 | 378 | """).write.mode("overwrite").parquet("/tmp/tpch/TPCDH7.parquet") 379 | 380 | // COMMAND ---------- 381 | 382 | spark.sql(""" 383 | -- 384 | -- TPC-H/TPC-R National Market Share Query (Q8) 385 | -- Functional Query Definition 386 | -- Approved February 1998 387 | 388 | 389 | select 390 | o_year, 391 | sum(case 392 | when nation = 'BRAZIL' then volume 393 | else 0 394 | end) / sum(volume) as mkt_share 395 | from 396 | ( 397 | select 398 | extract(year from o_orderdate) as o_year, 399 | l_extendedprice * (1 - l_discount) as volume, 400 | n2.n_name as nation 401 | from 402 | part, 403 | supplier, 404 | lineitem, 405 | orders, 406 | customer, 407 | nation n1, 408 | nation n2, 409 | region 410 | where 411 | p_partkey = l_partkey 412 | and s_suppkey = l_suppkey 413 | and l_orderkey = o_orderkey 414 | and o_custkey = c_custkey 415 | and c_nationkey = n1.n_nationkey 416 | and n1.n_regionkey = r_regionkey 417 | and r_name = 'AMERICA' 418 | and s_nationkey = n2.n_nationkey 419 | and o_orderdate between date '1995-01-01' and date '1996-12-31' 420 | and p_type = 'ECONOMY ANODIZED STEEL' 421 | ) as all_nations 422 | group by 423 | o_year 424 | order by 425 | o_year 426 | 427 | 428 | """).write.mode("overwrite").parquet("/tmp/tpch/TPCDH8.parquet") 429 | 430 | // COMMAND ---------- 431 | 432 | spark.sql(""" 433 | -- 434 | -- TPC-H/TPC-R Product Type Profit Measure Query (Q9) 435 | -- Functional Query Definition 436 | -- Approved February 1998 437 | 438 | 439 | select 440 | nation, 441 | o_year, 442 | sum(amount) as sum_profit 443 | from 444 | ( 445 | select 446 | n_name as nation, 447 | extract(year from o_orderdate) as 
o_year, 448 | l_extendedprice * (1 - l_discount) - ps_supplycost * l_quantity as amount 449 | from 450 | part, 451 | supplier, 452 | lineitem, 453 | partsupp, 454 | orders, 455 | nation 456 | where 457 | s_suppkey = l_suppkey 458 | and ps_suppkey = l_suppkey 459 | and ps_partkey = l_partkey 460 | and p_partkey = l_partkey 461 | and o_orderkey = l_orderkey 462 | and s_nationkey = n_nationkey 463 | and p_name like '%green%' 464 | ) as profit 465 | group by 466 | nation, 467 | o_year 468 | order by 469 | nation, 470 | o_year desc 471 | 472 | 473 | """).write.mode("overwrite").parquet("/tmp/tpch/TPCDH9.parquet") 474 | 475 | // COMMAND ---------- 476 | 477 | spark.sql(""" 478 | -- 479 | -- TPC-H/TPC-R Returned Item Reporting Query (Q10) 480 | -- Functional Query Definition 481 | -- Approved February 1998 482 | 483 | 484 | select 485 | c_custkey, 486 | c_name, 487 | sum(l_extendedprice * (1 - l_discount)) as revenue, 488 | c_acctbal, 489 | n_name, 490 | c_address, 491 | c_phone, 492 | c_comment 493 | from 494 | customer, 495 | orders, 496 | lineitem, 497 | nation 498 | where 499 | c_custkey = o_custkey 500 | and l_orderkey = o_orderkey 501 | and o_orderdate >= date '1993-10-01' 502 | and o_orderdate < date '1993-10-01' + interval '3' month 503 | and l_returnflag = 'R' 504 | and c_nationkey = n_nationkey 505 | group by 506 | c_custkey, 507 | c_name, 508 | c_acctbal, 509 | c_phone, 510 | n_name, 511 | c_address, 512 | c_comment 513 | order by 514 | revenue desc 515 | LIMIT 20 516 | 517 | """).write.mode("overwrite").parquet("/tmp/tpch/TPCDH10.parquet") 518 | 519 | // COMMAND ---------- 520 | 521 | spark.sql(""" 522 | -- 523 | -- TPC-H/TPC-R Important Stock Identification Query (Q11) 524 | -- Functional Query Definition 525 | -- Approved February 1998 526 | 527 | 528 | select 529 | ps_partkey, 530 | sum(ps_supplycost * ps_availqty) as value 531 | from 532 | partsupp, 533 | supplier, 534 | nation 535 | where 536 | ps_suppkey = s_suppkey 537 | and s_nationkey = n_nationkey 538 | and n_name = 'GERMANY' 539 | group by 540 | ps_partkey having 541 | sum(ps_supplycost * ps_availqty) > ( 542 | select 543 | sum(ps_supplycost * ps_availqty) * 0.0001 544 | from 545 | partsupp, 546 | supplier, 547 | nation 548 | where 549 | ps_suppkey = s_suppkey 550 | and s_nationkey = n_nationkey 551 | and n_name = 'GERMANY' 552 | ) 553 | order by 554 | value desc 555 | 556 | 557 | """).write.mode("overwrite").parquet("/tmp/tpch/TPCDH11.parquet") 558 | 559 | // COMMAND ---------- 560 | 561 | spark.sql(""" 562 | -- 563 | -- TPC-H/TPC-R Shipping Modes and Order Priority Query (Q12) 564 | -- Functional Query Definition 565 | -- Approved February 1998 566 | 567 | 568 | select 569 | l_shipmode, 570 | sum(case 571 | when o_orderpriority = '1-URGENT' 572 | or o_orderpriority = '2-HIGH' 573 | then 1 574 | else 0 575 | end) as high_line_count, 576 | sum(case 577 | when o_orderpriority <> '1-URGENT' 578 | and o_orderpriority <> '2-HIGH' 579 | then 1 580 | else 0 581 | end) as low_line_count 582 | from 583 | orders, 584 | lineitem 585 | where 586 | o_orderkey = l_orderkey 587 | and l_shipmode in ('MAIL', 'SHIP') 588 | and l_commitdate < l_receiptdate 589 | and l_shipdate < l_commitdate 590 | and l_receiptdate >= date '1994-01-01' 591 | and l_receiptdate < date '1994-01-01' + interval '1' year 592 | group by 593 | l_shipmode 594 | order by 595 | l_shipmode 596 | 597 | 598 | """).write.mode("overwrite").parquet("/tmp/tpch/TPCDH12.parquet") 599 | 600 | // COMMAND ---------- 601 | 602 | spark.sql(""" 603 | -- 604 | -- TPC-H/TPC-R Customer 
Distribution Query (Q13) 605 | -- Functional Query Definition 606 | -- Approved February 1998 607 | 608 | 609 | select 610 | c_count, 611 | count(*) as custdist 612 | from 613 | ( 614 | select 615 | c_custkey, 616 | count(o_orderkey) 617 | from 618 | customer left outer join orders on 619 | c_custkey = o_custkey 620 | and o_comment not like '%special%requests%' 621 | group by 622 | c_custkey 623 | ) as c_orders (c_custkey, c_count) 624 | group by 625 | c_count 626 | order by 627 | custdist desc, 628 | c_count desc 629 | 630 | 631 | """).write.mode("overwrite").parquet("/tmp/tpch/TPCDH13.parquet") 632 | 633 | // COMMAND ---------- 634 | 635 | spark.sql(""" 636 | -- 637 | -- TPC-H/TPC-R Promotion Effect Query (Q14) 638 | -- Functional Query Definition 639 | -- Approved February 1998 640 | 641 | 642 | select 643 | 100.00 * sum(case 644 | when p_type like 'PROMO%' 645 | then l_extendedprice * (1 - l_discount) 646 | else 0 647 | end) / sum(l_extendedprice * (1 - l_discount)) as promo_revenue 648 | from 649 | lineitem, 650 | part 651 | where 652 | l_partkey = p_partkey 653 | and l_shipdate >= date '1995-09-01' 654 | and l_shipdate < date '1995-09-01' + interval '1' month 655 | 656 | 657 | """).write.mode("overwrite").parquet("/tmp/tpch/TPCDH14.parquet") 658 | 659 | // COMMAND ---------- 660 | 661 | spark.sql(""" 662 | -- 663 | -- TPC-H/TPC-R Parts/Supplier Relationship Query (Q16) 664 | -- Functional Query Definition 665 | -- Approved February 1998 666 | 667 | 668 | select 669 | p_brand, 670 | p_type, 671 | p_size, 672 | count(distinct ps_suppkey) as supplier_cnt 673 | from 674 | partsupp, 675 | part 676 | where 677 | p_partkey = ps_partkey 678 | and p_brand <> 'Brand#45' 679 | and p_type not like 'MEDIUM POLISHED%' 680 | and p_size in (49, 14, 23, 45, 19, 3, 36, 9) 681 | and ps_suppkey not in ( 682 | select 683 | s_suppkey 684 | from 685 | supplier 686 | where 687 | s_comment like '%Customer%Complaints%' 688 | ) 689 | group by 690 | p_brand, 691 | p_type, 692 | p_size 693 | order by 694 | supplier_cnt desc, 695 | p_brand, 696 | p_type, 697 | p_size 698 | 699 | 700 | """).write.mode("overwrite").parquet("/tmp/tpch/TPCDH16.parquet") 701 | 702 | // COMMAND ---------- 703 | 704 | spark.sql(""" 705 | -- 706 | -- TPC-H/TPC-R Small-Quantity-Order Revenue Query (Q17) 707 | -- Functional Query Definition 708 | -- Approved February 1998 709 | 710 | 711 | select 712 | sum(l_extendedprice) / 7.0 as avg_yearly 713 | from 714 | lineitem, 715 | part 716 | where 717 | p_partkey = l_partkey 718 | and p_brand = 'Brand#23' 719 | and p_container = 'MED BOX' 720 | and l_quantity < ( 721 | select 722 | 0.2 * avg(l_quantity) 723 | from 724 | lineitem 725 | where 726 | l_partkey = p_partkey 727 | ) 728 | 729 | 730 | """).write.mode("overwrite").parquet("/tmp/tpch/TPCDH17.parquet") 731 | 732 | // COMMAND ---------- 733 | 734 | spark.sql(""" 735 | -- 736 | -- TPC-H/TPC-R Large Volume Customer Query (Q18) 737 | -- Function Query Definition 738 | -- Approved February 1998 739 | 740 | 741 | select 742 | c_name, 743 | c_custkey, 744 | o_orderkey, 745 | o_orderdate, 746 | o_totalprice, 747 | sum(l_quantity) AS sum_l_quantity 748 | from 749 | customer, 750 | orders, 751 | lineitem 752 | where 753 | o_orderkey in ( 754 | select 755 | l_orderkey 756 | from 757 | lineitem 758 | group by 759 | l_orderkey having 760 | sum(l_quantity) > 300 761 | ) 762 | and c_custkey = o_custkey 763 | and o_orderkey = l_orderkey 764 | group by 765 | c_name, 766 | c_custkey, 767 | o_orderkey, 768 | o_orderdate, 769 | o_totalprice 770 | order by 771 
| o_totalprice desc, 772 | o_orderdate 773 | LIMIT 100 774 | 775 | """).write.mode("overwrite").parquet("/tmp/tpch/TPCDH18.parquet") 776 | 777 | // COMMAND ---------- 778 | 779 | spark.sql(""" 780 | -- 781 | -- TPC-H/TPC-R Discounted Revenue Query (Q19) 782 | -- Functional Query Definition 783 | -- Approved February 1998 784 | 785 | 786 | select 787 | sum(l_extendedprice* (1 - l_discount)) as revenue 788 | from 789 | lineitem, 790 | part 791 | where 792 | ( 793 | p_partkey = l_partkey 794 | and p_brand = 'Brand#12' 795 | and p_container in ('SM CASE', 'SM BOX', 'SM PACK', 'SM PKG') 796 | and l_quantity >= 1 and l_quantity <= 1 + 10 797 | and p_size between 1 and 5 798 | and l_shipmode in ('AIR', 'AIR REG') 799 | and l_shipinstruct = 'DELIVER IN PERSON' 800 | ) 801 | or 802 | ( 803 | p_partkey = l_partkey 804 | and p_brand = 'Brand#23' 805 | and p_container in ('MED BAG', 'MED BOX', 'MED PKG', 'MED PACK') 806 | and l_quantity >= 10 and l_quantity <= 10 + 10 807 | and p_size between 1 and 10 808 | and l_shipmode in ('AIR', 'AIR REG') 809 | and l_shipinstruct = 'DELIVER IN PERSON' 810 | ) 811 | or 812 | ( 813 | p_partkey = l_partkey 814 | and p_brand = 'Brand#34' 815 | and p_container in ('LG CASE', 'LG BOX', 'LG PACK', 'LG PKG') 816 | and l_quantity >= 20 and l_quantity <= 20 + 10 817 | and p_size between 1 and 15 818 | and l_shipmode in ('AIR', 'AIR REG') 819 | and l_shipinstruct = 'DELIVER IN PERSON' 820 | ) 821 | 822 | 823 | """).write.mode("overwrite").parquet("/tmp/tpch/TPCDH19.parquet") 824 | 825 | // COMMAND ---------- 826 | 827 | spark.sql(""" 828 | -- 829 | -- TPC-H/TPC-R Potential Part Promotion Query (Q20) 830 | -- Function Query Definition 831 | -- Approved February 1998 832 | 833 | 834 | select 835 | s_name, 836 | s_address 837 | from 838 | supplier, 839 | nation 840 | where 841 | s_suppkey in ( 842 | select 843 | ps_suppkey 844 | from 845 | partsupp 846 | where 847 | ps_partkey in ( 848 | select 849 | p_partkey 850 | from 851 | part 852 | where 853 | p_name like 'forest%' 854 | ) 855 | and ps_availqty > ( 856 | select 857 | 0.5 * sum(l_quantity) 858 | from 859 | lineitem 860 | where 861 | l_partkey = ps_partkey 862 | and l_suppkey = ps_suppkey 863 | and l_shipdate >= date '1994-01-01' 864 | and l_shipdate < date '1994-01-01' + interval '1' year 865 | ) 866 | ) 867 | and s_nationkey = n_nationkey 868 | and n_name = 'CANADA' 869 | order by 870 | s_name 871 | 872 | 873 | """).write.mode("overwrite").parquet("/tmp/tpch/TPCDH20.parquet") 874 | 875 | // COMMAND ---------- 876 | 877 | spark.sql(""" 878 | -- 879 | -- TPC-H/TPC-R Suppliers Who Kept Orders Waiting Query (Q21) 880 | -- Functional Query Definition 881 | -- Approved February 1998 882 | 883 | 884 | select 885 | s_name, 886 | count(*) as numwait 887 | from 888 | supplier, 889 | lineitem l1, 890 | orders, 891 | nation 892 | where 893 | s_suppkey = l1.l_suppkey 894 | and o_orderkey = l1.l_orderkey 895 | and o_orderstatus = 'F' 896 | and l1.l_receiptdate > l1.l_commitdate 897 | and exists ( 898 | select 899 | * 900 | from 901 | lineitem l2 902 | where 903 | l2.l_orderkey = l1.l_orderkey 904 | and l2.l_suppkey <> l1.l_suppkey 905 | ) 906 | and not exists ( 907 | select 908 | * 909 | from 910 | lineitem l3 911 | where 912 | l3.l_orderkey = l1.l_orderkey 913 | and l3.l_suppkey <> l1.l_suppkey 914 | and l3.l_receiptdate > l3.l_commitdate 915 | ) 916 | and s_nationkey = n_nationkey 917 | and n_name = 'SAUDI ARABIA' 918 | group by 919 | s_name 920 | order by 921 | numwait desc, 922 | s_name 923 | LIMIT 100 924 | 925 | 
""").write.mode("overwrite").parquet("/tmp/tpch/TPCDH21.parquet") 926 | 927 | // COMMAND ---------- 928 | 929 | spark.sql(""" 930 | -- 931 | -- TPC-H/TPC-R Global Sales Opportunity Query (Q22) 932 | -- Functional Query Definition 933 | -- Approved February 1998 934 | 935 | 936 | select 937 | cntrycode, 938 | count(*) as numcust, 939 | sum(c_acctbal) as totacctbal 940 | from 941 | ( 942 | select 943 | substring(c_phone, 0, 2) as cntrycode, 944 | c_acctbal 945 | from 946 | customer 947 | where 948 | substring(c_phone, 0, 2) in 949 | ('13', '31', '23', '29', '30', '18', '17') 950 | and c_acctbal > ( 951 | select 952 | avg(c_acctbal) 953 | from 954 | customer 955 | where 956 | c_acctbal > 0.00 957 | and substring(c_phone, 0, 2) in 958 | ('13', '31', '23', '29', '30', '18', '17') 959 | ) 960 | and not exists ( 961 | select 962 | * 963 | from 964 | orders 965 | where 966 | o_custkey = c_custkey 967 | ) 968 | ) as custsale 969 | group by 970 | cntrycode 971 | order by 972 | cntrycode 973 | 974 | 975 | """).write.mode("overwrite").parquet("/tmp/tpch/TPCDH22.parquet") 976 | -------------------------------------------------------------------------------- /notebooks/sample-spark-job.scala: -------------------------------------------------------------------------------- 1 | // Databricks notebook source 2 | System.setProperty("spline.mode", "REQUIRED") 3 | System.setProperty("spline.persistence.factory", "za.co.absa.spline.persistence.mongo.MongoPersistenceFactory") 4 | System.setProperty("spline.mongodb.url", dbutils.secrets.get("spline", "spline.mongodb.url")) 5 | import za.co.absa.spline.core.SparkLineageInitializer._ 6 | spark.enableLineageTracking() 7 | 8 | // COMMAND ---------- 9 | 10 | // MAGIC %python 11 | // MAGIC rawData = spark.read.option("inferSchema", "true").json("/databricks-datasets/structured-streaming/events/") 12 | // MAGIC rawData.createOrReplaceTempView("rawData") 13 | // MAGIC sql("select r1.action, count(*) as actionCount from rawData as r1 join rawData as r2 on r1.action = r2.action group by r1.action").write.mode('overwrite').csv("/tmp/pyaggaction.csv") 14 | 15 | // COMMAND ---------- 16 | 17 | 18 | -------------------------------------------------------------------------------- /pom.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4.0.0 4 | com.cloudarchitected.spline 5 | databricks-spline-parent 6 | pom 7 | 8 | za.co.absa.spline 9 | spline 10 | 0.3.7-SNAPSHOT 11 | spline 12 | 13 | 14 | spline 15 | databricks-spline 16 | 17 | 18 | -------------------------------------------------------------------------------- /provision-databricks.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Strict mode, fail on any error 4 | set -euo pipefail 5 | 6 | echo "Installing Databricks CLI" 7 | sudo apt-get install -y python3-setuptools 8 | pip3 install wheel 9 | pip3 install databricks-cli 10 | sudo ln -s /home/vsts/.local/bin/* /usr/local/bin/ 11 | 12 | # Databricks cluster to be created 13 | cluster_name="$RESOURCE_NAME_PREFIX$BUILD_BUILDID" 14 | 15 | echo "Creating Databricks cluster" 16 | cluster=$(databricks clusters create --json "$(cat << JSON 17 | { 18 | "cluster_name": "$cluster_name", 19 | "spark_version": "5.3.x-scala2.11", 20 | "node_type_id": "Standard_DS3_v2", 21 | "autoscale": { 22 | "min_workers": 1, 23 | "max_workers": 3 24 | }, 25 | "autotermination_minutes": 120 26 | } 27 | JSON 28 | )" 29 | ) 30 | 31 | cluster_id=$(echo $cluster | jq -r .cluster_id) 32 | sleep 10 
#avoid race conditions 33 | 34 | echo "Installing Spline libraries" 35 | databricks_spline=$(ls databricks-spline/target/databricks-spline-*.jar | head -1) 36 | echo "Installing library $databricks_spline" 37 | databricks_spline_base=$(basename "$databricks_spline") 38 | databricks_spline_dbfs="dbfs:/lib/spline/$databricks_spline_base" 39 | databricks fs cp "$databricks_spline" "$databricks_spline_dbfs" --overwrite 40 | databricks libraries install --cluster-id $cluster_id --jar "$databricks_spline_dbfs" 41 | 42 | echo "Provisioning Spline connection string as Databricks secret" 43 | if ! databricks secrets list-scopes --output JSON | jq -e '.scopes[] | select (.name == "spline")'; then 44 | databricks secrets create-scope --scope spline --initial-manage-principal "users" 45 | fi 46 | databricks secrets put --scope spline --key spline.mongodb.url --string-value "$COSMOSDB_CONN_STRING" 47 | 48 | 49 | # Copy and run sample notebooks 50 | 51 | echo "Copying sample notebooks" 52 | databricks workspace import_dir notebooks /Shared/lineage-tutorial --overwrite 53 | 54 | for notebook in notebooks/*.scala; do 55 | 56 | notebook_name=$(basename $notebook .scala) 57 | notebook_path="/Shared/lineage-tutorial/$notebook_name" 58 | echo "Running notebook $notebook_path" 59 | run=$(databricks runs submit --json "$(cat << JSON 60 | { 61 | "name": "SampleRun", 62 | "existing_cluster_id": "$cluster_id", 63 | "timeout_seconds": 1200, 64 | "notebook_task": { 65 | "notebook_path": "$notebook_path" 66 | } 67 | } 68 | JSON 69 | )") 70 | 71 | # Echo job web page URL to task output to facilitate debugging 72 | run_id=$(echo $run | jq .run_id) 73 | databricks runs get --run-id "$run_id" | jq -r .run_page_url 74 | 75 | 76 | done 77 | 78 | -------------------------------------------------------------------------------- /provision-webapp.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Strict mode, fail on any error 4 | set -euo pipefail 5 | 6 | mv spline/web/target/spline-ui*.war ROOT.war 7 | 8 | WEBAPP_NAME="$RESOURCE_NAME_PREFIX$BUILD_BUILDID" 9 | 10 | az appservice plan create -g $RESOURCE_GROUP -n $WEBAPP_NAME -o table 11 | 12 | az webapp create -g $RESOURCE_GROUP -n $WEBAPP_NAME --plan $WEBAPP_NAME -o table 13 | 14 | az webapp config set -g $RESOURCE_GROUP -n $WEBAPP_NAME --java-container TOMCAT --java-container-version 7.0.62 --java-version 1.8 -o table 15 | 16 | az webapp config appsettings set -g $RESOURCE_GROUP -n $WEBAPP_NAME --settings "spline.mongodb.url=$COSMOSDB_CONN_STRING" -o table 17 | 18 | az webapp config set -g $RESOURCE_GROUP -n $WEBAPP_NAME --always-on true -o table 19 | 20 | WEBAPP_URL="https://$(az webapp show -g $RESOURCE_GROUP -n $WEBAPP_NAME | jq -r .defaultHostName)" 21 | 22 | # Set job variables from script 23 | echo "##vso[task.setvariable variable=WEBAPP_NAME]$WEBAPP_NAME" 24 | echo "##vso[task.setvariable variable=WEBAPP_URL]$WEBAPP_URL" 25 | --------------------------------------------------------------------------------
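
Note: the same Spline settings that the notebooks set via System.setProperty (spline.mode, spline.persistence.factory and spline.mongodb.url) can also be passed to a plain spark-shell session, which is a convenient way to smoke-test the shaded databricks-spline jar before provisioning a cluster. The sketch below is not part of the pipeline; it assumes a local Spark 2.4 / Scala 2.11 installation and that COSMOSDB_CONN_STRING holds the connection string produced by before-build.sh (the jar path matches the artifact that the Maven build step produces).

spark-shell \
  --jars databricks-spline/target/databricks-spline-0.3.7-SNAPSHOT.jar \
  --driver-java-options "-Dspline.mode=REQUIRED -Dspline.persistence.factory=za.co.absa.spline.persistence.mongo.MongoPersistenceFactory -Dspline.mongodb.url=$COSMOSDB_CONN_STRING"

// Once the shell is up, enable lineage tracking exactly as the notebooks do;
// any subsequent write action is then reported to the configured MongoDB endpoint.
import za.co.absa.spline.core.SparkLineageInitializer._
spark.enableLineageTracking()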