├── .gitignore
├── README.md
├── adls_demo.dbc
├── create_delta_tables.dbc
├── create_delta_tables.html
├── create_parquet_tables (delta demo in parquet).dbc
├── create_parquet_tables (delta demo in parquet).html
├── data_lake_load.dbc
├── data_lake_load.html
├── data_lake_sql_demo.dbc
├── data_lake_sql_demo.html
├── data_lake_sql_taxi_demo.dbc
├── data_lake_sql_taxi_demo.html
├── stackoverflow_db_ingest_parallel.ipynb
├── streaming_demo.dbc
├── streaming_demo.html
├── structured_streaming_taxi.dbc
├── structured_streaming_taxi.ipynb
└── structured_streaming_video_views.ipynb
/.gitignore:
--------------------------------------------------------------------------------
1 | .DS_Store
2 |
3 | # Byte-compiled / optimized / DLL files
4 | __pycache__/
5 | *.py[cod]
6 | *$py.class
7 |
8 | # C extensions
9 | *.so
10 |
11 | # Distribution / packaging
12 | .Python
13 | build/
14 | develop-eggs/
15 | dist/
16 | downloads/
17 | eggs/
18 | .eggs/
19 | lib/
20 | lib64/
21 | parts/
22 | sdist/
23 | var/
24 | wheels/
25 | *.egg-info/
26 | .installed.cfg
27 | *.egg
28 | MANIFEST
29 |
30 | # PyInstaller
31 | # Usually these files are written by a python script from a template
32 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
33 | *.manifest
34 | *.spec
35 |
36 | # Installer logs
37 | pip-log.txt
38 | pip-delete-this-directory.txt
39 |
40 | # Unit test / coverage reports
41 | htmlcov/
42 | .tox/
43 | .coverage
44 | .coverage.*
45 | .cache
46 | nosetests.xml
47 | coverage.xml
48 | *.cover
49 | .hypothesis/
50 | .pytest_cache/
51 |
52 | # Translations
53 | *.mo
54 | *.pot
55 |
56 | # Django stuff:
57 | *.log
58 | local_settings.py
59 | db.sqlite3
60 |
61 | # Flask stuff:
62 | instance/
63 | .webassets-cache
64 |
65 | # Scrapy stuff:
66 | .scrapy
67 |
68 | # Sphinx documentation
69 | docs/_build/
70 |
71 | # PyBuilder
72 | target/
73 |
74 | # Jupyter Notebook
75 | .ipynb_checkpoints
76 |
77 | # pyenv
78 | .python-version
79 |
80 | # celery beat schedule file
81 | celerybeat-schedule
82 |
83 | # SageMath parsed files
84 | *.sage.py
85 |
86 | # Environments
87 | .env
88 | .venv
89 | env/
90 | venv/
91 | ENV/
92 | env.bak/
93 | venv.bak/
94 |
95 | # Spyder project settings
96 | .spyderproject
97 | .spyproject
98 |
99 | # Rope project settings
100 | .ropeproject
101 |
102 | # mkdocs documentation
103 | /site
104 |
105 | # mypy
106 | .mypy_cache/
107 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # databricks-notebooks
--------------------------------------------------------------------------------
/adls_demo.dbc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/datakickstart/databricks-notebooks/69d9d8dff18eaa62ca3a897a7f107d46a307b495/adls_demo.dbc
--------------------------------------------------------------------------------
/create_delta_tables.dbc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/datakickstart/databricks-notebooks/69d9d8dff18eaa62ca3a897a7f107d46a307b495/create_delta_tables.dbc
--------------------------------------------------------------------------------
/create_parquet_tables (delta demo in parquet).dbc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/datakickstart/databricks-notebooks/69d9d8dff18eaa62ca3a897a7f107d46a307b495/create_parquet_tables%20(delta%20demo%20in%20parquet).dbc
--------------------------------------------------------------------------------
/data_lake_load.dbc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/datakickstart/databricks-notebooks/69d9d8dff18eaa62ca3a897a7f107d46a307b495/data_lake_load.dbc
--------------------------------------------------------------------------------
/data_lake_sql_demo.dbc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/datakickstart/databricks-notebooks/69d9d8dff18eaa62ca3a897a7f107d46a307b495/data_lake_sql_demo.dbc
--------------------------------------------------------------------------------
/data_lake_sql_taxi_demo.dbc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/datakickstart/databricks-notebooks/69d9d8dff18eaa62ca3a897a7f107d46a307b495/data_lake_sql_taxi_demo.dbc
--------------------------------------------------------------------------------
/stackoverflow_db_ingest_parallel.ipynb:
--------------------------------------------------------------------------------
1 | {"cells":[{"cell_type":"code","source":["# On Databricks, need to add library for com.microsoft.sqlserver.jdbc.spark and set secrets\ndatabase = \"StackOverflow2010\"\ndb_host_name = \"sandbox-2-sqlserver.database.windows.net\"\ndb_url = f\"jdbc:sqlserver://{db_host_name};databaseName={database}\"\ndb_user = dbutils.secrets.get(\"demo\", \"sql-user-stackoverflow\") # databricks\ndb_password = dbutils.secrets.get(\"demo\", \"sql-pwd-stackoverflow\") #databricks"],"metadata":{"application/vnd.databricks.v1+cell":{"title":"","showTitle":false,"inputWidgets":{},"nuid":"33b22a3a-0fd8-49e6-aabe-1d5527348733"}},"outputs":[{"output_type":"display_data","metadata":{"application/vnd.databricks.v1+output":{"datasetInfos":[],"data":"
","removedWidgets":[],"addedWidgets":{},"metadata":{},"type":"html","arguments":{}}},"output_type":"display_data","data":{"text/html":["\n"]}}],"execution_count":0},{"cell_type":"code","source":["table_list = [\"Badges\", \"Comments\", \"LinkTypes\", \"PostLinks\", \"Posts\", \"PostTypes\", \"Users\", \"Votes\", \"VoteTypes\"]\nspark.sql(f\"CREATE DATABASE IF NOT EXISTS raw_stackoverflow LOCATION '/demo/raw_stackoverflow'\")\n\ndef load_table(table):\n print(table)\n destination_table = \"raw_stackoverflow.\" + table\n\n df = (\n spark.read\n .format(\"com.microsoft.sqlserver.jdbc.spark\")\n .option(\"url\", db_url)\n .option(\"dbtable\", table)\n .option(\"user\", db_user)\n .option(\"password\", db_password)\n .load()\n )\n\n df.write.format(\"parquet\").mode(\"overwrite\").saveAsTable(destination_table)"],"metadata":{"application/vnd.databricks.v1+cell":{"title":"","showTitle":false,"inputWidgets":{},"nuid":"22552cb2-2b51-4312-a392-93426131fc59"}},"outputs":[{"output_type":"display_data","metadata":{"application/vnd.databricks.v1+output":{"datasetInfos":[],"data":"","removedWidgets":[],"addedWidgets":{},"metadata":{},"type":"html","arguments":{}}},"output_type":"display_data","data":{"text/html":["\n"]}}],"execution_count":0},{"cell_type":"code","source":["load_table(\"LinkTypes\")"],"metadata":{"application/vnd.databricks.v1+cell":{"title":"","showTitle":false,"inputWidgets":{},"nuid":"47cab839-83de-4399-96ba-bd0f863669a2"}},"outputs":[{"output_type":"display_data","metadata":{"application/vnd.databricks.v1+output":{"datasetInfos":[],"data":"LinkTypes\n
","removedWidgets":[],"addedWidgets":{},"metadata":{},"type":"html","arguments":{}}},"output_type":"display_data","data":{"text/html":["\nLinkTypes\n
"]}}],"execution_count":0},{"cell_type":"code","source":["for table in table_list:\n load_table(table)"],"metadata":{"application/vnd.databricks.v1+cell":{"title":"","showTitle":false,"inputWidgets":{},"nuid":"3e256ff8-d6ee-4c86-bcee-e7c4881d4c35"}},"outputs":[{"output_type":"display_data","metadata":{"application/vnd.databricks.v1+output":{"datasetInfos":[],"data":"Badges\nComments\nLinkTypes\nPostLinks\nPosts\nPostTypes\nUsers\nVotes\nVoteTypes\n
","removedWidgets":[],"addedWidgets":{},"metadata":{},"type":"html","arguments":{}}},"output_type":"display_data","data":{"text/html":["\nBadges\nComments\nLinkTypes\nPostLinks\nPosts\nPostTypes\nUsers\nVotes\nVoteTypes\n
"]}}],"execution_count":0},{"cell_type":"code","source":["from threading import Thread\nfrom queue import Queue\n\nq = Queue()\n\nworker_count = 2\n\ndef run_tasks(function, q):\n while not q.empty():\n value = q.get()\n function(value)\n q.task_done()\n\n\nprint(table_list)\n\nfor table in table_list:\n q.put(table)\n\nfor i in range(worker_count):\n t=Thread(target=run_tasks, args=(load_table, q))\n t.daemon = True\n t.start()\n\nq.join()\n"],"metadata":{"application/vnd.databricks.v1+cell":{"title":"","showTitle":false,"inputWidgets":{},"nuid":"083455fd-9139-4171-bab1-acce3654d10e"}},"outputs":[{"output_type":"display_data","metadata":{"application/vnd.databricks.v1+output":{"datasetInfos":[],"data":"['Badges', 'Comments', 'LinkTypes', 'PostLinks', 'Posts', 'PostTypes', 'Users', 'Votes', 'VoteTypes']\nBadges\nComments\nLinkTypes\nPostLinks\nPosts\nPostTypes\nUsers\nVotes\nVoteTypes\n
","removedWidgets":[],"addedWidgets":{},"metadata":{},"type":"html","arguments":{}}},"output_type":"display_data","data":{"text/html":["\n['Badges', 'Comments', 'LinkTypes', 'PostLinks', 'Posts', 'PostTypes', 'Users', 'Votes', 'VoteTypes']\nBadges\nComments\nLinkTypes\nPostLinks\nPosts\nPostTypes\nUsers\nVotes\nVoteTypes\n
"]}}],"execution_count":0}],"metadata":{"application/vnd.databricks.v1+notebook":{"notebookName":"stackoverflow_db_ingest_parallel","dashboards":[],"notebookMetadata":{"pythonIndentUnit":2},"language":"python","widgets":{},"notebookOrigID":2273523373550625}},"nbformat":4,"nbformat_minor":0}
2 |
--------------------------------------------------------------------------------
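
The ingest notebook above parallelizes the JDBC reads by having daemon threads pull table names from a shared Queue. A quick way to confirm that every table landed is to list the raw_stackoverflow database and count rows per table. The cell below is a minimal sketch that is not part of the repository; it assumes the notebook has already run on a Databricks cluster where spark is available.

# Hypothetical verification cell (not in the repo): count rows in each ingested table.
tables = [row.tableName for row in spark.sql("SHOW TABLES IN raw_stackoverflow").collect()]

for table_name in tables:
  row_count = spark.table(f"raw_stackoverflow.{table_name}").count()
  print(f"{table_name}: {row_count} rows")
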
/streaming_demo.dbc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/datakickstart/databricks-notebooks/69d9d8dff18eaa62ca3a897a7f107d46a307b495/streaming_demo.dbc
--------------------------------------------------------------------------------
/structured_streaming_taxi.dbc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/datakickstart/databricks-notebooks/69d9d8dff18eaa62ca3a897a7f107d46a307b495/structured_streaming_taxi.dbc
--------------------------------------------------------------------------------
/structured_streaming_video_views.ipynb:
--------------------------------------------------------------------------------
1 | {"cells":[{"cell_type":"markdown","source":["## Stream Processing To Delta Lake\nMust have data streaming to the topic \"video_usage\".\n\n*Note: If not working, try changing the GROUP_ID and Consumer Group values to reset"],"metadata":{"application/vnd.databricks.v1+cell":{"title":"","showTitle":false,"inputWidgets":{},"nuid":"b1e8b926-300f-492d-8a89-b8cb8f00b8e1"}}},{"cell_type":"markdown","source":["### Shared imports and variables\nRun this first since most cells below need at least one of these imports or variables"],"metadata":{"application/vnd.databricks.v1+cell":{"title":"","showTitle":false,"inputWidgets":{},"nuid":"3878a9bc-b383-406b-b5bb-17610b42bb81"}}},{"cell_type":"code","source":["from pyspark.sql.functions import col, desc, regexp_replace, substring, to_date, from_json, explode, expr\nfrom pyspark.sql.types import StructType, StructField, ArrayType, StringType, IntegerType, BooleanType, TimestampType\n\ndate_format = \"yyyy-MM-dd HH:mm:ss\"\n\nvideo_views_delta_path = \"/mnt/adlsdemo/usage/video\"\n\n# Define a schema that Spark understands. This is one of several ways to do it.\nusage_schema = StructType([\n StructField(\"usageId\", IntegerType()),\n StructField(\"user\", StringType()),\n StructField(\"completed\", BooleanType()),\n StructField(\"durationSeconds\", IntegerType()),\n StructField(\"eventTimestamp\", TimestampType())\n])\n"],"metadata":{"application/vnd.databricks.v1+cell":{"title":"","showTitle":false,"inputWidgets":{},"nuid":"1344cb4f-3e6f-4ddb-82cc-ef73e434e599"}},"outputs":[{"output_type":"display_data","metadata":{"application/vnd.databricks.v1+output":{"datasetInfos":[],"data":"","removedWidgets":[],"addedWidgets":{},"metadata":{},"type":"html","arguments":{}}},"output_type":"display_data","data":{"text/html":["\n"]}}],"execution_count":0},{"cell_type":"markdown","source":["### Stream Load of incoming data - Video Views\nRead streaming data from Confluent Cloud or Event Hubs (using Apache Kafka API) and save in the same delta location within Azure Data Lake Storage (ADLS)."],"metadata":{"application/vnd.databricks.v1+cell":{"title":"","showTitle":false,"inputWidgets":{},"nuid":"89164e3f-9f4e-44d0-860b-6356a4024014"}}},{"cell_type":"code","source":["spark.conf.set(\"spark.sql.legacy.timeParserPolicy\", \"LEGACY\")\n\nrun_version = \"v0.5\"\n\ntopic = 'video_usage'\nGROUP_ID = f'tst-group-video-usage-{run_version}'\n\n# To setup Key Vault backed secret scope for this first time, replace items in url and follow instructions: \n# https:///#secrets/createScopeSetup\n\ndef get_event_hub_config():\n # Password is really a Event Hub connection string, for example ->\n # Endpoint=sb://.servicebus.windows.net/;SharedAccessKeyName=ReadWriteTmp;SharedAccessKey=vhNXxXXXXXxxxXXXXXXXxx=;EntityPath=demo-message-1\n password = dbutils.secrets.get(scope = \"demo\", key = \"eh-sasl-{0}\".format(topic))\n\n EH_SASL = 'kafkashaded.org.apache.kafka.common.security.plain.PlainLoginModule required username=\"$ConnectionString\" password=\"{0}\";'.format(password)\n\n config = {\n 'kafka.bootstrap.servers': 'dustin-demo-eh.servicebus.windows.net:9093',\n 'kafka.security.protocol': 'SASL_SSL',\n 'kafka.sasl.mechanism': 'PLAIN',\n 'kafka.group.id': GROUP_ID,\n 'kafka.request.timeout.ms': \"60000\",\n 'kafka.session.timeout.ms': \"20000\",\n 'kafka.heartbeat.interval.ms': \"10000\",\n 'kafka.sasl.jaas.config': EH_SASL,\n 'subscribe': topic\n }\n return config\n\n\ndef get_confluent_cloud_config():\n bootstrap_servers = dbutils.secrets.get(scope = \"demo\", key = 
\"confluent-cloud-brokers\")\n username = dbutils.secrets.get(scope = \"demo\", key = \"confluent-cloud-user\")\n password = dbutils.secrets.get(scope = \"demo\", key = \"confluent-cloud-password\")\n SASL = 'kafkashaded.org.apache.kafka.common.security.plain.PlainLoginModule required username=\"{0}\" password=\"{1}\";'.format(username, password)\n\n config = {\n 'kafka.bootstrap.servers': bootstrap_servers,\n 'kafka.security.protocol': 'SASL_SSL',\n 'kafka.sasl.mechanism': 'PLAIN',\n 'kafka.group.id': GROUP_ID,\n 'kafka.request.timeout.ms': \"60000\",\n 'kafka.session.timeout.ms': \"20000\",\n 'kafka.heartbeat.interval.ms': \"10000\",\n 'kafka.sasl.jaas.config': SASL,\n 'subscribe': topic\n }\n return config\n\n\nconsumer_config = get_confluent_cloud_config()\n \n# Read from Kafka, format will be a kafka record\ninput_df = spark.readStream.format(\"kafka\").options(**consumer_config).load()\n\n# Cast just the value as a string (instead of bytes) then use from_json to convert to an object matching the schema\njson_df = (\n input_df.select(\n from_json(col(\"value\").cast(\"string\"), usage_schema).alias(\"json\"),\n col(\"value\").cast(\"string\").alias(\"value_raw\")\n )\n)\n\n# Select all attribues from json as individual columns, cast trip_distance, add columns\ntransformed_df = (\n json_df\n .select(\"json.*\", \"value_raw\")\n)\n\n# display(transformed_df)\n"],"metadata":{"application/vnd.databricks.v1+cell":{"title":"","showTitle":false,"inputWidgets":{},"nuid":"6cb3f955-8a03-4bad-813f-00f260eba662"}},"outputs":[{"output_type":"display_data","metadata":{"application/vnd.databricks.v1+output":{"datasetInfos":[],"data":"","removedWidgets":[],"addedWidgets":{},"metadata":{},"type":"html","arguments":{}}},"output_type":"display_data","data":{"text/html":["\n"]}}],"execution_count":0},{"cell_type":"markdown","source":["## Azure Storage as a destination\n* One option for streaming output is to write directly to you data lake storage (Azure Data Lake Storage Gen 2 or standard Azure Blob Storage).\n* Databricks Delta / Delta Lake file format makes this more efficient, but could do with Parquet, Avro or other formats."],"metadata":{"application/vnd.databricks.v1+cell":{"title":"","showTitle":false,"inputWidgets":{},"nuid":"78ebdbd7-98d2-4b1c-b9de-b7e171203223"}}},{"cell_type":"code","source":["video_views_delta_path_2 = video_views_delta_path + \"_\" + run_version\n\n(\ntransformed_df.writeStream\n .queryName(\"StreamingVideoViewsDelta\")\n .format(\"delta\")\n .outputMode(\"append\")\n .trigger(processingTime=\"5 seconds\")\n .option(\"checkpointLocation\", f\"/delta/events/_checkpoints/streaming_video_views_{run_version}\")\n .start(video_views_delta_path_2)\n)"],"metadata":{"application/vnd.databricks.v1+cell":{"title":"","showTitle":false,"inputWidgets":{},"nuid":"555ac728-8e1e-4a1e-98b9-f0d188997bc1"}},"outputs":[{"output_type":"display_data","metadata":{"application/vnd.databricks.v1+output":{"datasetInfos":[],"data":"Out[8]: <pyspark.sql.streaming.StreamingQuery at 0x7f1e501f3b80>
","removedWidgets":[],"addedWidgets":{},"metadata":{},"type":"html","arguments":{}}},"output_type":"display_data","data":{"text/html":["\nOut[8]: <pyspark.sql.streaming.StreamingQuery at 0x7f1e501f3b80>
"]}}],"execution_count":0},{"cell_type":"code","source":["# Read data out of delta table\n# delta_stream_df = spark.readStream.format(\"delta\").load(video_views_delta_path_2)\n# display(delta_stream_df)"],"metadata":{"application/vnd.databricks.v1+cell":{"title":"","showTitle":false,"inputWidgets":{},"nuid":"432ab17f-e654-49e9-8353-e13885031b6a"}},"outputs":[{"output_type":"display_data","metadata":{"application/vnd.databricks.v1+output":{"datasetInfos":[],"data":"","removedWidgets":[],"addedWidgets":{},"metadata":{},"type":"html","arguments":{}}},"output_type":"display_data","data":{"text/html":["\n"]}}],"execution_count":0},{"cell_type":"markdown","source":["### Alternatively: Send transformed data to Event Hubs for next steps in pipeline"],"metadata":{"application/vnd.databricks.v1+cell":{"title":"","showTitle":false,"inputWidgets":{},"nuid":"e6ad06ad-cc0d-47ae-ad8c-0197ae317832"}}},{"cell_type":"code","source":["# topic2 = 'demo-message-transformed'\n\n# producer_config = consumer_config\n# producer_config.pop('subscribe')\n# producer_config['topic'] = topic2\n\n# kafka_output_df = trip_df.selectExpr(\n# \"CAST(VendorId as STRING) as key\",\n# \"to_json(struct(*)) as value\")\n\n# # display(kafka_output_df)\n# kafka_output_df.writeStream \\\n# .format(\"kafka\") \\\n# .options(**producer_config) \\\n# .option(\"checkpointLocation\", f\"/delta/events/_checkpoints/cp_{run_version}\") \\\n# .start()"],"metadata":{"application/vnd.databricks.v1+cell":{"title":"","showTitle":false,"inputWidgets":{},"nuid":"38ac40d7-7eb0-40ba-ac24-79f86a84fdd1"}},"outputs":[{"output_type":"display_data","metadata":{"application/vnd.databricks.v1+output":{"datasetInfos":[],"data":"","removedWidgets":[],"addedWidgets":{},"metadata":{},"type":"html","arguments":{}}},"output_type":"display_data","data":{"text/html":["\n"]}}],"execution_count":0}],"metadata":{"application/vnd.databricks.v1+notebook":{"notebookName":"structured_streaming_video_views","dashboards":[],"notebookMetadata":{"pythonIndentUnit":2},"language":"python","widgets":{},"notebookOrigID":872950405059611}},"nbformat":4,"nbformat_minor":0}
2 |
--------------------------------------------------------------------------------
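
structured_streaming_video_views.ipynb only consumes from the "video_usage" topic; it assumes something upstream is already producing JSON events that match usage_schema. The sketch below shows one way to publish test events with the kafka-python client. It is not part of the repository, and the broker address, credential placeholders, and choice of client library are assumptions; substitute your own Confluent Cloud or Event Hubs connection details.

# Hypothetical test producer (not in the repo): publish JSON events matching usage_schema
# to the "video_usage" topic so the streaming notebook has data to read.
import json
import random
import time
from datetime import datetime, timezone

from kafka import KafkaProducer  # pip install kafka-python

producer = KafkaProducer(
  bootstrap_servers="<broker-host>:9092",    # assumption: replace with your Kafka brokers
  security_protocol="SASL_SSL",
  sasl_mechanism="PLAIN",
  sasl_plain_username="<api-key>",           # assumption: Confluent Cloud API key, or "$ConnectionString" for Event Hubs
  sasl_plain_password="<api-secret>",        # assumption: API secret, or an Event Hub connection string
  value_serializer=lambda v: json.dumps(v).encode("utf-8"),
)

for usage_id in range(100):
  event = {
    "usageId": usage_id,
    "user": f"user_{random.randint(1, 20)}",
    "completed": random.random() > 0.5,
    "durationSeconds": random.randint(10, 600),
    "eventTimestamp": datetime.now(timezone.utc).strftime("%Y-%m-%d %H:%M:%S"),
  }
  producer.send("video_usage", value=event)
  time.sleep(0.5)

producer.flush()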