├── .gitignore
├── README.md
├── adls_demo.dbc
├── create_delta_tables.dbc
├── create_delta_tables.html
├── create_parquet_tables (delta demo in parquet).dbc
├── create_parquet_tables (delta demo in parquet).html
├── data_lake_load.dbc
├── data_lake_load.html
├── data_lake_sql_demo.dbc
├── data_lake_sql_demo.html
├── data_lake_sql_taxi_demo.dbc
├── data_lake_sql_taxi_demo.html
├── stackoverflow_db_ingest_parallel.ipynb
├── streaming_demo.dbc
├── streaming_demo.html
├── structured_streaming_taxi.dbc
├── structured_streaming_taxi.ipynb
└── structured_streaming_video_views.ipynb
/.gitignore:
--------------------------------------------------------------------------------
1 | .DS_Store
2 |
3 | # Byte-compiled / optimized / DLL files
4 | __pycache__/
5 | *.py[cod]
6 | *$py.class
7 |
8 | # C extensions
9 | *.so
10 |
11 | # Distribution / packaging
12 | .Python
13 | build/
14 | develop-eggs/
15 | dist/
16 | downloads/
17 | eggs/
18 | .eggs/
19 | lib/
20 | lib64/
21 | parts/
22 | sdist/
23 | var/
24 | wheels/
25 | *.egg-info/
26 | .installed.cfg
27 | *.egg
28 | MANIFEST
29 |
30 | # PyInstaller
31 | # Usually these files are written by a python script from a template
32 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
33 | *.manifest
34 | *.spec
35 |
36 | # Installer logs
37 | pip-log.txt
38 | pip-delete-this-directory.txt
39 |
40 | # Unit test / coverage reports
41 | htmlcov/
42 | .tox/
43 | .coverage
44 | .coverage.*
45 | .cache
46 | nosetests.xml
47 | coverage.xml
48 | *.cover
49 | .hypothesis/
50 | .pytest_cache/
51 |
52 | # Translations
53 | *.mo
54 | *.pot
55 |
56 | # Django stuff:
57 | *.log
58 | local_settings.py
59 | db.sqlite3
60 |
61 | # Flask stuff:
62 | instance/
63 | .webassets-cache
64 |
65 | # Scrapy stuff:
66 | .scrapy
67 |
68 | # Sphinx documentation
69 | docs/_build/
70 |
71 | # PyBuilder
72 | target/
73 |
74 | # Jupyter Notebook
75 | .ipynb_checkpoints
76 |
77 | # pyenv
78 | .python-version
79 |
80 | # celery beat schedule file
81 | celerybeat-schedule
82 |
83 | # SageMath parsed files
84 | *.sage.py
85 |
86 | # Environments
87 | .env
88 | .venv
89 | env/
90 | venv/
91 | ENV/
92 | env.bak/
93 | venv.bak/
94 |
95 | # Spyder project settings
96 | .spyderproject
97 | .spyproject
98 |
99 | # Rope project settings
100 | .ropeproject
101 |
102 | # mkdocs documentation
103 | /site
104 |
105 | # mypy
106 | .mypy_cache/
107 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # databricks-notebooks
--------------------------------------------------------------------------------
/adls_demo.dbc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/datakickstart/databricks-notebooks/69d9d8dff18eaa62ca3a897a7f107d46a307b495/adls_demo.dbc
--------------------------------------------------------------------------------
/create_delta_tables.dbc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/datakickstart/databricks-notebooks/69d9d8dff18eaa62ca3a897a7f107d46a307b495/create_delta_tables.dbc
--------------------------------------------------------------------------------
/create_parquet_tables (delta demo in parquet).dbc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/datakickstart/databricks-notebooks/69d9d8dff18eaa62ca3a897a7f107d46a307b495/create_parquet_tables%20(delta%20demo%20in%20parquet).dbc
--------------------------------------------------------------------------------
/data_lake_load.dbc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/datakickstart/databricks-notebooks/69d9d8dff18eaa62ca3a897a7f107d46a307b495/data_lake_load.dbc
--------------------------------------------------------------------------------
/data_lake_sql_demo.dbc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/datakickstart/databricks-notebooks/69d9d8dff18eaa62ca3a897a7f107d46a307b495/data_lake_sql_demo.dbc
--------------------------------------------------------------------------------
/data_lake_sql_taxi_demo.dbc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/datakickstart/databricks-notebooks/69d9d8dff18eaa62ca3a897a7f107d46a307b495/data_lake_sql_taxi_demo.dbc
--------------------------------------------------------------------------------
/stackoverflow_db_ingest_parallel.ipynb:
--------------------------------------------------------------------------------
1 | {"cells":[{"cell_type":"code","source":["# On Databricks, need to add library for com.microsoft.sqlserver.jdbc.spark and set secrets\ndatabase = \"StackOverflow2010\"\ndb_host_name = \"sandbox-2-sqlserver.database.windows.net\"\ndb_url = f\"jdbc:sqlserver://{db_host_name};databaseName={database}\"\ndb_user = dbutils.secrets.get(\"demo\", \"sql-user-stackoverflow\") # databricks\ndb_password = dbutils.secrets.get(\"demo\", \"sql-pwd-stackoverflow\") #databricks"],"metadata":{"application/vnd.databricks.v1+cell":{"title":"","showTitle":false,"inputWidgets":{},"nuid":"33b22a3a-0fd8-49e6-aabe-1d5527348733"}},"outputs":[{"output_type":"display_data","metadata":{"application/vnd.databricks.v1+output":{"datasetInfos":[],"data":"
","removedWidgets":[],"addedWidgets":{},"metadata":{},"type":"html","arguments":{}}},"output_type":"display_data","data":{"text/html":["\n"]}}],"execution_count":0},{"cell_type":"code","source":["table_list = [\"Badges\", \"Comments\", \"LinkTypes\", \"PostLinks\", \"Posts\", \"PostTypes\", \"Users\", \"Votes\", \"VoteTypes\"]\nspark.sql(f\"CREATE DATABASE IF NOT EXISTS raw_stackoverflow LOCATION '/demo/raw_stackoverflow'\")\n\ndef load_table(table):\n print(table)\n destination_table = \"raw_stackoverflow.\" + table\n\n df = (\n spark.read\n .format(\"com.microsoft.sqlserver.jdbc.spark\")\n .option(\"url\", db_url)\n .option(\"dbtable\", table)\n .option(\"user\", db_user)\n .option(\"password\", db_password)\n .load()\n )\n\n df.write.format(\"parquet\").mode(\"overwrite\").saveAsTable(destination_table)"],"metadata":{"application/vnd.databricks.v1+cell":{"title":"","showTitle":false,"inputWidgets":{},"nuid":"22552cb2-2b51-4312-a392-93426131fc59"}},"outputs":[{"output_type":"display_data","metadata":{"application/vnd.databricks.v1+output":{"datasetInfos":[],"data":"","removedWidgets":[],"addedWidgets":{},"metadata":{},"type":"html","arguments":{}}},"output_type":"display_data","data":{"text/html":["\n"]}}],"execution_count":0},{"cell_type":"code","source":["load_table(\"LinkTypes\")"],"metadata":{"application/vnd.databricks.v1+cell":{"title":"","showTitle":false,"inputWidgets":{},"nuid":"47cab839-83de-4399-96ba-bd0f863669a2"}},"outputs":[{"output_type":"display_data","metadata":{"application/vnd.databricks.v1+output":{"datasetInfos":[],"data":"LinkTypes\n
","removedWidgets":[],"addedWidgets":{},"metadata":{},"type":"html","arguments":{}}},"output_type":"display_data","data":{"text/html":["\nLinkTypes\n
"]}}],"execution_count":0},{"cell_type":"code","source":["for table in table_list:\n load_table(table)"],"metadata":{"application/vnd.databricks.v1+cell":{"title":"","showTitle":false,"inputWidgets":{},"nuid":"3e256ff8-d6ee-4c86-bcee-e7c4881d4c35"}},"outputs":[{"output_type":"display_data","metadata":{"application/vnd.databricks.v1+output":{"datasetInfos":[],"data":"Badges\nComments\nLinkTypes\nPostLinks\nPosts\nPostTypes\nUsers\nVotes\nVoteTypes\n
","removedWidgets":[],"addedWidgets":{},"metadata":{},"type":"html","arguments":{}}},"output_type":"display_data","data":{"text/html":["\nBadges\nComments\nLinkTypes\nPostLinks\nPosts\nPostTypes\nUsers\nVotes\nVoteTypes\n
"]}}],"execution_count":0},{"cell_type":"code","source":["from threading import Thread\nfrom queue import Queue\n\nq = Queue()\n\nworker_count = 2\n\ndef run_tasks(function, q):\n while not q.empty():\n value = q.get()\n function(value)\n q.task_done()\n\n\nprint(table_list)\n\nfor table in table_list:\n q.put(table)\n\nfor i in range(worker_count):\n t=Thread(target=run_tasks, args=(load_table, q))\n t.daemon = True\n t.start()\n\nq.join()\n"],"metadata":{"application/vnd.databricks.v1+cell":{"title":"","showTitle":false,"inputWidgets":{},"nuid":"083455fd-9139-4171-bab1-acce3654d10e"}},"outputs":[{"output_type":"display_data","metadata":{"application/vnd.databricks.v1+output":{"datasetInfos":[],"data":"['Badges', 'Comments', 'LinkTypes', 'PostLinks', 'Posts', 'PostTypes', 'Users', 'Votes', 'VoteTypes']\nBadges\nComments\nLinkTypes\nPostLinks\nPosts\nPostTypes\nUsers\nVotes\nVoteTypes\n
","removedWidgets":[],"addedWidgets":{},"metadata":{},"type":"html","arguments":{}}},"output_type":"display_data","data":{"text/html":["\n['Badges', 'Comments', 'LinkTypes', 'PostLinks', 'Posts', 'PostTypes', 'Users', 'Votes', 'VoteTypes']\nBadges\nComments\nLinkTypes\nPostLinks\nPosts\nPostTypes\nUsers\nVotes\nVoteTypes\n
"]}}],"execution_count":0}],"metadata":{"application/vnd.databricks.v1+notebook":{"notebookName":"stackoverflow_db_ingest_parallel","dashboards":[],"notebookMetadata":{"pythonIndentUnit":2},"language":"python","widgets":{},"notebookOrigID":2273523373550625}},"nbformat":4,"nbformat_minor":0}
2 |
--------------------------------------------------------------------------------
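
The ingest notebook above parallelizes the JDBC reads by having daemon threads pull table names from a shared Queue. A quick way to confirm that every table landed is to list the raw_stackoverflow database and count rows per table. The cell below is a minimal sketch that is not part of the repository; it assumes the notebook has already run on a Databricks cluster where spark is available.

# Hypothetical verification cell (not in the repo): count rows in each ingested table.
tables = [row.tableName for row in spark.sql("SHOW TABLES IN raw_stackoverflow").collect()]

for table_name in tables:
  row_count = spark.table(f"raw_stackoverflow.{table_name}").count()
  print(f"{table_name}: {row_count} rows")
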
/streaming_demo.dbc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/datakickstart/databricks-notebooks/69d9d8dff18eaa62ca3a897a7f107d46a307b495/streaming_demo.dbc
--------------------------------------------------------------------------------
/structured_streaming_taxi.dbc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/datakickstart/databricks-notebooks/69d9d8dff18eaa62ca3a897a7f107d46a307b495/structured_streaming_taxi.dbc
--------------------------------------------------------------------------------
/structured_streaming_video_views.ipynb:
--------------------------------------------------------------------------------
1 | {"cells":[{"cell_type":"markdown","source":["## Stream Processing To Delta Lake\nMust have data streaming to the topic \"video_usage\".\n\n*Note: If not working, try changing the GROUP_ID and Consumer Group values to reset"],"metadata":{"application/vnd.databricks.v1+cell":{"title":"","showTitle":false,"inputWidgets":{},"nuid":"b1e8b926-300f-492d-8a89-b8cb8f00b8e1"}}},{"cell_type":"markdown","source":["### Shared imports and variables\nRun this first since most cells below need at least one of these imports or variables"],"metadata":{"application/vnd.databricks.v1+cell":{"title":"","showTitle":false,"inputWidgets":{},"nuid":"3878a9bc-b383-406b-b5bb-17610b42bb81"}}},{"cell_type":"code","source":["from pyspark.sql.functions import col, desc, regexp_replace, substring, to_date, from_json, explode, expr\nfrom pyspark.sql.types import StructType, StructField, ArrayType, StringType, IntegerType, BooleanType, TimestampType\n\ndate_format = \"yyyy-MM-dd HH:mm:ss\"\n\nvideo_views_delta_path = \"/mnt/adlsdemo/usage/video\"\n\n# Define a schema that Spark understands. This is one of several ways to do it.\nusage_schema = StructType([\n StructField(\"usageId\", IntegerType()),\n StructField(\"user\", StringType()),\n StructField(\"completed\", BooleanType()),\n StructField(\"durationSeconds\", IntegerType()),\n StructField(\"eventTimestamp\", TimestampType())\n])\n"],"metadata":{"application/vnd.databricks.v1+cell":{"title":"","showTitle":false,"inputWidgets":{},"nuid":"1344cb4f-3e6f-4ddb-82cc-ef73e434e599"}},"outputs":[{"output_type":"display_data","metadata":{"application/vnd.databricks.v1+output":{"datasetInfos":[],"data":"","removedWidgets":[],"addedWidgets":{},"metadata":{},"type":"html","arguments":{}}},"output_type":"display_data","data":{"text/html":["\n"]}}],"execution_count":0},{"cell_type":"markdown","source":["### Stream Load of incoming data - Video Views\nRead streaming data from Confluent Cloud or Event Hubs (using Apache Kafka API) and save in the same delta location within Azure Data Lake Storage (ADLS)."],"metadata":{"application/vnd.databricks.v1+cell":{"title":"","showTitle":false,"inputWidgets":{},"nuid":"89164e3f-9f4e-44d0-860b-6356a4024014"}}},{"cell_type":"code","source":["spark.conf.set(\"spark.sql.legacy.timeParserPolicy\", \"LEGACY\")\n\nrun_version = \"v0.5\"\n\ntopic = 'video_usage'\nGROUP_ID = f'tst-group-video-usage-{run_version}'\n\n# To setup Key Vault backed secret scope for this first time, replace items in url and follow instructions: \n# https:///#secrets/createScopeSetup\n\ndef get_event_hub_config():\n # Password is really a Event Hub connection string, for example ->\n # Endpoint=sb://.servicebus.windows.net/;SharedAccessKeyName=ReadWriteTmp;SharedAccessKey=vhNXxXXXXXxxxXXXXXXXxx=;EntityPath=demo-message-1\n password = dbutils.secrets.get(scope = \"demo\", key = \"eh-sasl-{0}\".format(topic))\n\n EH_SASL = 'kafkashaded.org.apache.kafka.common.security.plain.PlainLoginModule required username=\"$ConnectionString\" password=\"{0}\";'.format(password)\n\n config = {\n 'kafka.bootstrap.servers': 'dustin-demo-eh.servicebus.windows.net:9093',\n 'kafka.security.protocol': 'SASL_SSL',\n 'kafka.sasl.mechanism': 'PLAIN',\n 'kafka.group.id': GROUP_ID,\n 'kafka.request.timeout.ms': \"60000\",\n 'kafka.session.timeout.ms': \"20000\",\n 'kafka.heartbeat.interval.ms': \"10000\",\n 'kafka.sasl.jaas.config': EH_SASL,\n 'subscribe': topic\n }\n return config\n\n\ndef get_confluent_cloud_config():\n bootstrap_servers = dbutils.secrets.get(scope = \"demo\", key = 
\"confluent-cloud-brokers\")\n username = dbutils.secrets.get(scope = \"demo\", key = \"confluent-cloud-user\")\n password = dbutils.secrets.get(scope = \"demo\", key = \"confluent-cloud-password\")\n SASL = 'kafkashaded.org.apache.kafka.common.security.plain.PlainLoginModule required username=\"{0}\" password=\"{1}\";'.format(username, password)\n\n config = {\n 'kafka.bootstrap.servers': bootstrap_servers,\n 'kafka.security.protocol': 'SASL_SSL',\n 'kafka.sasl.mechanism': 'PLAIN',\n 'kafka.group.id': GROUP_ID,\n 'kafka.request.timeout.ms': \"60000\",\n 'kafka.session.timeout.ms': \"20000\",\n 'kafka.heartbeat.interval.ms': \"10000\",\n 'kafka.sasl.jaas.config': SASL,\n 'subscribe': topic\n }\n return config\n\n\nconsumer_config = get_confluent_cloud_config()\n \n# Read from Kafka, format will be a kafka record\ninput_df = spark.readStream.format(\"kafka\").options(**consumer_config).load()\n\n# Cast just the value as a string (instead of bytes) then use from_json to convert to an object matching the schema\njson_df = (\n input_df.select(\n from_json(col(\"value\").cast(\"string\"), usage_schema).alias(\"json\"),\n col(\"value\").cast(\"string\").alias(\"value_raw\")\n )\n)\n\n# Select all attribues from json as individual columns, cast trip_distance, add columns\ntransformed_df = (\n json_df\n .select(\"json.*\", \"value_raw\")\n)\n\n# display(transformed_df)\n"],"metadata":{"application/vnd.databricks.v1+cell":{"title":"","showTitle":false,"inputWidgets":{},"nuid":"6cb3f955-8a03-4bad-813f-00f260eba662"}},"outputs":[{"output_type":"display_data","metadata":{"application/vnd.databricks.v1+output":{"datasetInfos":[],"data":"","removedWidgets":[],"addedWidgets":{},"metadata":{},"type":"html","arguments":{}}},"output_type":"display_data","data":{"text/html":["\n"]}}],"execution_count":0},{"cell_type":"markdown","source":["## Azure Storage as a destination\n* One option for streaming output is to write directly to you data lake storage (Azure Data Lake Storage Gen 2 or standard Azure Blob Storage).\n* Databricks Delta / Delta Lake file format makes this more efficient, but could do with Parquet, Avro or other formats."],"metadata":{"application/vnd.databricks.v1+cell":{"title":"","showTitle":false,"inputWidgets":{},"nuid":"78ebdbd7-98d2-4b1c-b9de-b7e171203223"}}},{"cell_type":"code","source":["video_views_delta_path_2 = video_views_delta_path + \"_\" + run_version\n\n(\ntransformed_df.writeStream\n .queryName(\"StreamingVideoViewsDelta\")\n .format(\"delta\")\n .outputMode(\"append\")\n .trigger(processingTime=\"5 seconds\")\n .option(\"checkpointLocation\", f\"/delta/events/_checkpoints/streaming_video_views_{run_version}\")\n .start(video_views_delta_path_2)\n)"],"metadata":{"application/vnd.databricks.v1+cell":{"title":"","showTitle":false,"inputWidgets":{},"nuid":"555ac728-8e1e-4a1e-98b9-f0d188997bc1"}},"outputs":[{"output_type":"display_data","metadata":{"application/vnd.databricks.v1+output":{"datasetInfos":[],"data":"Out[8]: <pyspark.sql.streaming.StreamingQuery at 0x7f1e501f3b80>
","removedWidgets":[],"addedWidgets":{},"metadata":{},"type":"html","arguments":{}}},"output_type":"display_data","data":{"text/html":["\nOut[8]: <pyspark.sql.streaming.StreamingQuery at 0x7f1e501f3b80>
"]}}],"execution_count":0},{"cell_type":"code","source":["# Read data out of delta table\n# delta_stream_df = spark.readStream.format(\"delta\").load(video_views_delta_path_2)\n# display(delta_stream_df)"],"metadata":{"application/vnd.databricks.v1+cell":{"title":"","showTitle":false,"inputWidgets":{},"nuid":"432ab17f-e654-49e9-8353-e13885031b6a"}},"outputs":[{"output_type":"display_data","metadata":{"application/vnd.databricks.v1+output":{"datasetInfos":[],"data":"","removedWidgets":[],"addedWidgets":{},"metadata":{},"type":"html","arguments":{}}},"output_type":"display_data","data":{"text/html":["\n"]}}],"execution_count":0},{"cell_type":"markdown","source":["### Alternatively: Send transformed data to Event Hubs for next steps in pipeline"],"metadata":{"application/vnd.databricks.v1+cell":{"title":"","showTitle":false,"inputWidgets":{},"nuid":"e6ad06ad-cc0d-47ae-ad8c-0197ae317832"}}},{"cell_type":"code","source":["# topic2 = 'demo-message-transformed'\n\n# producer_config = consumer_config\n# producer_config.pop('subscribe')\n# producer_config['topic'] = topic2\n\n# kafka_output_df = trip_df.selectExpr(\n# \"CAST(VendorId as STRING) as key\",\n# \"to_json(struct(*)) as value\")\n\n# # display(kafka_output_df)\n# kafka_output_df.writeStream \\\n# .format(\"kafka\") \\\n# .options(**producer_config) \\\n# .option(\"checkpointLocation\", f\"/delta/events/_checkpoints/cp_{run_version}\") \\\n# .start()"],"metadata":{"application/vnd.databricks.v1+cell":{"title":"","showTitle":false,"inputWidgets":{},"nuid":"38ac40d7-7eb0-40ba-ac24-79f86a84fdd1"}},"outputs":[{"output_type":"display_data","metadata":{"application/vnd.databricks.v1+output":{"datasetInfos":[],"data":"","removedWidgets":[],"addedWidgets":{},"metadata":{},"type":"html","arguments":{}}},"output_type":"display_data","data":{"text/html":["\n"]}}],"execution_count":0}],"metadata":{"application/vnd.databricks.v1+notebook":{"notebookName":"structured_streaming_video_views","dashboards":[],"notebookMetadata":{"pythonIndentUnit":2},"language":"python","widgets":{},"notebookOrigID":872950405059611}},"nbformat":4,"nbformat_minor":0}
2 |
--------------------------------------------------------------------------------
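
structured_streaming_video_views.ipynb only consumes from the "video_usage" topic; it assumes something upstream is already producing JSON events that match usage_schema. The sketch below shows one way to publish test events with the kafka-python client. It is not part of the repository, and the broker address, credential placeholders, and choice of client library are assumptions; substitute your own Confluent Cloud or Event Hubs connection details.

# Hypothetical test producer (not in the repo): publish JSON events matching usage_schema
# to the "video_usage" topic so the streaming notebook has data to read.
import json
import random
import time
from datetime import datetime, timezone

from kafka import KafkaProducer  # pip install kafka-python

producer = KafkaProducer(
  bootstrap_servers="<broker-host>:9092",    # assumption: replace with your Kafka brokers
  security_protocol="SASL_SSL",
  sasl_mechanism="PLAIN",
  sasl_plain_username="<api-key>",           # assumption: Confluent Cloud API key, or "$ConnectionString" for Event Hubs
  sasl_plain_password="<api-secret>",        # assumption: API secret, or an Event Hub connection string
  value_serializer=lambda v: json.dumps(v).encode("utf-8"),
)

for usage_id in range(100):
  event = {
    "usageId": usage_id,
    "user": f"user_{random.randint(1, 20)}",
    "completed": random.random() > 0.5,
    "durationSeconds": random.randint(10, 600),
    "eventTimestamp": datetime.now(timezone.utc).strftime("%Y-%m-%d %H:%M:%S"),
  }
  producer.send("video_usage", value=event)
  time.sleep(0.5)

producer.flush()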