├── demo ├── uv_workflow │ ├── .python-version │ ├── src │ │ ├── uv_workflow │ │ │ ├── __init__.py │ │ │ └── main.py │ │ └── notebook.ipynb │ ├── .gitignore │ ├── uv.lock │ ├── tests │ │ └── main_test.py │ ├── README.md │ ├── pyproject.toml │ ├── databricks.yml │ └── resources │ │ └── uv_workflow.job.yml ├── pydantic_workflow │ ├── .python-version │ ├── src │ │ ├── pydantic_workflow │ │ │ ├── __init__.py │ │ │ ├── trip.py │ │ │ └── main.py │ │ └── notebook.ipynb │ ├── .gitignore │ ├── tests │ │ └── test_trip.py │ ├── resources │ │ └── pydantic_workflow.job.yml │ ├── README.md │ ├── databricks.yml │ └── pyproject.toml ├── delta-live-tables │ ├── input-data │ │ ├── 1.csv │ │ └── 2.csv │ ├── read_file.py │ ├── dlt_two.sql │ ├── main.tf │ ├── variables.tf │ ├── outputs.tf │ ├── dlt_one.sql │ ├── resources.tf │ ├── .gitignore │ ├── .terraform.lock.hcl │ ├── README.md │ └── my_streaming_table.sql ├── dlt-config-file.json ├── bestsellers │ ├── Book Ranks.sql │ ├── books.py │ └── pipeline_clone.py └── README.md ├── review_me ├── Databricks Runtime.py ├── dbfs-intro.py ├── DLT Step 1.sql ├── My Very First Notebook.py ├── Python example.py ├── CREATE TABLE IF NOT EXISTS jacek_laskowski.my_table.py └── python only please.py ├── Databricks SQL ├── Databricks SQL.py ├── Alerts.py ├── Dashboards.py ├── Agenda.py └── Queries.py ├── Databricks Asset Bundles ├── delta_live_tables_demo │ ├── src │ │ ├── delta_live_tables_demo │ │ │ ├── __init__.py │ │ │ └── main.py │ │ ├── notebook.ipynb │ │ └── dlt_pipeline.ipynb │ ├── pytest.ini │ ├── my_project │ │ ├── resources │ │ │ └── .gitkeep │ │ ├── .gitignore │ │ ├── scratch │ │ │ └── README.md │ │ ├── fixtures │ │ │ └── .gitkeep │ │ ├── README.md │ │ └── databricks.yml │ ├── scratch │ │ ├── README.md │ │ └── exploration.ipynb │ ├── resources │ │ ├── delta_live_tables_demo_pipeline.yml │ │ └── delta_live_tables_demo_job.yml │ ├── tests │ │ └── main_test.py │ ├── fixtures │ │ └── .gitkeep │ ├── requirements-dev.txt │ ├── setup.py │ ├── README.md │ └── databricks.yml ├── Tips and Tricks.py └── Job and Task Parameters.py ├── meetups ├── uv_workflow_job.png ├── poll-meetup-stationary.png ├── databricks_ml_model_versions.png ├── mlflow-pipeline-predictions.png ├── databricks_ml_registered_models.png ├── pydantic_workflow_main_task_python_wheel.png ├── Meetup_2025_01_09.sql ├── Meetup_2025_02_06.sql ├── README.md ├── Meetup_2025_01_30.sql └── MLflow on Databricks.py ├── Delta Live Tables ├── delta-live-tables-bundle │ ├── bronze_table.sql │ └── five_record_table.py ├── The Latest and Greatest meetup.sql ├── TODOs.py ├── Delta Live Tables Python.py ├── Deep Dive into DLTs.sql ├── DLT Lab.py ├── Pipeline settings.py ├── Full Refresh.sql ├── Materialization.sql ├── Agenda.sql ├── Expectations.sql ├── Storage location.sql └── Building Delta Live Tables pipelines with SQL.sql ├── Databricks Workflows ├── workflows-run-job-task.png ├── for_each_task_concurrency.png ├── for_each_task_demo │ ├── databricks.yml │ ├── src │ │ ├── Nested_Task.py │ │ └── Load_googlesheets_csv.py │ ├── resources │ │ └── for_each_task_job.yml │ └── README.md ├── Step 2. Transform.sql ├── Step 3. Build Aggregates.sql ├── 01 Conditional Workflows.py ├── Step 1. 
Load Raw Data.sql └── 02 Modular Orchestration with Run Job Task.py ├── Databricks Machine Learning ├── databricks-mlflow.png ├── databricks-mlflow-components.png ├── databricks-mlflow-model-tracking.png ├── databricks-machine-learning-lifecycle.png ├── Machine_Learning_Model_Deployment_course_completed.png └── databricks-machine-learning-model-deployment-lesson.png ├── .gitignore ├── Generative AI ├── dbdemos.py ├── Prompt Engineering.py ├── llm-rag-chatbot │ ├── _resources │ │ ├── README.py │ │ ├── NOTICE.py │ │ └── LICENSE.py │ ├── config.py │ └── 00-RAG-LLM-RAG-Introduction.py ├── AI Playground.py ├── SentenceTransformers.sql ├── Diffusion Models.sql ├── Foundation Models.py ├── Generative Pretrained Transformer.sql ├── Retrieval Augmented Generation.sql ├── Llama.py ├── Databricks Mosaic AI.sql ├── Model Serving.py └── 00 Generative AI.sql ├── Apache Spark ├── SparkSession across Python and Scala.py ├── Parameterized Queries.sql ├── Bucketing.sql ├── Parquet Connector.scala └── ANTI and SEMI joins in SQL and DataFrame API.py ├── workshops ├── Questions.py ├── Databricks Workshop Half-Day 5a.sql ├── Course Agenda 2 Days.py └── Databricks Workshop Day 3.sql ├── Development Tools └── Notebooks.py ├── README.md ├── PySpark ├── pyspark-jupyter-poetry │ ├── pyproject.toml │ ├── README.md │ └── install-pyspark.md └── PySpark.py ├── terraform ├── .terraform.lock.hcl ├── .gitignore └── pipeline.tf ├── Administration └── Databricks Administration.sql ├── Table-Valued Functions.sql ├── Delta Lake ├── Merge.sql ├── DESCRIBE HISTORY.sql ├── Delta Lake 3.1.0.sql ├── TRUNCATE TABLE in Delta Lake.sql └── Generated Columns.sql ├── Photon.py ├── Python └── Pyenv.py └── Data Visualization └── Data Visualization on Databricks.py /demo/uv_workflow/.python-version: -------------------------------------------------------------------------------- 1 | 3.10 2 | -------------------------------------------------------------------------------- /demo/pydantic_workflow/.python-version: -------------------------------------------------------------------------------- 1 | 3.12.3 2 | -------------------------------------------------------------------------------- /demo/delta-live-tables/input-data/1.csv: -------------------------------------------------------------------------------- 1 | id,name 2 | 0,zero 3 | 1,un -------------------------------------------------------------------------------- /demo/dlt-config-file.json: -------------------------------------------------------------------------------- 1 | { 2 | "project_name": "dlt_demo" 3 | } -------------------------------------------------------------------------------- /demo/uv_workflow/src/uv_workflow/__init__.py: -------------------------------------------------------------------------------- 1 | __version__ = "0.0.1" 2 | -------------------------------------------------------------------------------- /review_me/Databricks Runtime.py: -------------------------------------------------------------------------------- 1 | # Databricks notebook source 2 | 3 | -------------------------------------------------------------------------------- /demo/delta-live-tables/input-data/2.csv: -------------------------------------------------------------------------------- 1 | id,name 2 | 2,deux 3 | 3,trois 4 | 4,quatre -------------------------------------------------------------------------------- /demo/pydantic_workflow/src/pydantic_workflow/__init__.py: -------------------------------------------------------------------------------- 1 | __version__ = "0.0.1" 2 | 
-------------------------------------------------------------------------------- /Databricks SQL/Databricks SQL.py: -------------------------------------------------------------------------------- 1 | # Databricks notebook source 2 | # MAGIC %md # Databricks SQL 3 | -------------------------------------------------------------------------------- /Databricks SQL/Alerts.py: -------------------------------------------------------------------------------- 1 | # Databricks notebook source 2 | # MAGIC %md # Databricks SQL » Alerts 3 | -------------------------------------------------------------------------------- /Databricks SQL/Dashboards.py: -------------------------------------------------------------------------------- 1 | # Databricks notebook source 2 | # MAGIC %md # Databricks SQL » Dashboards 3 | -------------------------------------------------------------------------------- /Databricks Asset Bundles/delta_live_tables_demo/src/delta_live_tables_demo/__init__.py: -------------------------------------------------------------------------------- 1 | __version__ = "0.0.1" 2 | -------------------------------------------------------------------------------- /Databricks Asset Bundles/delta_live_tables_demo/pytest.ini: -------------------------------------------------------------------------------- 1 | [pytest] 2 | testpaths = tests 3 | pythonpath = src 4 | -------------------------------------------------------------------------------- /meetups/uv_workflow_job.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jaceklaskowski/learn-databricks/HEAD/meetups/uv_workflow_job.png -------------------------------------------------------------------------------- /Delta Live Tables/delta-live-tables-bundle/bronze_table.sql: -------------------------------------------------------------------------------- 1 | CREATE MATERIALIZED VIEW bronze_table 2 | AS SELECT * FROM range(5) -------------------------------------------------------------------------------- /meetups/poll-meetup-stationary.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jaceklaskowski/learn-databricks/HEAD/meetups/poll-meetup-stationary.png -------------------------------------------------------------------------------- /demo/uv_workflow/.gitignore: -------------------------------------------------------------------------------- 1 | .databricks/ 2 | build/ 3 | dist/ 4 | __pycache__/ 5 | *.egg-info 6 | .venv/ 7 | scratch/** 8 | !scratch/README.md 9 | -------------------------------------------------------------------------------- /meetups/databricks_ml_model_versions.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jaceklaskowski/learn-databricks/HEAD/meetups/databricks_ml_model_versions.png -------------------------------------------------------------------------------- /meetups/mlflow-pipeline-predictions.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jaceklaskowski/learn-databricks/HEAD/meetups/mlflow-pipeline-predictions.png -------------------------------------------------------------------------------- /demo/pydantic_workflow/.gitignore: -------------------------------------------------------------------------------- 1 | .databricks/ 2 | build/ 3 | dist/ 4 | __pycache__/ 5 | *.egg-info 6 | .venv/ 7 | scratch/** 8 | !scratch/README.md 9 | 
-------------------------------------------------------------------------------- /meetups/databricks_ml_registered_models.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jaceklaskowski/learn-databricks/HEAD/meetups/databricks_ml_registered_models.png -------------------------------------------------------------------------------- /Databricks Asset Bundles/delta_live_tables_demo/my_project/resources/.gitkeep: -------------------------------------------------------------------------------- 1 | This folder is reserved for Databricks Asset Bundles resource definitions. 2 | -------------------------------------------------------------------------------- /Databricks Workflows/workflows-run-job-task.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jaceklaskowski/learn-databricks/HEAD/Databricks Workflows/workflows-run-job-task.png -------------------------------------------------------------------------------- /demo/delta-live-tables/read_file.py: -------------------------------------------------------------------------------- 1 | # Databricks notebook source 2 | import dlt 3 | 4 | @dlt.table 5 | def five_records(): 6 | return spark.range(5) 7 | 8 | -------------------------------------------------------------------------------- /Databricks Machine Learning/databricks-mlflow.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jaceklaskowski/learn-databricks/HEAD/Databricks Machine Learning/databricks-mlflow.png -------------------------------------------------------------------------------- /Databricks Workflows/for_each_task_concurrency.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jaceklaskowski/learn-databricks/HEAD/Databricks Workflows/for_each_task_concurrency.png -------------------------------------------------------------------------------- /demo/uv_workflow/uv.lock: -------------------------------------------------------------------------------- 1 | version = 1 2 | requires-python = ">=3.10" 3 | 4 | [[package]] 5 | name = "uv-workflow" 6 | version = "1.0.0" 7 | source = { editable = "." 
} 8 | -------------------------------------------------------------------------------- /meetups/pydantic_workflow_main_task_python_wheel.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jaceklaskowski/learn-databricks/HEAD/meetups/pydantic_workflow_main_task_python_wheel.png -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .databricks/ 2 | build/ 3 | dist/ 4 | __pycache__/ 5 | *.egg-info 6 | .venv/ 7 | scratch/** 8 | !scratch/README.md 9 | .ipynb_checkpoints/ 10 | .vscode/ 11 | -------------------------------------------------------------------------------- /Databricks Machine Learning/databricks-mlflow-components.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jaceklaskowski/learn-databricks/HEAD/Databricks Machine Learning/databricks-mlflow-components.png -------------------------------------------------------------------------------- /demo/uv_workflow/tests/main_test.py: -------------------------------------------------------------------------------- 1 | from uv_workflow.main import get_taxis, get_spark 2 | 3 | 4 | def test_main(): 5 | taxis = get_taxis(get_spark()) 6 | assert taxis.count() > 5 7 | -------------------------------------------------------------------------------- /Databricks Machine Learning/databricks-mlflow-model-tracking.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jaceklaskowski/learn-databricks/HEAD/Databricks Machine Learning/databricks-mlflow-model-tracking.png -------------------------------------------------------------------------------- /Databricks Asset Bundles/delta_live_tables_demo/my_project/.gitignore: -------------------------------------------------------------------------------- 1 | 2 | .databricks/ 3 | build/ 4 | dist/ 5 | __pycache__/ 6 | *.egg-info 7 | .venv/ 8 | scratch/** 9 | !scratch/README.md 10 | -------------------------------------------------------------------------------- /Databricks Machine Learning/databricks-machine-learning-lifecycle.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jaceklaskowski/learn-databricks/HEAD/Databricks Machine Learning/databricks-machine-learning-lifecycle.png -------------------------------------------------------------------------------- /Generative AI/dbdemos.py: -------------------------------------------------------------------------------- 1 | # Databricks notebook source 2 | # MAGIC %pip install dbdemos 3 | 4 | # COMMAND ---------- 5 | 6 | import dbdemos 7 | dbdemos.install('llm-rag-chatbot', catalog='main', schema='rag_chatbot') 8 | -------------------------------------------------------------------------------- /Databricks Machine Learning/Machine_Learning_Model_Deployment_course_completed.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jaceklaskowski/learn-databricks/HEAD/Databricks Machine Learning/Machine_Learning_Model_Deployment_course_completed.png -------------------------------------------------------------------------------- /Databricks Machine Learning/databricks-machine-learning-model-deployment-lesson.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/jaceklaskowski/learn-databricks/HEAD/Databricks Machine Learning/databricks-machine-learning-model-deployment-lesson.png -------------------------------------------------------------------------------- /demo/delta-live-tables/dlt_two.sql: -------------------------------------------------------------------------------- 1 | -- Databricks notebook source 2 | CREATE OR REFRESH LIVE TABLE dlt_two 3 | COMMENT "live table dlt_two" 4 | AS 5 | SELECT * FROM live.dlt_one 6 | 7 | -- COMMAND ---------- 8 | 9 | 10 | -------------------------------------------------------------------------------- /Databricks Asset Bundles/delta_live_tables_demo/scratch/README.md: -------------------------------------------------------------------------------- 1 | # scratch 2 | 3 | This folder is reserved for personal, exploratory notebooks. 4 | By default these are not committed to Git, as 'scratch' is listed in .gitignore. 5 | -------------------------------------------------------------------------------- /Databricks Asset Bundles/delta_live_tables_demo/my_project/scratch/README.md: -------------------------------------------------------------------------------- 1 | # scratch 2 | 3 | This folder is reserved for personal, exploratory notebooks. 4 | By default these are not committed to Git, as 'scratch' is listed in .gitignore. 5 | -------------------------------------------------------------------------------- /demo/uv_workflow/README.md: -------------------------------------------------------------------------------- 1 | # uv_workflow 2 | 3 | The 'uv_workflow' project was generated by using the default-python template. 4 | 5 | > [!TIP] 6 | > 7 | > Use [Meetup_2025_01_09](../../meetups/Meetup_2025_01_09.sql) notebook for guidance. 8 | -------------------------------------------------------------------------------- /Apache Spark/SparkSession across Python and Scala.py: -------------------------------------------------------------------------------- 1 | # Databricks notebook source 2 | # MAGIC %scala 3 | # MAGIC 4 | # MAGIC spark.range(5).createTempView("kevin_view_scala") 5 | 6 | # COMMAND ---------- 7 | 8 | spark.table('kevin_view_scala') 9 | -------------------------------------------------------------------------------- /demo/delta-live-tables/main.tf: -------------------------------------------------------------------------------- 1 | terraform { 2 | required_providers { 3 | databricks = { 4 | source = "databricks/databricks" 5 | version = "1.13.0" 6 | } 7 | } 8 | required_version = ">= 1.4.0" 9 | } 10 | 11 | provider "databricks" {} 12 | -------------------------------------------------------------------------------- /demo/delta-live-tables/variables.tf: -------------------------------------------------------------------------------- 1 | variable "input_dir" { 2 | description = "The input directory for Auto Loader to load CSV files from" 3 | type = string 4 | nullable = false 5 | default = "/FileStore/jacek_laskowski/delta-live-tables-demo-input" 6 | } 7 | -------------------------------------------------------------------------------- /demo/bestsellers/Book Ranks.sql: -------------------------------------------------------------------------------- 1 | -- Databricks notebook source 2 | -- CTAS = Create Table As Select 3 | -- CTE = Common Table Expressions (WITH) 4 | CREATE LIVE TABLE book_ranks 5 | AS SELECT 6 | *, 7 | RANK() over ( 8 | PARTITION BY genre 9 | ORDER BY 10 | quantity DESC 11 | ) as book_rank 12 | FROM LIVE.books 13 | 
-------------------------------------------------------------------------------- /demo/pydantic_workflow/tests/test_trip.py: -------------------------------------------------------------------------------- 1 | from pydantic_workflow.trip import Trip 2 | 3 | import pytest 4 | 5 | 6 | def test_valid_trip(): 7 | Trip(id=10, pickup_zip=10103, dropoff_zip=10110) 8 | 9 | 10 | def test_invalid_trip(): 11 | with pytest.raises(ValueError): 12 | Trip(id=10, pickup_zip=10023, dropoff_zip=10023) -------------------------------------------------------------------------------- /Databricks Asset Bundles/delta_live_tables_demo/src/delta_live_tables_demo/main.py: -------------------------------------------------------------------------------- 1 | from pyspark.sql import SparkSession 2 | 3 | def get_taxis(): 4 | spark = SparkSession.builder.getOrCreate() 5 | return spark.read.table("samples.nyctaxi.trips") 6 | 7 | def main(): 8 | get_taxis().show(5) 9 | 10 | if __name__ == '__main__': 11 | main() 12 | -------------------------------------------------------------------------------- /demo/README.md: -------------------------------------------------------------------------------- 1 | # Demo 2 | 3 | You can find different demo in this directory: 4 | 5 | 1. [uv_workflow](./uv_workflow/) - Use [uv](https://docs.astral.sh/uv/) to manage a Python project that uses [Databricks Asset Bundles](https://docs.databricks.com/en/dev-tools/bundles/index.html) to manage a Databricks job (that uses the Python library). 6 | 1. _others_ 7 | -------------------------------------------------------------------------------- /Generative AI/Prompt Engineering.py: -------------------------------------------------------------------------------- 1 | # Databricks notebook source 2 | # MAGIC %md 3 | # MAGIC 4 | # MAGIC # Prompt Engineering for Generative AI 5 | 6 | # COMMAND ---------- 7 | 8 | # MAGIC %md 9 | # MAGIC 10 | # MAGIC ## Resources 11 | # MAGIC 12 | # MAGIC * [developers.google.com](https://developers.google.com/machine-learning/resources/prompt-eng) 13 | -------------------------------------------------------------------------------- /demo/uv_workflow/pyproject.toml: -------------------------------------------------------------------------------- 1 | [project] 2 | name = "uv-workflow" 3 | version = "1.0.0" 4 | description = "Add your description here" 5 | readme = "README.md" 6 | requires-python = ">=3.10" 7 | dependencies = [] 8 | 9 | # uv init --lib 10 | # Uses src directory for sources 11 | [build-system] 12 | requires = ["hatchling"] 13 | build-backend = "hatchling.build" -------------------------------------------------------------------------------- /workshops/Questions.py: -------------------------------------------------------------------------------- 1 | # Databricks notebook source 2 | # MAGIC %md # Questions 3 | # MAGIC 4 | # MAGIC Open questions to explore 5 | 6 | # COMMAND ---------- 7 | 8 | # MAGIC %md 9 | # MAGIC 10 | # MAGIC 1. Job Notification Templates 11 | # MAGIC 1. Auto Optimize 12 | # MAGIC 1. Auto Compact 13 | # MAGIC 1. 
Can `CommitHook`s help executing `OPTIMIZE` at write time 14 | -------------------------------------------------------------------------------- /demo/delta-live-tables/outputs.tf: -------------------------------------------------------------------------------- 1 | output "storage" { 2 | description = "Storage location" 3 | value = databricks_pipeline.this.storage 4 | } 5 | 6 | output "pipeline_id" { 7 | description = "Pipeline ID" 8 | value = databricks_pipeline.this.id 9 | } 10 | 11 | output "input_dir" { 12 | description = "Input directory to load CSV data from" 13 | value = var.input_dir 14 | } 15 | -------------------------------------------------------------------------------- /Development Tools/Notebooks.py: -------------------------------------------------------------------------------- 1 | # Databricks notebook source 2 | # MAGIC %md # Notebooks 3 | 4 | # COMMAND ---------- 5 | 6 | # MAGIC %md 7 | # MAGIC 8 | # MAGIC ## Command Palette 9 | # MAGIC 10 | # MAGIC * Read [Command palette](https://docs.databricks.com/en/notebooks/notebooks-code.html#command-palette) 11 | # MAGIC * Use `Cmd + Shift + P` on macOS (or `Ctrl + Shift + P` on Windows) 🔥 12 | -------------------------------------------------------------------------------- /Databricks SQL/Agenda.py: -------------------------------------------------------------------------------- 1 | # Databricks notebook source 2 | # MAGIC %md # Databricks SQL » Agenda 3 | 4 | # COMMAND ---------- 5 | 6 | # MAGIC %md 7 | # MAGIC 8 | # MAGIC | # | Module | 9 | # MAGIC | --- | --- | 10 | # MAGIC | 0 | [Introduction]($./Databricks SQL) | 11 | # MAGIC | 1 | [Queries]($./Queries) | 12 | # MAGIC | 2 | [Dashboards]($./Dashboards) | 13 | # MAGIC | 3 | [Alerts]($./Alerts) | 14 | -------------------------------------------------------------------------------- /demo/pydantic_workflow/resources/pydantic_workflow.job.yml: -------------------------------------------------------------------------------- 1 | resources: 2 | jobs: 3 | pydantic_workflow_job: 4 | name: pydantic_workflow_job 5 | tasks: 6 | - task_key: taxi_qc 7 | existing_cluster_id: 1128-165651-khbd6ndl 8 | notebook_task: 9 | notebook_path: ../src/taxi_qc.ipynb 10 | libraries: 11 | - whl: ../dist/*.whl 12 | -------------------------------------------------------------------------------- /Generative AI/llm-rag-chatbot/_resources/README.py: -------------------------------------------------------------------------------- 1 | # Databricks notebook source 2 | # MAGIC %md 3 | # MAGIC ## DBDemos asset 4 | # MAGIC 5 | # MAGIC The notebooks available under `_/resources` are technical resources. 6 | # MAGIC 7 | # MAGIC Do not edit these notebooks or try to run them directly. These notebooks will load data / run some setup. They are indirectly called from the main notebook (`%run ./_resources/.....`) 8 | -------------------------------------------------------------------------------- /Delta Live Tables/The Latest and Greatest meetup.sql: -------------------------------------------------------------------------------- 1 | -- Databricks notebook source 2 | -- MAGIC %md # The Latest and Greatest in Delta Live Tables 3 | 4 | -- COMMAND ---------- 5 | 6 | -- MAGIC %md 7 | -- MAGIC 8 | -- MAGIC 1. Graph (the default view) vs **List** 🔥 9 | -- MAGIC 1. **Flows** tab in the details of a live table 10 | -- MAGIC 1. 
[Validate update](https://docs.databricks.com/en/delta-live-tables/updates.html#validate-update) 11 | -------------------------------------------------------------------------------- /Delta Live Tables/delta-live-tables-bundle/five_record_table.py: -------------------------------------------------------------------------------- 1 | # Databricks notebook source 2 | # MAGIC %md 3 | # MAGIC 4 | # MAGIC Define a Delta Live Tables dataset 5 | # MAGIC 6 | # MAGIC It must return either a Spark or Koalas DataFrame. 7 | 8 | # COMMAND ---------- 9 | 10 | import dlt 11 | from pyspark.sql import DataFrame 12 | 13 | @dlt.table() 14 | def five_record_table() -> DataFrame: 15 | print('Hello world') 16 | return spark.range(5) 17 | -------------------------------------------------------------------------------- /Databricks Workflows/for_each_task_demo/databricks.yml: -------------------------------------------------------------------------------- 1 | # This is a Databricks asset bundle definition for for_each_task_demo. 2 | # See https://docs.databricks.com/dev-tools/bundles/index.html for documentation. 3 | bundle: 4 | name: for_each_task_demo 5 | 6 | include: 7 | - resources/for_each_task_job.yml 8 | 9 | variables: 10 | cluster_id: 11 | default: 0401-114149-an94vdde 12 | 13 | targets: 14 | dev: 15 | mode: development 16 | default: true 17 | -------------------------------------------------------------------------------- /Generative AI/AI Playground.py: -------------------------------------------------------------------------------- 1 | # Databricks notebook source 2 | # MAGIC %md 3 | # MAGIC # AI Playground 4 | 5 | # COMMAND ---------- 6 | 7 | # MAGIC %md 8 | # MAGIC 9 | # MAGIC [Chat with supported LLMs using AI Playground](https://docs.databricks.com/en/large-language-models/ai-playground.html): 10 | # MAGIC 11 | # MAGIC * Interact with supported LLMs 12 | # MAGIC * A chat-like environment to test, prompt, and compare LLMs 13 | # MAGIC * Available in your Databricks workspace 14 | -------------------------------------------------------------------------------- /demo/pydantic_workflow/README.md: -------------------------------------------------------------------------------- 1 | # pydantic_workflow 2 | 3 | The 'pydantic_workflow' project was generated by using the default-python template. 4 | 5 | ## Run 6 | 7 | ```bash 8 | databricks bundle deploy && \ 9 | databricks bundle run pydantic_workflow_job 10 | ``` 11 | 12 | ## Clean Up 13 | 14 | ```bash 15 | databricks bundle destroy --auto-approve 16 | ``` 17 | 18 | ## Learn More 19 | 20 | * [What are Databricks Asset Bundles?](https://docs.databricks.com/aws/en/dev-tools/bundles). 
21 | -------------------------------------------------------------------------------- /demo/delta-live-tables/dlt_one.sql: -------------------------------------------------------------------------------- 1 | -- Databricks notebook source 2 | --- Almost like CTAS 3 | CREATE OR REFRESH LIVE TABLE dlt_one 4 | ( 5 | id INTEGER COMMENT 'Identifier', 6 | auto_generated BIGINT GENERATED ALWAYS AS IDENTITY (START WITH 0 INCREMENT BY 1) COMMENT 'Auto-generated using GENERATED ALWAYS AS IDENTITY' 7 | ) 8 | COMMENT "live table dlt_one with ${jacek.pipeline.message}" 9 | AS 10 | SELECT 11 | INT((rand() * ID) * 100) AS id 12 | FROM VALUES 13 | (1), 14 | (2), 15 | (3) t(id) 16 | -------------------------------------------------------------------------------- /Databricks Asset Bundles/delta_live_tables_demo/resources/delta_live_tables_demo_pipeline.yml: -------------------------------------------------------------------------------- 1 | # The main pipeline for delta_live_tables_demo 2 | resources: 3 | pipelines: 4 | delta_live_tables_demo_pipeline: 5 | name: delta_live_tables_demo_pipeline 6 | target: delta_live_tables_demo_${bundle.environment} 7 | libraries: 8 | - notebook: 9 | path: ../src/dlt_pipeline.ipynb 10 | meetup_pipeline: 11 | name: dlt_pipeline_v${bundle.git.commit} 12 | libraries: 13 | - notebook: 14 | path: ../src/dlt_pipeline.ipynb 15 | -------------------------------------------------------------------------------- /Databricks Asset Bundles/delta_live_tables_demo/tests/main_test.py: -------------------------------------------------------------------------------- 1 | from databricks.connect import DatabricksSession 2 | from pyspark.sql import SparkSession 3 | from delta_live_tables_demo import main 4 | 5 | # Create a new Databricks Connect session. If this fails, 6 | # check that you have configured Databricks Connect correctly. 7 | # See https://docs.databricks.com/dev-tools/databricks-connect.html. 8 | 9 | SparkSession.builder = DatabricksSession.builder 10 | SparkSession.builder.getOrCreate() 11 | 12 | def test_main(): 13 | taxis = main.get_taxis() 14 | assert taxis.count() > 5 15 | -------------------------------------------------------------------------------- /Generative AI/SentenceTransformers.sql: -------------------------------------------------------------------------------- 1 | -- Databricks notebook source 2 | -- MAGIC %md # SentenceTransformers 3 | 4 | -- COMMAND ---------- 5 | 6 | -- MAGIC %md 7 | -- MAGIC 8 | -- MAGIC [SentenceTransformers](https://www.sbert.net/) 9 | -- MAGIC 10 | -- MAGIC Sentence, text and image embeddings 11 | -- MAGIC 12 | -- MAGIC Compute sentence / text embeddings for more than 100 languages. 13 | -- MAGIC 14 | -- MAGIC Embeddings can then be compared e.g. with cosine-similarity to find sentences with a similar meaning. This can be useful for semantic textual similar, semantic search, or paraphrase mining. 15 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Learn Databricks Lakehouse Platform 2 | 3 | This repository contains notebooks to learn [Databricks Lakehouse Platform](https://www.databricks.com/product/data-lakehouse), featuring but not limited to: 4 | 5 | 1. [Delta Live Tables](Delta%20Live%20Tables/) 6 | 1. [Workflow Jobs](Workflow%20Jobs/) 7 | 1. [Unity Catalog](Unity%20Catalog/) 8 | 9 | ## Databricks Workshop 10 | 11 | Loose notes about the topics of future Databricks workshops. 12 | 13 | 1. 
Developer Settings (under `/settings/user/developer/`) 14 | * review available options 15 | * what are the other URLs? (under `/settings/user/`) 16 | 17 | -------------------------------------------------------------------------------- /Databricks Workflows/for_each_task_demo/src/Nested_Task.py: -------------------------------------------------------------------------------- 1 | # Databricks notebook source 2 | # MAGIC %md 3 | # MAGIC 4 | # MAGIC # Nested Task 5 | 6 | # COMMAND ---------- 7 | 8 | # MAGIC %md 9 | # MAGIC 10 | # MAGIC This notebook is a **nested task** used as part of a For each task in the For each Task demo. 11 | # MAGIC 12 | # MAGIC 1. The nested task is the task to run for each iteration of the For each task. 13 | # MAGIC 1. Can be one of the standard Databricks Jobs task types. 14 | # MAGIC 1. Cannot be another For each task. 15 | 16 | # COMMAND ---------- 17 | 18 | single_csv_line = dbutils.widgets.get("single_csv_line") 19 | print(single_csv_line) 20 | -------------------------------------------------------------------------------- /Generative AI/Diffusion Models.sql: -------------------------------------------------------------------------------- 1 | -- Databricks notebook source 2 | -- MAGIC %md 3 | -- MAGIC 4 | -- MAGIC # Diffusion Models 5 | -- MAGIC 6 | -- MAGIC * [Wikipedia](https://en.wikipedia.org/wiki/Diffusion_model) 7 | -- MAGIC * Models like [DALL·E](https://openai.com/dall-e-3) to generate images (**image generation**) 8 | -- MAGIC * fine-tuned to create objects with specific desired properties 9 | -- MAGIC * training phase 10 | 11 | -- COMMAND ---------- 12 | 13 | -- MAGIC %md 14 | -- MAGIC 15 | -- MAGIC ## DALL-E 3 16 | -- MAGIC 17 | -- MAGIC * [Home page](https://openai.com/dall-e-3) 18 | -- MAGIC * [Research paper](https://cdn.openai.com/papers/dall-e-3.pdf) 19 | -------------------------------------------------------------------------------- /Delta Live Tables/TODOs.py: -------------------------------------------------------------------------------- 1 | # Databricks notebook source 2 | # MAGIC %md # Open Topics / TODOs 3 | # MAGIC 4 | # MAGIC The following is a list of things to explore in more details. 5 | 6 | # COMMAND ---------- 7 | 8 | # MAGIC %md 9 | # MAGIC 10 | # MAGIC 1. CDF Demo 11 | # MAGIC 2. STREAMING clause 12 | # MAGIC 3. Continuous execution pipeline mode 13 | 14 | # COMMAND ---------- 15 | 16 | # MAGIC %md 17 | # MAGIC 18 | # MAGIC 1. What's the diff between abfss vs adls? 19 | # MAGIC 1. Different uses of [libraries](https://docs.databricks.com/api/azure/workspace/pipelines/create) in a DLT pipeline (esp. 
`file` and `jar` JSON fields) 20 | 21 | # COMMAND ---------- 22 | 23 | 24 | -------------------------------------------------------------------------------- /PySpark/pyspark-jupyter-poetry/pyproject.toml: -------------------------------------------------------------------------------- 1 | [tool.poetry] 2 | name = "jupyter-spark" 3 | version = "0.1.0" 4 | description = "" 5 | authors = ["Jacek Laskowski "] 6 | readme = "README.md" 7 | 8 | [tool.poetry.dependencies] 9 | python = "^3.12" 10 | pyspark = { version = "^3.5.2", extras = [ "connect,sql,ml" ] } 11 | # Required for Python 3.12 12 | # Nothing to do with PySpark 13 | # https://stackoverflow.com/q/77233855/1305344 14 | setuptools = "^74.1.2" 15 | jupyterlab = "^4.2.5" 16 | faker = "^28.4.1" 17 | pandas = "^2.2.2" 18 | pyarrow = "^17.0.0" 19 | 20 | [build-system] 21 | requires = ["poetry-core"] 22 | build-backend = "poetry.core.masonry.api" 23 | -------------------------------------------------------------------------------- /demo/pydantic_workflow/src/pydantic_workflow/trip.py: -------------------------------------------------------------------------------- 1 | from typing_extensions import Self 2 | 3 | from pydantic import BaseModel, model_validator 4 | 5 | from datetime import datetime 6 | 7 | 8 | class Trip(BaseModel): 9 | tpep_pickup_datetime: datetime = datetime.now() 10 | tpep_dropoff_datetime: datetime = datetime.now() 11 | trip_distance: float = -1.0 12 | fare_amount: float = -1.0 13 | pickup_zip: int = -1 14 | dropoff_zip: int = -1 15 | 16 | @model_validator(mode='after') 17 | def enforce_different_zips(self) -> Self: 18 | if self.pickup_zip == self.dropoff_zip: 19 | raise ValueError('pickup_zip and dropoff_zip must be different') 20 | return self 21 | -------------------------------------------------------------------------------- /Generative AI/Foundation Models.py: -------------------------------------------------------------------------------- 1 | # Databricks notebook source 2 | # MAGIC %md 3 | # MAGIC # Foundation Models 4 | 5 | # COMMAND ---------- 6 | 7 | # MAGIC %md 8 | # MAGIC 9 | # MAGIC ## Databricks Foundation Model APIs 10 | # MAGIC 11 | # MAGIC [Databricks Foundation Model APIs](https://docs.databricks.com/en/machine-learning/foundation-models/index.html): 12 | # MAGIC 13 | # MAGIC * Includes requirements for use, supported models, and limitations 14 | 15 | # COMMAND ---------- 16 | 17 | # MAGIC %md 18 | # MAGIC 19 | # MAGIC [Get started querying LLMs on Databricks](https://docs.databricks.com/en/large-language-models/llm-serving-intro.html): 20 | # MAGIC 21 | # MAGIC * Use Foundation Model APIs to serve and query LLMs on Databricks 22 | -------------------------------------------------------------------------------- /demo/uv_workflow/src/uv_workflow/main.py: -------------------------------------------------------------------------------- 1 | from pyspark.sql import SparkSession, DataFrame 2 | 3 | def get_taxis(spark: SparkSession) -> DataFrame: 4 | return spark.read.table("samples.nyctaxi.trips") 5 | 6 | 7 | # Create a new Databricks Connect session. If this fails, 8 | # check that you have configured Databricks Connect correctly. 9 | # See https://docs.databricks.com/dev-tools/databricks-connect.html. 
10 | def get_spark() -> SparkSession: 11 | try: 12 | from databricks.connect import DatabricksSession 13 | return DatabricksSession.builder.getOrCreate() 14 | except ImportError: 15 | return SparkSession.builder.getOrCreate() 16 | 17 | def main(): 18 | get_taxis(get_spark()).show(5) 19 | 20 | if __name__ == '__main__': 21 | main() 22 | -------------------------------------------------------------------------------- /review_me/dbfs-intro.py: -------------------------------------------------------------------------------- 1 | # Databricks notebook source 2 | # MAGIC %py 3 | # MAGIC 4 | # MAGIC # this is NOT a Spark code (pyspark) 5 | # MAGIC # We're about to load people dataset 6 | # MAGIC spark.read.format('csv').load('/people.csv') 7 | 8 | # COMMAND ---------- 9 | 10 | # MAGIC %fs ls dbfs:/FileStore/books.csv 11 | 12 | # COMMAND ---------- 13 | 14 | # MAGIC %md ## cat 15 | 16 | # COMMAND ---------- 17 | 18 | # MAGIC %fs cat dbfs:/FileStore/books.csv 19 | 20 | # COMMAND ---------- 21 | 22 | dbutils.fs.head('dbfs:/FileStore/books.csv') 23 | 24 | # COMMAND ---------- 25 | 26 | # MAGIC %fs head dbfs:/FileStore/books.csv 27 | 28 | # COMMAND ---------- 29 | 30 | dbutils.help() 31 | 32 | # COMMAND ---------- 33 | 34 | # MAGIC %sh ls /dbfs/FileStore 35 | -------------------------------------------------------------------------------- /terraform/.terraform.lock.hcl: -------------------------------------------------------------------------------- 1 | # This file is maintained automatically by "terraform init". 2 | # Manual edits may be lost in future updates. 3 | 4 | provider "registry.terraform.io/databricks/databricks" { 5 | version = "1.19.0" 6 | hashes = [ 7 | "h1:uk8gR88qcyVvkvDoXTHkTnT8g+S7QgvV3w1H7osVLMU=", 8 | "zh:1e2bbfd4af2cf0369a51baea67d5884e87f180ea56542aa70a470022bffed7f9", 9 | "zh:4057f818461060bb85bf2282d88a0caccc16bee89f8da471743282bfd9dffa6e", 10 | "zh:50c24b1c3744861262d23112b8a8f74902b43c8c406f79cd3ada84452daaeb79", 11 | "zh:91047db0b13cb849424eeb5050c86510005569c38df52b9794363cc82f18d407", 12 | "zh:9d22bbc77f3b790d3e4732406a1982bd0654debdabd317f81dbe38a860fc174d", 13 | "zh:d4587098bc487ea2c4161a4b7aee94b7bf8df2cf50b6ffd44047a0b4663064bf", 14 | ] 15 | } 16 | -------------------------------------------------------------------------------- /demo/pydantic_workflow/src/pydantic_workflow/main.py: -------------------------------------------------------------------------------- 1 | from pyspark.sql import SparkSession, DataFrame 2 | 3 | 4 | def get_taxis(spark: SparkSession) -> DataFrame: 5 | return spark.read.table("samples.nyctaxi.trips") 6 | 7 | 8 | # Create a new Databricks Connect session. If this fails, 9 | # check that you have configured Databricks Connect correctly. 10 | # See https://docs.databricks.com/dev-tools/databricks-connect.html. 11 | def get_spark() -> SparkSession: 12 | try: 13 | from databricks.connect import DatabricksSession 14 | 15 | return DatabricksSession.builder.getOrCreate() 16 | except ImportError: 17 | return SparkSession.builder.getOrCreate() 18 | 19 | 20 | def main(): 21 | get_taxis(get_spark()).show(5) 22 | 23 | 24 | if __name__ == "__main__": 25 | main() 26 | -------------------------------------------------------------------------------- /demo/uv_workflow/databricks.yml: -------------------------------------------------------------------------------- 1 | # This is a Databricks asset bundle definition for uv_workflow. 2 | # See https://docs.databricks.com/dev-tools/bundles/index.html for documentation. 
3 | bundle: 4 | name: uv_workflow 5 | 6 | include: 7 | - resources/*.yml 8 | 9 | workspace: 10 | host: https://curriculum-dev.cloud.databricks.com 11 | 12 | artifacts: 13 | uv_built_wheel: 14 | type: whl 15 | build: uv build --wheel 16 | path: . 17 | 18 | targets: 19 | dev: 20 | # The default target uses 'mode: development' to create a development copy. 21 | # - Deployed resources get prefixed with '[dev my_user_name]' 22 | # - Any job schedules and triggers are paused by default. 23 | # See also https://docs.databricks.com/dev-tools/bundles/deployment-modes.html. 24 | mode: development 25 | default: true 26 | -------------------------------------------------------------------------------- /Databricks Asset Bundles/delta_live_tables_demo/fixtures/.gitkeep: -------------------------------------------------------------------------------- 1 | # Fixtures 2 | 3 | This folder is reserved for fixtures, such as CSV files. 4 | 5 | Below is an example of how to load fixtures as a data frame: 6 | 7 | ``` 8 | import pandas as pd 9 | import os 10 | 11 | def get_absolute_path(*relative_parts): 12 | if 'dbutils' in globals(): 13 | base_dir = os.path.dirname(dbutils.notebook.entry_point.getDbutils().notebook().getContext().notebookPath().get()) # type: ignore 14 | path = os.path.normpath(os.path.join(base_dir, *relative_parts)) 15 | return path if path.startswith("/Workspace") else "/Workspace" + path 16 | else: 17 | return os.path.join(*relative_parts) 18 | 19 | csv_file = get_absolute_path("..", "fixtures", "mycsv.csv") 20 | df = pd.read_csv(csv_file) 21 | display(df) 22 | ``` 23 | -------------------------------------------------------------------------------- /demo/pydantic_workflow/databricks.yml: -------------------------------------------------------------------------------- 1 | # This is a Databricks asset bundle definition for pydantic_workflow. 2 | # See https://docs.databricks.com/dev-tools/bundles/index.html for documentation. 3 | bundle: 4 | name: pydantic_workflow 5 | 6 | include: 7 | - resources/*.yml 8 | 9 | artifacts: 10 | pydantic_workflow_wheel: 11 | type: whl 12 | build: uv build --wheel 13 | path: . 14 | 15 | workspace: 16 | host: https://curriculum-dev.cloud.databricks.com 17 | 18 | targets: 19 | dev: 20 | # The default target uses 'mode: development' to create a development copy. 21 | # - Deployed resources get prefixed with '[dev my_user_name]' 22 | # - Any job schedules and triggers are paused by default. 23 | # See also https://docs.databricks.com/dev-tools/bundles/deployment-modes.html. 24 | mode: development 25 | -------------------------------------------------------------------------------- /Databricks Asset Bundles/delta_live_tables_demo/my_project/fixtures/.gitkeep: -------------------------------------------------------------------------------- 1 | # Fixtures 2 | 3 | This folder is reserved for fixtures, such as CSV files. 
4 | 5 | Below is an example of how to load fixtures as a data frame: 6 | 7 | ``` 8 | import pandas as pd 9 | import os 10 | 11 | def get_absolute_path(*relative_parts): 12 | if 'dbutils' in globals(): 13 | base_dir = os.path.dirname(dbutils.notebook.entry_point.getDbutils().notebook().getContext().notebookPath().get()) # type: ignore 14 | path = os.path.normpath(os.path.join(base_dir, *relative_parts)) 15 | return path if path.startswith("/Workspace") else "/Workspace" + path 16 | else: 17 | return os.path.join(*relative_parts) 18 | 19 | csv_file = get_absolute_path("..", "fixtures", "mycsv.csv") 20 | df = pd.read_csv(csv_file) 21 | display(df) 22 | ``` 23 | -------------------------------------------------------------------------------- /Generative AI/Generative Pretrained Transformer.sql: -------------------------------------------------------------------------------- 1 | -- Databricks notebook source 2 | -- MAGIC %md # Generative Pretrained Transformer (GPT) 3 | 4 | -- COMMAND ---------- 5 | 6 | -- MAGIC %md ## GPT-4 7 | 8 | -- COMMAND ---------- 9 | 10 | -- MAGIC %md 11 | -- MAGIC 12 | -- MAGIC [GPT-4 API general availability and deprecation of older models in the Completions API](https://openai.com/blog/gpt-4-api-general-availability): 13 | -- MAGIC 14 | -- MAGIC * GPT-4 API is now available to all paying OpenAI API customers 15 | -- MAGIC * GPT-3.5 Turbo, DALL·E and Whisper APIs are also GA 16 | -- MAGIC * a deprecation plan for older models of the Completions API 17 | -- MAGIC 18 | -- MAGIC > **Note** 19 | -- MAGIC > 20 | -- MAGIC > The OpenAI API does not provide an isolated environment therefore is likely not suitable for enterprise production applications. 21 | -- MAGIC 22 | -------------------------------------------------------------------------------- /demo/bestsellers/books.py: -------------------------------------------------------------------------------- 1 | # Databricks notebook source 2 | # MAGIC %md # Exercise: Finding 1st and 2nd Bestsellers Per Genre 3 | 4 | # COMMAND ---------- 5 | 6 | # MAGIC %md 7 | # MAGIC 8 | # MAGIC This is a DLT pipeline for [Exercise: Finding 1st and 2nd Bestsellers Per Genre](https://jaceklaskowski.github.io/spark-workshop/exercises/sql/Finding-1st-and-2nd-Bestsellers-Per-Genre.html). 9 | 10 | # COMMAND ---------- 11 | 12 | # MAGIC %md 13 | # MAGIC 14 | # MAGIC 1. 
Create a live table (with a raw data) = `books` table 15 | 16 | # COMMAND ---------- 17 | 18 | source_path = 'dbfs:/FileStore/books.csv' 19 | 20 | # COMMAND ---------- 21 | 22 | import dlt 23 | from pyspark.sql import DataFrame 24 | 25 | @dlt.table 26 | def books() -> DataFrame: 27 | return spark \ 28 | .read \ 29 | .option('header', True) \ 30 | .option('inferSchema', True) \ 31 | .csv(source_path) 32 | -------------------------------------------------------------------------------- /Databricks Workflows/for_each_task_demo/resources/for_each_task_job.yml: -------------------------------------------------------------------------------- 1 | resources: 2 | jobs: 3 | for_each_task_demo_job: 4 | name: For Each Task Demo Job 5 | tasks: 6 | - task_key: Load_googlesheets_csv 7 | notebook_task: 8 | notebook_path: ../src/Load_googlesheets_csv.py 9 | existing_cluster_id: ${var.cluster_id} 10 | - task_key: for_each_task 11 | depends_on: 12 | - task_key: Load_googlesheets_csv 13 | for_each_task: 14 | inputs: "{{tasks.Load_googlesheets_csv.values.gsheets}}" 15 | concurrency: 100 16 | task: 17 | task_key: for_each_task_iteration 18 | notebook_task: 19 | notebook_path: ../src/Nested_Task.py 20 | base_parameters: 21 | single_csv_line: "{{input}}" 22 | existing_cluster_id: ${var.cluster_id} 23 | -------------------------------------------------------------------------------- /PySpark/pyspark-jupyter-poetry/README.md: -------------------------------------------------------------------------------- 1 | # PySpark with Jupyter 2 | 3 | This project shows how to run Apache Spark (PySpark) with Jupyter. 4 | 5 | Run PySpark as follows: 6 | 7 | ```bash 8 | poetry run pyspark 9 | ``` 10 | 11 | Start Spark Connect server. 12 | 13 | ```bash 14 | ./sbin/start-connect-server.sh 15 | ``` 16 | 17 | Review the logs of this Spark Connect server. 18 | 19 | ```bash 20 | tail -f /Users/jacek/dev/oss/spark/logs/spark-jacek-org.apache.spark.sql.connect.service.SparkConnectServer-1-Jaceks-Mac-mini.local.out 21 | ``` 22 | 23 | At the end of the logs, you should see the following INFO message that says the URL of this Spark Connect instance. 24 | 25 | ```text 26 | ... 
27 | [main] INFO org.apache.spark.sql.connect.service.SparkConnectServer:60 - Spark Connect server started at: 0:0:0:0:0:0:0:0:15002 28 | ``` 29 | 30 | Run PySpark within Jupyter as follows: 31 | 32 | ```bash 33 | poetry run jupyter lab 34 | ``` 35 | -------------------------------------------------------------------------------- /Administration/Databricks Administration.sql: -------------------------------------------------------------------------------- 1 | -- Databricks notebook source 2 | -- MAGIC %md # Databricks Administration 3 | -- MAGIC 4 | -- MAGIC Commands and tricks to manage Databricks clusters 5 | 6 | -- COMMAND ---------- 7 | 8 | -- MAGIC %scala 9 | -- MAGIC 10 | -- MAGIC println(s""" 11 | -- MAGIC |Spark version: ${sc.version} 12 | -- MAGIC |runtime_commit: ${org.apache.spark.BuildInfo.gitHash} 13 | -- MAGIC |universe_commit: ${com.databricks.BuildInfo.gitHash} 14 | -- MAGIC """.stripMargin) 15 | 16 | -- COMMAND ---------- 17 | 18 | -- MAGIC %py 19 | -- MAGIC 20 | -- MAGIC import os 21 | -- MAGIC os.environ 22 | 23 | -- COMMAND ---------- 24 | 25 | -- MAGIC %sh ls /databricks/spark 26 | 27 | -- COMMAND ---------- 28 | 29 | -- MAGIC %sh cat /databricks/spark/VERSION 30 | 31 | -- COMMAND ---------- 32 | 33 | -- MAGIC %sh ls /databricks/spark/conf 34 | 35 | -- COMMAND ---------- 36 | 37 | -- MAGIC %sh cat /databricks/spark/conf/spark-env.sh 38 | -------------------------------------------------------------------------------- /demo/delta-live-tables/resources.tf: -------------------------------------------------------------------------------- 1 | # https://registry.terraform.io/providers/databricks/databricks/latest/docs/resources/dbfs_file 2 | # Deploy a single file for the demo to have something to consume 3 | resource "databricks_dbfs_file" "this" { 4 | source = "${path.module}/input-data/1.csv" 5 | path = "${var.input_dir}/1.csv" 6 | } 7 | 8 | # https://registry.terraform.io/providers/databricks/databricks/latest/docs/resources/repo 9 | resource "databricks_repo" "learn_databricks" { 10 | path = "/Repos/jacek@japila.pl/delta-live-tables-demo" 11 | url = "https://github.com/jaceklaskowski/learn-databricks" 12 | } 13 | 14 | resource "databricks_pipeline" "this" { 15 | name = "EXPECT Clause Demo" 16 | development = true 17 | library { 18 | notebook { 19 | path = "${databricks_repo.learn_databricks.path}/demo/delta-live-tables/my_streaming_table" 20 | } 21 | } 22 | configuration = { 23 | cloud_files_input_path = var.input_dir 24 | } 25 | } 26 | -------------------------------------------------------------------------------- /demo/pydantic_workflow/pyproject.toml: -------------------------------------------------------------------------------- 1 | [project] 2 | name = "pydantic-workflow" 3 | version = "0.1.0" 4 | description = "Add your description here" 5 | readme = "README.md" 6 | # Databricks Runtime 16.2 7 | # https://docs.databricks.com/aws/en/release-notes/runtime/16.2#system-environment 8 | requires-python = "==3.12.3" 9 | dependencies = [ 10 | "pydantic>=2.10.6", 11 | ] 12 | 13 | # A command definition (entry point) 14 | # uv run pydantic_workflow 15 | # https://docs.astral.sh/uv/concepts/projects/init/#packaged-applications 16 | [project.scripts] 17 | pydantic_workflow = "pydantic_workflow.main:main" 18 | 19 | # uv run --dev ... 
20 | # https://docs.astral.sh/uv/concepts/projects/dependencies/#dependency-groups 21 | [dependency-groups] 22 | dev = [ 23 | "pyspark>=3.5.4", 24 | "pytest>=8.3.4", 25 | ] 26 | 27 | # https://docs.astral.sh/uv/concepts/projects/init/#packaged-applications 28 | [build-system] 29 | requires = ["hatchling"] 30 | build-backend = "hatchling.build" -------------------------------------------------------------------------------- /terraform/.gitignore: -------------------------------------------------------------------------------- 1 | # Local .terraform directories 2 | **/.terraform/* 3 | 4 | # .tfstate files 5 | *.tfstate 6 | *.tfstate.* 7 | 8 | # Crash log files 9 | crash.log 10 | crash.*.log 11 | 12 | # Exclude all .tfvars files, which are likely to contain sensitive data, such as 13 | # password, private keys, and other secrets. These should not be part of version 14 | # control as they are data points which are potentially sensitive and subject 15 | # to change depending on the environment. 16 | *.tfvars 17 | *.tfvars.json 18 | 19 | # Ignore override files as they are usually used to override resources locally and so 20 | # are not checked in 21 | override.tf 22 | override.tf.json 23 | *_override.tf 24 | *_override.tf.json 25 | 26 | # Include override files you do wish to add to version control using negated pattern 27 | # !example_override.tf 28 | 29 | # Include tfplan files to ignore the plan output of command: terraform plan -out=tfplan 30 | # example: *tfplan* 31 | 32 | # Ignore CLI configuration files 33 | .terraformrc 34 | terraform.rc 35 | -------------------------------------------------------------------------------- /Databricks Asset Bundles/Tips and Tricks.py: -------------------------------------------------------------------------------- 1 | # Databricks notebook source 2 | # MAGIC %md 3 | # MAGIC 4 | # MAGIC # Tips and Tricks 5 | 6 | # COMMAND ---------- 7 | 8 | # MAGIC %md 9 | # MAGIC 10 | # MAGIC ## List Resources 11 | # MAGIC 12 | # MAGIC There's no command line option to list the resources managed in a DAB project. 13 | # MAGIC 14 | # MAGIC Use [jq](https://jqlang.github.io/jq/) and [keys](https://jqlang.github.io/jq/manual/#keys-keys_unsorted). 15 | # MAGIC 16 | # MAGIC [How to get key names from JSON using jq](https://stackoverflow.com/q/23118341/1305344) 17 | 18 | # COMMAND ---------- 19 | 20 | # MAGIC %md 21 | # MAGIC 22 | # MAGIC ``` console 23 | # MAGIC $ databricks bundle validate --output json | jq '.resources | keys' 24 | # MAGIC [ 25 | # MAGIC "jobs" 26 | # MAGIC ] 27 | # MAGIC ``` 28 | 29 | # COMMAND ---------- 30 | 31 | # MAGIC %md 32 | # MAGIC 33 | # MAGIC ``` console 34 | # MAGIC $ databricks bundle validate --output json | jq '.resources.jobs | keys' 35 | # MAGIC [ 36 | # MAGIC "my_job" 37 | # MAGIC ] 38 | # MAGIC ``` 39 | -------------------------------------------------------------------------------- /demo/delta-live-tables/.gitignore: -------------------------------------------------------------------------------- 1 | # Local .terraform directories 2 | **/.terraform/* 3 | 4 | # .tfstate files 5 | *.tfstate 6 | *.tfstate.* 7 | 8 | # Crash log files 9 | crash.log 10 | crash.*.log 11 | 12 | # Exclude all .tfvars files, which are likely to contain sensitive data, such as 13 | # password, private keys, and other secrets. These should not be part of version 14 | # control as they are data points which are potentially sensitive and subject 15 | # to change depending on the environment. 
16 | *.tfvars 17 | *.tfvars.json 18 | 19 | # Ignore override files as they are usually used to override resources locally and so 20 | # are not checked in 21 | override.tf 22 | override.tf.json 23 | *_override.tf 24 | *_override.tf.json 25 | 26 | # Include override files you do wish to add to version control using negated pattern 27 | # !example_override.tf 28 | 29 | # Include tfplan files to ignore the plan output of command: terraform plan -out=tfplan 30 | # example: *tfplan* 31 | 32 | # Ignore CLI configuration files 33 | .terraformrc 34 | terraform.rc 35 | -------------------------------------------------------------------------------- /Databricks Asset Bundles/delta_live_tables_demo/requirements-dev.txt: -------------------------------------------------------------------------------- 1 | ## requirements-dev.txt: dependencies for local development. 2 | ## 3 | ## For defining dependencies used by jobs in Databricks Workflows, see 4 | ## https://docs.databricks.com/dev-tools/bundles/library-dependencies.html 5 | 6 | ## pytest is the default package used for testing 7 | pytest 8 | 9 | ## databricks-connect can be used to run parts of this project locally. 10 | ## See https://docs.databricks.com/dev-tools/databricks-connect.html. 11 | ## 12 | ## databricks-connect is automatically installed if you're using Databricks 13 | ## extension for Visual Studio Code 14 | ## (https://docs.databricks.com/dev-tools/vscode-ext/dev-tasks/databricks-connect.html). 15 | ## 16 | ## To manually install databricks-connect, either follow the instructions 17 | ## at https://docs.databricks.com/dev-tools/databricks-connect.html 18 | ## to install the package system-wide. Or uncomment the line below to install a 19 | ## version of db-connect that corresponds to the Databricks Runtime version used 20 | ## for this project. 21 | # 22 | # databricks-connect>=13.3,<13.4 23 | -------------------------------------------------------------------------------- /Table-Valued Functions.sql: -------------------------------------------------------------------------------- 1 | -- Databricks notebook source 2 | -- MAGIC %md # Table-Valued Functions 3 | -- MAGIC 4 | -- MAGIC [The Internals of Spark SQL](https://books.japila.pl/spark-sql-internals/table-valued-functions/) 5 | 6 | -- COMMAND ---------- 7 | 8 | -- MAGIC %python 9 | -- MAGIC 10 | -- MAGIC import os 11 | -- MAGIC print('DATABRICKS_RUNTIME_VERSION:', os.environ.get('DATABRICKS_RUNTIME_VERSION', '(undefined)')) 12 | 13 | -- COMMAND ---------- 14 | 15 | -- MAGIC %scala 16 | -- MAGIC 17 | -- MAGIC import org.apache.spark.sql.catalyst.analysis.TableFunctionRegistry 18 | -- MAGIC display(TableFunctionRegistry.builtin.listFunction.map(_.funcName).sorted.toDF("Table-Valued Function")) 19 | 20 | -- COMMAND ---------- 21 | 22 | -- MAGIC %fs mkdirs /tmp/jacek-laskowski 23 | 24 | -- COMMAND ---------- 25 | 26 | select * from read_files("/tmp/jacek-laskowski") 27 | 28 | -- COMMAND ---------- 29 | 30 | -- MAGIC %md 31 | -- MAGIC 32 | -- MAGIC ## Databricks TVFs 33 | -- MAGIC 34 | -- MAGIC [Alphabetical list of built-in functions](https://docs.databricks.com/en/sql/language-manual/sql-ref-functions-builtin-alpha.html) 35 | -------------------------------------------------------------------------------- /demo/delta-live-tables/.terraform.lock.hcl: -------------------------------------------------------------------------------- 1 | # This file is maintained automatically by "terraform init". 2 | # Manual edits may be lost in future updates. 
3 | 4 | provider "registry.terraform.io/databricks/databricks" { 5 | version = "1.13.0" 6 | constraints = "1.13.0" 7 | hashes = [ 8 | "h1:ga1T48DAVVP1ifvG2261xDYZ9d6zBlJc8OlkaTCHlyU=", 9 | "zh:1f043e67b45749a0c58638b62b5859e367931e72f3b91c8e12151fe6222ab672", 10 | "zh:32db31687e501c55412c97b998bfd853f21ef302a2b562abefc9bc2a3bbdabf2", 11 | "zh:7d092ac4a1e079c482ecbe541518dff756adcf7f80cac9f7e8152ee93673bcae", 12 | "zh:90a1700cbe727597c2e1fc7f27b1e41531a16dbbfad3fd7e1a31de270b4e80d5", 13 | "zh:9338406075c45eb732a55927479c114759b6fb24a819e8c744317b57ec04af41", 14 | "zh:b17faafdcc8e69d890037683538eb3a8d92279efd2bc1c9a4b3e1351981218f5", 15 | "zh:e0c603a36facc6724ef512a946e5793fb2e021bbecc68ae7993a4f5f6bcd4d92", 16 | "zh:ec5b3ab15be55e1da39c04afe02ab6b9b37f0c9135284c6df22358dc7d253316", 17 | "zh:ef236b53004957e6f3ef19892ce046d90cddc260cbf54acc14ea736d7f365f61", 18 | "zh:f841ea8f17d65dd8a51d6ff7728f17ccb2aaa04e43e6baa65235cc8960b77a4f", 19 | ] 20 | } 21 | -------------------------------------------------------------------------------- /review_me/DLT Step 1.sql: -------------------------------------------------------------------------------- 1 | -- Databricks notebook source 2 | CREATE LIVE TABLE five_row_table 3 | AS SELECT * FROM range(0, 5) 4 | 5 | -- COMMAND ---------- 6 | 7 | CREATE LIVE TABLE all_rows_multiplied_by_5 8 | AS SELECT id * 5 id FROM live.five_row_table 9 | 10 | -- COMMAND ---------- 11 | 12 | -- MAGIC %py 13 | -- MAGIC 14 | -- MAGIC # A regular PySpark data loading pattern 15 | -- MAGIC # dataframe = spark.read.format('csv').option('header', True).load('dbfs:/FileStore/books.csv') 16 | -- MAGIC # display(dataframe) 17 | -- MAGIC 18 | -- MAGIC # What am I supposed to do with the two below 19 | -- MAGIC # to create a DLT live table in Python? 20 | -- MAGIC 21 | -- MAGIC # @dlt.table Decorator 22 | -- MAGIC # The Python table and view functions must return a DataFrame 23 | -- MAGIC 24 | -- MAGIC from pyspark.sql import DataFrame 25 | -- MAGIC import dlt 26 | -- MAGIC 27 | -- MAGIC # decorators beg for methods 28 | -- MAGIC 29 | -- MAGIC # A DLT data loading pattern 30 | -- MAGIC 31 | -- MAGIC @dlt.table 32 | -- MAGIC def python_in_sql() -> DataFrame: 33 | -- MAGIC return spark.read.format('csv').option('header', True).load('dbfs:/FileStore/books.csv') 34 | -------------------------------------------------------------------------------- /Databricks Workflows/Step 2. Transform.sql: -------------------------------------------------------------------------------- 1 | -- Databricks notebook source 2 | -- MAGIC %md # Transform 3 | 4 | -- COMMAND ---------- 5 | 6 | -- MAGIC %python 7 | -- MAGIC 8 | -- MAGIC # Creates a text input widget with a given name and default value. 
9 | -- MAGIC # Notebook Widgets are only for Run all (when executed outside a job) 10 | -- MAGIC dbutils.widgets.removeAll() 11 | -- MAGIC dbutils.widgets.text(name = "database_name", defaultValue = "jaceklaskowski", label = "Database Name") 12 | -- MAGIC dbutils.widgets.text(name = "raw_table_name", defaultValue = "workflows_raw_data", label = "Raw Table Name") 13 | -- MAGIC dbutils.widgets.text(name = "silver_table_name", defaultValue = "workflows_transform", label = "Silver Table Name") 14 | -- MAGIC dbutils.widgets.text(name = "gold_table_name", defaultValue = "workflows_aggregates", label = "Gold Table Name") 15 | 16 | -- COMMAND ---------- 17 | 18 | USE ${database_name} 19 | 20 | -- COMMAND ---------- 21 | 22 | CREATE OR REPLACE VIEW ${silver_table_name} 23 | COMMENT 'Silver layer' 24 | AS 25 | SELECT id, upper(name) name 26 | FROM ${raw_table_name} 27 | 28 | -- COMMAND ---------- 29 | 30 | SHOW VIEWS 31 | -------------------------------------------------------------------------------- /PySpark/pyspark-jupyter-poetry/install-pyspark.md: -------------------------------------------------------------------------------- 1 | # Install PySpark 2 | 3 | 1. The official [Apache Spark docs](https://spark.apache.org/docs/latest/api/python/getting_started/install.html) 4 | 1. [Getting Started](https://spark.apache.org/docs/latest/api/python/getting_started/index.html) 5 | 6 | ## Spark Connect 7 | 8 | [Spark Connect](https://spark.apache.org/docs/latest/api/python/getting_started/quickstart_connect.html) 9 | 10 | Start Spark Connect server. 11 | 12 | ```bash 13 | ./sbin/start-connect-server.sh 14 | ``` 15 | 16 | Open http://localhost:4040/connect/ to review the Spark Connect server application UI. 17 | 18 | ```bash 19 | poetry run pyspark --remote sc://localhost:15002 20 | ``` 21 | 22 | In the other terminal, `tail -f` the logs to learn more about the connection. 23 | 24 | ```bash 25 | tail -f /Users/jacek/dev/oss/spark/logs/spark-jacek-org.apache.spark.sql.connect.service.SparkConnectServer-1-Jaceks-Mac-mini.local.out 26 | ``` 27 | 28 | Refresh the Spark Connect server application UI. 29 | 30 | ## JupyterLab 31 | 32 | [JupyterLab: A Next-Generation Notebook Interface](https://jupyter.org/) 33 | 34 | ```bash 35 | poetry run jupyter lab 36 | ``` 37 | -------------------------------------------------------------------------------- /workshops/Databricks Workshop Half-Day 5a.sql: -------------------------------------------------------------------------------- 1 | -- Databricks notebook source 2 | -- MAGIC %md # Databricks Workshop Half-Day 5a 3 | -- MAGIC 4 | -- MAGIC Duration: 2 hours 15 minutes (9:45-12:00) 5 | 6 | -- COMMAND ---------- 7 | 8 | -- MAGIC %md ## Schedule 9 | 10 | -- COMMAND ---------- 11 | 12 | -- MAGIC %md 13 | -- MAGIC 14 | -- MAGIC * The class starts at 9:45 15 | -- MAGIC * A class is split into 1-hour blocks with a 12-minute break each 16 | -- MAGIC * Breaks at the end of an "hour" 17 | -- MAGIC * 09:45 - 10:25 18 | -- MAGIC * 10:45 - 12:00 19 | 20 | -- COMMAND ---------- 21 | 22 | -- MAGIC %md ## Agenda 23 | 24 | -- COMMAND ---------- 25 | 26 | -- MAGIC %md 27 | -- MAGIC 28 | -- MAGIC 1. Databricks Workflows 29 | -- MAGIC 1. Modular Orchestration with Run Job Task 30 | -- MAGIC 1. Conditional Workflows 31 | -- MAGIC 1. Delta Lake (_unlikely and leaving as an option_) 32 | -- MAGIC 1. DESCRIBE HISTORY Command 33 | -- MAGIC 1. 
[REORG TABLE Command](https://books.japila.pl/delta-lake-internals/commands/reorg/) 34 | -------------------------------------------------------------------------------- /Delta Live Tables/Delta Live Tables Python.py: -------------------------------------------------------------------------------- 1 | # Databricks notebook source 2 | # MAGIC %md 3 | # MAGIC 4 | # MAGIC # Delta Live Tables Python API 5 | 6 | # COMMAND ---------- 7 | 8 | # MAGIC %md ## dlt Module 9 | # MAGIC 10 | # MAGIC Delta Live Tables Python functions are defined in the `dlt` module 11 | # MAGIC 12 | # MAGIC ```py 13 | # MAGIC import dlt 14 | # MAGIC ``` 15 | 16 | # COMMAND ---------- 17 | 18 | # MAGIC %md ## @dlt.table Decorator 19 | # MAGIC 20 | # MAGIC Used to define tables (incl. streaming tables) 21 | 22 | # COMMAND ---------- 23 | 24 | # MAGIC %md ## @dlt.view Decorator 25 | 26 | # COMMAND ---------- 27 | 28 | # MAGIC %md 29 | # MAGIC 30 | # MAGIC ## How Dataflow Graph is Rendered 31 | # MAGIC 32 | # MAGIC * The Python `table` and `view` methods must return either a Spark or Koalas `DataFrame` 33 | # MAGIC * DataFrame transformations are executed **after** the full dataflow graph has been resolved 34 | # MAGIC * Non-`table` or `view` functions are executed once at the graph initialization phase 35 | 36 | # COMMAND ---------- 37 | 38 | # MAGIC %md ## Learn More 39 | # MAGIC 40 | # MAGIC * [Delta Live Tables Python language reference](https://docs.databricks.com/en/delta-live-tables/python-ref.html) 41 | -------------------------------------------------------------------------------- /terraform/pipeline.tf: -------------------------------------------------------------------------------- 1 | terraform { 2 | # Each argument in the required_providers block enables one provider 3 | # The key determines the provider's local name (its unique identifier within this module) 4 | required_providers { 5 | databricks = { 6 | source = "databricks/databricks" 7 | version = ">= 1.19.0" 8 | } 9 | } 10 | 11 | required_version = ">= 1.5.0" 12 | } 13 | 14 | # https://developer.hashicorp.com/terraform/language/providers/configuration#default-provider-configurations 15 | # A provider block without an alias argument is the default configuration for that provider. 16 | # Resources that don't set the provider meta-argument 17 | # will use the default provider configuration that matches the first word of the resource type name. 18 | # E.g. 
databricks_repo, databricks_pipeline below 19 | provider "databricks" {} 20 | 21 | resource "databricks_repo" "learn_databricks" { 22 | url = "https://github.com/jaceklaskowski/learn-databricks" 23 | } 24 | 25 | resource "databricks_pipeline" "this" { 26 | name = "My Terraform-deployed DLT Pipeline" 27 | library { 28 | notebook { 29 | path = "${databricks_repo.learn_databricks.path}/Delta Live Tables/my_streaming_table" 30 | } 31 | } 32 | } 33 | -------------------------------------------------------------------------------- /Databricks Asset Bundles/delta_live_tables_demo/scratch/exploration.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "metadata": { 7 | "application/vnd.databricks.v1+cell": { 8 | "cellMetadata": { 9 | "byteLimit": 2048000, 10 | "rowLimit": 10000 11 | }, 12 | "inputWidgets": {}, 13 | "nuid": "6bca260b-13d1-448f-8082-30b60a85c9ae", 14 | "showTitle": false, 15 | "title": "" 16 | } 17 | }, 18 | "outputs": [], 19 | "source": [ 20 | "import sys\n", 21 | "sys.path.append('../src')\n", 22 | "from delta_live_tables_demo import main\n", 23 | "\n", 24 | "main.get_taxis().show(10)" 25 | ] 26 | } 27 | ], 28 | "metadata": { 29 | "application/vnd.databricks.v1+notebook": { 30 | "dashboards": [], 31 | "language": "python", 32 | "notebookMetadata": { 33 | "pythonIndentUnit": 2 34 | }, 35 | "notebookName": "ipynb-notebook", 36 | "widgets": {} 37 | }, 38 | "kernelspec": { 39 | "display_name": "Python 3", 40 | "language": "python", 41 | "name": "python3" 42 | }, 43 | "language_info": { 44 | "name": "python", 45 | "version": "3.11.4" 46 | } 47 | }, 48 | "nbformat": 4, 49 | "nbformat_minor": 0 50 | } 51 | -------------------------------------------------------------------------------- /Generative AI/Retrieval Augmented Generation.sql: -------------------------------------------------------------------------------- 1 | -- Databricks notebook source 2 | -- MAGIC %md # Retrieval Augmented Generation (RAG) 3 | 4 | -- COMMAND ---------- 5 | 6 | -- MAGIC %md 7 | -- MAGIC 8 | -- MAGIC [Retrieval Augmented Generation](https://www.databricks.com/glossary/retrieval-augmented-generation-rag): 9 | -- MAGIC 10 | -- MAGIC > **Retrieval augmented generation** or **RAG** is an architectural approach that can improve the efficacy of large language model (LLM) applications by leveraging custom data. 11 | -- MAGIC 12 | -- MAGIC > This is done by retrieving relevant data/documents relevant to a question or task and providing them as context for the LLM. RAG has shown success in support chatbots and Q&A systems that need to maintain up-to-date information or access domain-specific knowledge. 13 | 14 | -- COMMAND ---------- 15 | 16 | -- MAGIC %md ## Creating High Quality RAG Applications with Databricks 17 | -- MAGIC 18 | -- MAGIC [Creating High Quality RAG Applications with Databricks](https://www.databricks.com/blog/building-high-quality-rag-applications-databricks) 19 | 20 | -- COMMAND ---------- 21 | 22 | -- MAGIC %md 23 | -- MAGIC 24 | -- MAGIC > a powerful way to incorporate proprietary, real-time data into Large Language Model (LLM) applications. 
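The retrieve-then-generate flow described above can be sketched in a few lines of Python. The snippet below is a toy illustration only — the in-memory `documents` list and the `retrieve` and `build_prompt` helpers are made up for this example (they are not Databricks, Vector Search, or Foundation Model APIs); in a real RAG application the retrieval step would query a vector index and the resulting prompt would be sent to a model serving endpoint.

```python
# Toy retrieve-then-generate sketch: rank documents by keyword overlap,
# then inject the best matches as context into the prompt for an LLM.
documents = [
    "Delta Live Tables pipelines are configured with notebooks or workspace files.",
    "Databricks Asset Bundles describe jobs and pipelines as YAML resources.",
]

def retrieve(question: str, docs: list[str], k: int = 1) -> list[str]:
    # Hypothetical scorer: naive keyword overlap between question and document.
    def score(doc: str) -> int:
        return len(set(question.lower().split()) & set(doc.lower().split()))
    return sorted(docs, key=score, reverse=True)[:k]

def build_prompt(question: str, context: list[str]) -> str:
    # The retrieved documents become the context the LLM is asked to rely on.
    joined = "\n".join(f"- {c}" for c in context)
    return f"Answer using only this context:\n{joined}\n\nQuestion: {question}"

question = "How are Delta Live Tables pipelines configured?"
prompt = build_prompt(question, retrieve(question, documents))
print(prompt)  # In a real application, this prompt is sent to an LLM serving endpoint.
```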
25 | -------------------------------------------------------------------------------- /review_me/My Very First Notebook.py: -------------------------------------------------------------------------------- 1 | # Databricks notebook source 2 | dbutils.widgets.text(name='name', defaultValue='Jacek', label='Name') 3 | 4 | # COMMAND ---------- 5 | 6 | table_name = dbutils.widgets.get('name') 7 | 8 | # COMMAND ---------- 9 | 10 | # MAGIC %md 11 | # MAGIC 12 | # MAGIC # SQL 13 | 14 | # COMMAND ---------- 15 | 16 | # MAGIC %sql 17 | # MAGIC 18 | # MAGIC SHOW SCHEMAS 19 | 20 | # COMMAND ---------- 21 | 22 | # MAGIC %sql 23 | # MAGIC 24 | # MAGIC -- catalog'.'schema'.'table 25 | # MAGIC -- database.table 26 | # MAGIC SELECT '${name}' 27 | 28 | # COMMAND ---------- 29 | 30 | # MAGIC %sql 31 | # MAGIC 32 | # MAGIC SHOW TABLES LIKE '${name}' 33 | 34 | # COMMAND ---------- 35 | 36 | # MAGIC %sql 37 | # MAGIC 38 | # MAGIC SHOW TABLES LIKE '${name}' 39 | 40 | # COMMAND ---------- 41 | 42 | # MAGIC %md # Some Python 43 | 44 | # COMMAND ---------- 45 | 46 | print('Bonjour a tous !') 47 | 48 | # COMMAND ---------- 49 | 50 | # MAGIC %md 51 | # MAGIC 52 | # MAGIC # Some Scala 53 | 54 | # COMMAND ---------- 55 | 56 | # MAGIC %scala 57 | # MAGIC println("How are you today?") 58 | 59 | # COMMAND ---------- 60 | 61 | # MAGIC %md ## Questions 62 | 63 | # COMMAND ---------- 64 | 65 | # MAGIC %md 66 | # MAGIC 67 | # MAGIC 1. Where to find notebooks? 68 | -------------------------------------------------------------------------------- /Databricks Asset Bundles/delta_live_tables_demo/setup.py: -------------------------------------------------------------------------------- 1 | """ 2 | setup.py configuration script describing how to build and package this project. 3 | 4 | This file is primarily used by the setuptools library and typically should not 5 | be executed directly. See README.md for how to deploy, test, and run 6 | the delta_live_tables_demo project. 7 | """ 8 | from setuptools import setup, find_packages 9 | 10 | import sys 11 | sys.path.append('./src') 12 | 13 | import delta_live_tables_demo 14 | 15 | setup( 16 | name="delta_live_tables_demo", 17 | version=delta_live_tables_demo.__version__, 18 | url="https://databricks.com", 19 | author="jacek@japila.pl", 20 | description="wheel file based on delta_live_tables_demo/src", 21 | packages=find_packages(where='./src'), 22 | package_dir={'': 'src'}, 23 | entry_points={ 24 | "packages": [ 25 | "main=delta_live_tables_demo.main:main" 26 | ] 27 | }, 28 | install_requires=[ 29 | # Dependencies in case the output wheel file is used as a library dependency. 
30 | # For defining dependencies, when this package is used in Databricks, see: 31 | # https://docs.databricks.com/dev-tools/bundles/library-dependencies.html 32 | "setuptools" 33 | ], 34 | ) 35 | -------------------------------------------------------------------------------- /Delta Live Tables/Deep Dive into DLTs.sql: -------------------------------------------------------------------------------- 1 | -- Databricks notebook source 2 | -- MAGIC %md # Deep Dive into DLTs 3 | 4 | -- COMMAND ---------- 5 | 6 | -- MAGIC %py 7 | -- MAGIC 8 | -- MAGIC dbutils.widgets.text( 9 | -- MAGIC name='storage_location', 10 | -- MAGIC defaultValue='Please specify Storage location', 11 | -- MAGIC label='Storage location') 12 | 13 | -- COMMAND ---------- 14 | 15 | -- MAGIC %py 16 | -- MAGIC 17 | -- MAGIC storage_location = dbutils.widgets.get('storage_location') 18 | 19 | -- COMMAND ---------- 20 | 21 | -- MAGIC %python 22 | -- MAGIC 23 | -- MAGIC # %fs ls dbfs:/pipelines/75fb9324-5321-4be6-b9ca-a3a8f9b47a9b 24 | -- MAGIC display(dbutils.fs.ls(storage_location)) 25 | 26 | -- COMMAND ---------- 27 | 28 | -- MAGIC %python 29 | -- MAGIC 30 | -- MAGIC # %fs ls dbfs:/pipelines/75fb9324-5321-4be6-b9ca-a3a8f9b47a9b/tables/ 31 | -- MAGIC display(dbutils.fs.ls(f'{storage_location}/tables')) 32 | 33 | -- COMMAND ---------- 34 | 35 | SELECT '${storage_location}' 36 | 37 | -- COMMAND ---------- 38 | 39 | select * from delta.`${storage_location}/system/events` 40 | 41 | -- COMMAND ---------- 42 | 43 | DESCRIBE HISTORY delta.`${storage_location}/system/events` 44 | 45 | -- COMMAND ---------- 46 | 47 | SELECT count(*) FROM delta.`${storage_location}/system/events`@v524 48 | -------------------------------------------------------------------------------- /Delta Lake/Merge.sql: -------------------------------------------------------------------------------- 1 | -- Databricks notebook source 2 | -- MAGIC %md # Merge 3 | 4 | -- COMMAND ---------- 5 | 6 | -- MAGIC %md ## Examples 7 | 8 | -- COMMAND ---------- 9 | 10 | -- MAGIC %md ### Conditional Update with Delete 11 | 12 | -- COMMAND ---------- 13 | 14 | DROP TABLE IF EXISTS source; 15 | DROP TABLE IF EXISTS target; 16 | 17 | -- COMMAND ---------- 18 | 19 | CREATE TABLE source 20 | USING delta 21 | AS VALUES 22 | (0, 0), 23 | (1, 10), 24 | (2, 20) AS data(key, value); 25 | 26 | -- COMMAND ---------- 27 | 28 | select * from source; 29 | 30 | -- COMMAND ---------- 31 | 32 | CREATE TABLE target 33 | USING delta 34 | AS VALUES 35 | (1, 1), 36 | (2, 2), 37 | (3, 3) AS data(key, value); 38 | 39 | -- COMMAND ---------- 40 | 41 | select * from target; 42 | 43 | -- COMMAND ---------- 44 | 45 | MERGE INTO target t 46 | USING source s 47 | ON s.key = t.key 48 | WHEN MATCHED AND s.key <> 1 THEN UPDATE SET key = s.key, value = s.value 49 | WHEN MATCHED THEN DELETE 50 | 51 | -- COMMAND ---------- 52 | 53 | select * from target; 54 | 55 | -- COMMAND ---------- 56 | 57 | -- MAGIC %md 58 | -- MAGIC 59 | -- MAGIC ## Learning Resources 60 | 61 | -- COMMAND ---------- 62 | 63 | -- MAGIC %md 64 | -- MAGIC 65 | -- MAGIC * [The Internals of Delta Lake](https://books.japila.pl/delta-lake-internals/commands/merge/) 66 | -------------------------------------------------------------------------------- /review_me/Python example.py: -------------------------------------------------------------------------------- 1 | # Databricks notebook source 2 | # MAGIC %md 3 | # MAGIC 4 | # MAGIC # **Matplotlib** 5 | # MAGIC You can display Matplotlib objects in Python notebooks. 
6 | 7 | # COMMAND ---------- 8 | 9 | # in DBR 6.4 and below, uncomment the line below 10 | # %matplotlib inline 11 | 12 | # COMMAND ---------- 13 | 14 | import numpy as np 15 | import matplotlib.pyplot as plt 16 | 17 | x = np.linspace(0, 2*np.pi, 50) 18 | y = np.sin(x) 19 | y2 = y + 0.1 * np.random.normal(size=x.shape) 20 | 21 | fig, ax = plt.subplots() 22 | ax.plot(x, y, 'k--') 23 | ax.plot(x, y2, 'ro') 24 | 25 | # set ticks and tick labels 26 | ax.set_xlim((0, 2*np.pi)) 27 | ax.set_xticks([0, np.pi, 2*np.pi]) 28 | ax.set_xticklabels(['0', '$\pi$','2$\pi$']) 29 | ax.set_ylim((-1.5, 1.5)) 30 | ax.set_yticks([-1, 0, 1]) 31 | 32 | # Only draw spine between the y-ticks 33 | ax.spines['left'].set_bounds(-1, 1) 34 | # Hide the right and top spines 35 | ax.spines['right'].set_visible(False) 36 | ax.spines['top'].set_visible(False) 37 | # Only show ticks on the left and bottom spines 38 | ax.yaxis.set_ticks_position('left') 39 | ax.xaxis.set_ticks_position('bottom') 40 | 41 | # COMMAND ---------- 42 | 43 | # MAGIC %md In Databricks Runtime 6.2 and below, run the `display` command to view the plot. 44 | 45 | # COMMAND ---------- 46 | 47 | display(fig) 48 | -------------------------------------------------------------------------------- /Databricks Workflows/Step 3. Build Aggregates.sql: -------------------------------------------------------------------------------- 1 | -- Databricks notebook source 2 | -- MAGIC %md # Build Aggregates 3 | -- MAGIC 4 | -- MAGIC ...for presentation layer 5 | 6 | -- COMMAND ---------- 7 | 8 | -- MAGIC %python 9 | -- MAGIC 10 | -- MAGIC # Creates a text input widget with a given name and default value. 11 | -- MAGIC # Notebook Widgets are only for Run all (when executed outside a job) 12 | -- MAGIC dbutils.widgets.removeAll() 13 | -- MAGIC dbutils.widgets.text(name = "database_name", defaultValue = "jaceklaskowski", label = "Database Name") 14 | -- MAGIC dbutils.widgets.text(name = "raw_table_name", defaultValue = "workflows_raw_data", label = "Raw Table Name") 15 | -- MAGIC dbutils.widgets.text(name = "silver_table_name", defaultValue = "workflows_transform", label = "Silver Table Name") 16 | -- MAGIC dbutils.widgets.text(name = "gold_table_name", defaultValue = "workflows_aggregates", label = "Gold Table Name") 17 | 18 | -- COMMAND ---------- 19 | 20 | USE ${database_name} 21 | 22 | -- COMMAND ---------- 23 | 24 | CREATE OR REPLACE VIEW ${gold_table_name} 25 | COMMENT 'Golden layer' 26 | AS 27 | SELECT length(name) % 2 gid, count(name) count, collect_set(name) names 28 | FROM ${silver_table_name} 29 | GROUP BY 1 30 | 31 | -- COMMAND ---------- 32 | 33 | SHOW VIEWS 34 | 35 | -- COMMAND ---------- 36 | 37 | SELECT * FROM ${gold_table_name} 38 | -------------------------------------------------------------------------------- /Databricks Asset Bundles/Job and Task Parameters.py: -------------------------------------------------------------------------------- 1 | # Databricks notebook source 2 | # MAGIC %md 3 | # MAGIC 4 | # MAGIC # Job and Task Parameters 5 | 6 | # COMMAND ---------- 7 | 8 | # DBTITLE 0,job.yml 9 | # MAGIC %md 10 | # MAGIC 11 | # MAGIC ``` 12 | # MAGIC resources: 13 | # MAGIC jobs: 14 | # MAGIC demo_job: 15 | # MAGIC name: demo_job 16 | # MAGIC description: My custom description that should describe the purpose of this job 17 | # MAGIC # https://docs.databricks.com/api/workspace/jobs/create#parameters 18 | # MAGIC # Job-level parameters 19 | # MAGIC parameters: 20 | # MAGIC - name: jacek_custom_variable 21 | # MAGIC default: FIXME_parameters 22 | # 
MAGIC tasks: 23 | # MAGIC - task_key: notebook_task 24 | # MAGIC existing_cluster_id: ${var.my-human-readable-name} 25 | # MAGIC notebook_task: 26 | # MAGIC notebook_path: ../src/notebook.ipynb 27 | # MAGIC # https://docs.databricks.com/api/workspace/jobs/create#tasks-notebook_task-base_parameters 28 | # MAGIC # Base parameters used for each run of this job 29 | # MAGIC # Parameters at job level take precedence 30 | # MAGIC # Use dbutils.widgets.get to access the value 31 | # MAGIC base_parameters: 32 | # MAGIC jacek_custom_variable: FIXME_base_parameters 33 | # MAGIC ``` 34 | -------------------------------------------------------------------------------- /Delta Live Tables/DLT Lab.py: -------------------------------------------------------------------------------- 1 | # Databricks notebook source 2 | # MAGIC %md # DLT Lab 3 | 4 | # COMMAND ---------- 5 | 6 | # MAGIC %md 7 | # MAGIC 8 | # MAGIC 1. Create a 3-table DLT pipeline 9 | # MAGIC 1. 3 tables for each layer (bronze, silver, gold) 10 | # MAGIC 1. A DLT pipeline based on 1 (at least) or (better / highly recommended) many notebooks 11 | # MAGIC 1. `CREATE TABLE` regular table (non-live) that you can use to `INSERT` records into so your pipeline can digest it and do all the transformations 12 | # MAGIC 1. Think of JSON-encoded medical records 13 | # MAGIC 1. A raw table = JSON intact 14 | # MAGIC 1. A silver table = JSON flatten out (`explode` standard function + `:` JSON access pattern) 15 | # MAGIC 1. A(nother) silver table = some unification (e.g. LonDON, london, LONDON) 16 | # MAGIC 1. A Gold table = some aggs (`count`s = how many people live in different cities or hobbies) 17 | 18 | # COMMAND ---------- 19 | 20 | # MAGIC %md 21 | # MAGIC 22 | # MAGIC ## Hint: Create pipeline with blank notebook 23 | # MAGIC 24 | # MAGIC **Source code** are the paths to notebooks or files that contain pipeline source code. 25 | # MAGIC 26 | # MAGIC Paths can be modified after the pipeline is created. 27 | # MAGIC 28 | # MAGIC With no source code specified, Databricks will create an empty notebook for the pipeline. 29 | # MAGIC 30 | # MAGIC You can edit this notebook later. 31 | -------------------------------------------------------------------------------- /demo/bestsellers/pipeline_clone.py: -------------------------------------------------------------------------------- 1 | # Databricks notebook source 2 | # MAGIC %md # Exercise: Finding 1st and 2nd Bestsellers Per Genre 3 | 4 | # COMMAND ---------- 5 | 6 | # MAGIC %md 7 | # MAGIC 8 | # MAGIC This is a DLT pipeline for [Exercise: Finding 1st and 2nd Bestsellers Per Genre](https://jaceklaskowski.github.io/spark-workshop/exercises/sql/Finding-1st-and-2nd-Bestsellers-Per-Genre.html). 9 | 10 | # COMMAND ---------- 11 | 12 | # MAGIC %md 13 | # MAGIC 14 | # MAGIC 1. 
Create a live table (with raw data) = `books` table 15 | 16 | # COMMAND ---------- 17 | 18 | import dlt 19 | from pyspark.sql import DataFrame 20 | 21 | @dlt.table 22 | def books() -> DataFrame: 23 | return spark.range(4) 24 | 25 | # COMMAND ---------- 26 | 27 | # MAGIC %md 28 | # MAGIC 29 | # MAGIC -- CREATE TABLE jacek_laskowski.books 30 | # MAGIC -- OPTIONS (header=true) 31 | # MAGIC -- AS SELECT * FROM csv.`/FileStore/books.csv` 32 | 33 | # COMMAND ---------- 34 | 35 | # MAGIC %md 36 | # MAGIC 37 | # MAGIC -- SELECT * FROM jacek_laskowski.books 38 | 39 | # COMMAND ---------- 40 | 41 | # MAGIC %sql 42 | # MAGIC -- CREATE LIVE TABLE books 43 | # MAGIC -- OPTIONS (header=true) 44 | # MAGIC -- AS SELECT * FROM csv.`/FileStore/books.csv` 45 | 46 | # COMMAND ---------- 47 | 48 | # MAGIC %md 49 | # MAGIC 50 | # MAGIC %scala 51 | # MAGIC 52 | # MAGIC val books = spark 53 | # MAGIC .read 54 | # MAGIC .option("header", true) 55 | # MAGIC .option("inferSchema", true) 56 | # MAGIC .csv("/FileStore/books.csv") 57 | # MAGIC books.schema 58 | -------------------------------------------------------------------------------- /Databricks Asset Bundles/delta_live_tables_demo/my_project/README.md: -------------------------------------------------------------------------------- 1 | # my_project 2 | 3 | The 'my_project' project was generated by using the default-python template. 4 | 5 | ## Getting started 6 | 7 | 1. Install the Databricks CLI from https://docs.databricks.com/dev-tools/cli/databricks-cli.html 8 | 9 | 2. Authenticate to your Databricks workspace: 10 | ``` 11 | $ databricks configure 12 | ``` 13 | 14 | 3. To deploy a development copy of this project, type: 15 | ``` 16 | $ databricks bundle deploy --target dev 17 | ``` 18 | (Note that "dev" is the default target, so the `--target` parameter 19 | is optional here.) 20 | 21 | This deploys everything that's defined for this project. 22 | For example, the default template would deploy a job called 23 | `[dev yourname] my_project_job` to your workspace. 24 | You can find that job by opening your workspace and clicking on **Workflows**. 25 | 26 | 4. Similarly, to deploy a production copy, type: 27 | ``` 28 | $ databricks bundle deploy --target prod 29 | ``` 30 | 31 | 5. To run a job or pipeline, use the "run" command: 32 | ``` 33 | $ databricks bundle run 34 | ``` 35 | 36 | 6. Optionally, install developer tools such as the Databricks extension for Visual Studio Code from 37 | https://docs.databricks.com/dev-tools/vscode-ext.html. 38 | 39 | 7. For documentation on the Databricks asset bundles format used 40 | for this project, and for CI/CD configuration, see 41 | https://docs.databricks.com/dev-tools/bundles/index.html. -------------------------------------------------------------------------------- /Generative AI/llm-rag-chatbot/_resources/NOTICE.py: -------------------------------------------------------------------------------- 1 | # Databricks notebook source 2 | # MAGIC %md 3 | # MAGIC ## Licence 4 | # MAGIC See LICENSE file. 5 | # MAGIC 6 | # MAGIC ## Data collection 7 | # MAGIC To improve user experience and dbdemos asset quality, dbdemos sends usage reports and captures views in the installed notebook (usually in the first cell) and other assets like dashboards. This information is captured for product improvement only and not for marketing purposes, and doesn't contain PII information. By using `dbdemos` and the assets it provides, you consent to this data collection. 
If you wish to disable it, you can set `Tracker.enable_tracker` to False in the `tracker.py` file. 8 | # MAGIC 9 | # MAGIC ## Resource creation 10 | # MAGIC To simplify your experience, `dbdemos` will create and start for you resources. As example, a demo could start (not exhaustive): 11 | # MAGIC - A cluster to run your demo 12 | # MAGIC - A Delta Live Table Pipeline to ingest data 13 | # MAGIC - A DBSQL endpoint to run DBSQL dashboard 14 | # MAGIC - An ML model 15 | # MAGIC 16 | # MAGIC While `dbdemos` does its best to limit the consumption and enforce resource auto-termination, you remain responsible for the resources created and the potential consumption associated. 17 | # MAGIC 18 | # MAGIC ## Support 19 | # MAGIC Databricks does not offer official support for `dbdemos` and the associated assets. 20 | # MAGIC For any issue with `dbdemos` or the demos installed, please open an issue and the demo team will have a look on a best effort basis. 21 | # MAGIC 22 | # MAGIC 23 | -------------------------------------------------------------------------------- /Databricks Workflows/for_each_task_demo/src/Load_googlesheets_csv.py: -------------------------------------------------------------------------------- 1 | # Databricks notebook source 2 | # MAGIC %md 3 | # MAGIC 4 | # MAGIC # Load googlesheets csv Notebook 5 | 6 | # COMMAND ---------- 7 | 8 | # MAGIC %md 9 | # MAGIC 10 | # MAGIC This notebook is part of the For each task Demo. 11 | 12 | # COMMAND ---------- 13 | 14 | # MAGIC %md ## Step 1. Load CSV file 15 | # MAGIC 16 | # MAGIC It could load a CSV file with `google_spreadsheet`s (to load in parallel in the For each task). 17 | 18 | # COMMAND ---------- 19 | 20 | google_spreadsheet_df = spark.createDataFrame( 21 | [ 22 | ("Non-logistics - Energy", "[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16]", "[1]"), 23 | ("Use Phase Factors", "[0,1,2,3,4,5,6]", "[1]"), 24 | ], 25 | "google_spreadsheet string, use_cols string, skip_rows string", 26 | ) 27 | 28 | # COMMAND ---------- 29 | 30 | display(google_spreadsheet_df) 31 | 32 | # COMMAND ---------- 33 | 34 | # MAGIC %md 35 | # MAGIC 36 | # MAGIC ## Step 2. Collect Data 37 | 38 | # COMMAND ---------- 39 | 40 | gsheets = google_spreadsheet_df.toJSON().collect() 41 | 42 | # COMMAND ---------- 43 | 44 | print(gsheets) 45 | 46 | # COMMAND ---------- 47 | 48 | print(type(gsheets)) 49 | 50 | # COMMAND ---------- 51 | 52 | print(type(gsheets[0])) 53 | 54 | # COMMAND ---------- 55 | 56 | # MAGIC %md 57 | # MAGIC 58 | # MAGIC ## Step 3. Define Task Value 59 | 60 | # COMMAND ---------- 61 | 62 | help(dbutils.jobs.taskValues.set) 63 | 64 | # COMMAND ---------- 65 | 66 | dbutils.jobs.taskValues.set(key='gsheets', value=gsheets) 67 | -------------------------------------------------------------------------------- /demo/uv_workflow/resources/uv_workflow.job.yml: -------------------------------------------------------------------------------- 1 | # The main job for uv_workflow. 
2 | resources: 3 | jobs: 4 | uv_workflow_job: 5 | name: uv_workflow_job 6 | 7 | trigger: 8 | # Run this job every day, exactly one day from the last run; see https://docs.databricks.com/api/workspace/jobs/create#trigger 9 | periodic: 10 | interval: 1 11 | unit: DAYS 12 | 13 | email_notifications: 14 | on_failure: 15 | - jacek@japila.pl 16 | 17 | tasks: 18 | - task_key: notebook_task 19 | job_cluster_key: job_cluster 20 | notebook_task: 21 | notebook_path: ../src/notebook.ipynb 22 | 23 | - task_key: main_task 24 | depends_on: 25 | - task_key: notebook_task 26 | 27 | job_cluster_key: job_cluster 28 | python_wheel_task: 29 | package_name: uv_workflow 30 | entry_point: main 31 | libraries: 32 | # By default we just include the .whl file generated for the uv_workflow package. 33 | # See https://docs.databricks.com/dev-tools/bundles/library-dependencies.html 34 | # for more information on how to add other libraries. 35 | - whl: ../dist/*.whl 36 | 37 | job_clusters: 38 | - job_cluster_key: job_cluster 39 | new_cluster: 40 | spark_version: 15.4.x-scala2.12 41 | node_type_id: i3.xlarge 42 | autoscale: 43 | min_workers: 1 44 | max_workers: 4 45 | -------------------------------------------------------------------------------- /Databricks SQL/Queries.py: -------------------------------------------------------------------------------- 1 | # Databricks notebook source 2 | # MAGIC %md # Databricks SQL » Queries 3 | 4 | # COMMAND ---------- 5 | 6 | # MAGIC %md 7 | # MAGIC 8 | # MAGIC 1. Queries are associated with a catalog and a schema 9 | # MAGIC 1. Queries can be used as tasks in Workflow jobs (see [Workflow Jobs]($../Workflow Jobs/Databricks Jobs)) 10 | # MAGIC 1. The results of executing queries can be added to dashboards (see [Dashboards]($./Dashboards)) 11 | 12 | # COMMAND ---------- 13 | 14 | # MAGIC %md 15 | # MAGIC 16 | # MAGIC ```sql 17 | # MAGIC SELECT 18 | # MAGIC * 19 | # MAGIC FROM 20 | # MAGIC book_ranks 21 | # MAGIC WHERE 22 | # MAGIC book_rank in (1, 2) 23 | # MAGIC ``` 24 | 25 | # COMMAND ---------- 26 | 27 | # MAGIC %md ## Parametrized Queries 28 | # MAGIC 29 | # MAGIC [Query parameters](https://docs.databricks.com/en/sql/user/queries/query-parameters.html) 30 | 31 | # COMMAND ---------- 32 | 33 | # MAGIC %md 34 | # MAGIC 35 | # MAGIC 1. Queries can be parameterized with curly brackets (`{{ table_pattern }}`) 36 | # MAGIC 1. Substitute values into a query at runtime 37 | # MAGIC 1. A widget appears above the results pane 38 | # MAGIC 1. Query parameters are more flexible than query filters, and should only be used in cases where query filters are not sufficient 39 | # MAGIC 1. `Cmd + I` to define a query parameter at the text caret 40 | # MAGIC 1. Click **Apply Changes** to run a query with a parameter value 41 | 42 | # COMMAND ---------- 43 | 44 | # MAGIC %md 45 | # MAGIC 46 | # MAGIC ```sql 47 | # MAGIC show tables like {{ table_pattern }} 48 | # MAGIC ``` 49 | -------------------------------------------------------------------------------- /Delta Lake/DESCRIBE HISTORY.sql: -------------------------------------------------------------------------------- 1 | -- Databricks notebook source 2 | -- MAGIC %md # DESCRIBE HISTORY 3 | -- MAGIC 4 | -- MAGIC `DESCRIBE HISTORY` command can be used in subqueries in Delta Lake (on [Databricks only](https://twitter.com/jaceklaskowski/status/1733466666749526278)). 
5 | 6 | -- COMMAND ---------- 7 | 8 | -- https://docs.databricks.com/en/sql/language-manual/sql-ref-syntax-ddl-create-table-using.html 9 | CREATE TABLE my_students (id INT, name STRING, age INT); 10 | 11 | -- COMMAND ---------- 12 | 13 | -- https://docs.databricks.com/en/sql/language-manual/sql-ref-syntax-dml-insert-into.html 14 | INSERT INTO my_students 15 | VALUES 16 | (0, 'Jacek', 50); 17 | 18 | -- COMMAND ---------- 19 | 20 | SELECT * 21 | FROM ( 22 | DESCRIBE HISTORY my_students 23 | ) 24 | WHERE version = 1; 25 | 26 | -- COMMAND ---------- 27 | 28 | -- MAGIC %scala 29 | -- MAGIC 30 | -- MAGIC val q = sql("""SELECT * 31 | -- MAGIC FROM ( 32 | -- MAGIC DESCRIBE HISTORY my_students 33 | -- MAGIC ) 34 | -- MAGIC WHERE version = 1;""") 35 | -- MAGIC q.explain(extended = true) 36 | 37 | -- COMMAND ---------- 38 | 39 | -- MAGIC %md ## DESCRIBE HISTORY Command 40 | -- MAGIC 41 | -- MAGIC A little about the internals of [DESCRIBE HISTORY Command](https://books.japila.pl/delta-lake-internals/commands/describe-history/) 42 | -- MAGIC 43 | -- MAGIC * a mere wrapper around `DeltaHistoryManager` to access the history of a delta table 44 | -- MAGIC * Possible Cost Optimization on Microsoft Azure using `spark.databricks.delta.history.maxKeysPerList` configuration property 45 | 46 | -- COMMAND ---------- 47 | 48 | SET spark.databricks.delta.history.maxKeysPerList 49 | -------------------------------------------------------------------------------- /meetups/Meetup_2025_01_09.sql: -------------------------------------------------------------------------------- 1 | -- Databricks notebook source 2 | -- MAGIC %md # Meetup 2025-01-09 3 | -- MAGIC 4 | -- MAGIC ➡️ [Deploying Databricks Workflows with uv and Databricks Asset Bundles](https://www.meetup.com/warsaw-data-engineering/events/305473028/) 5 | -- MAGIC 6 | -- MAGIC Agenda: 7 | -- MAGIC 8 | -- MAGIC 1. 5 minut rogrzewki na luźne pomysły na ten i przyszłe meetupy 9 | -- MAGIC * News (new versions, etc.) 10 | -- MAGIC 1. 50 minut Live coding session, a w nim: 11 | -- MAGIC * Stworzenie projektu w Pythonie z uv 12 | -- MAGIC * Stworzenie Databricks job z notebookiem z naszym projektem w Pythonie wyżej (wszystko ręcznie / klikamy w UI / pełny manual) 13 | -- MAGIC * Automatyzacja z Databricks Asset Bundles (DAB) 14 | -- MAGIC 1. Q&A / Zbieranie pomysłów na kolejne edycje (5 minut) 15 | 16 | -- COMMAND ---------- 17 | 18 | -- MAGIC %md # News 19 | 20 | -- COMMAND ---------- 21 | 22 | -- MAGIC %md 23 | -- MAGIC 24 | -- MAGIC ## New Versions 25 | -- MAGIC 26 | -- MAGIC * [uv 0.5.16](https://github.com/astral-sh/uv/releases/tag/0.5.16) 27 | -- MAGIC * [Databricks CLI 0.238.0](https://github.com/databricks/cli/releases/tag/v0.238.0) 28 | -- MAGIC * [Delta Lake 3.3.0](https://github.com/delta-io/delta/releases/tag/v3.3.0) 29 | -- MAGIC * [awscli 2.22.31](https://github.com/aws/aws-cli/releases/tag/2.22.31) 30 | 31 | -- COMMAND ---------- 32 | 33 | -- MAGIC %md 34 | -- MAGIC 35 | -- MAGIC ## Open focus mode 36 | -- MAGIC 37 | -- MAGIC [Databricks notebook interface and controls](https://docs.databricks.com/en/notebooks/notebook-ui.html) 38 | 39 | -- COMMAND ---------- 40 | 41 | -- MAGIC %md 42 | -- MAGIC 43 | -- MAGIC # Live Coding Session 44 | -------------------------------------------------------------------------------- /meetups/Meetup_2025_02_06.sql: -------------------------------------------------------------------------------- 1 | -- Databricks notebook source 2 | -- MAGIC %md # Data Quality in Databricks Workflows with Pydantic cntd. 
3 | -- MAGIC 4 | -- MAGIC ➡️ [Meetup Announcement](https://www.meetup.com/warsaw-data-engineering/events/305995327/) 5 | -- MAGIC 6 | -- MAGIC Zakładamy, że mamy 2 projekty. Pierwszy projekt z pydantic (libka w Pythonie), a drugi to "hello world" Databricks Asset Bundle project z przykładowym job'em. Nic specjalnie wyrafinowanego. Od tego zaczniemy. 7 | -- MAGIC 8 | -- MAGIC Agenda: 9 | -- MAGIC 10 | -- MAGIC 1. 5 minut rozgrzewki na luźne pomysły na ten i przyszłe meetupy 11 | -- MAGIC * News (new versions, new features, etc.) 12 | -- MAGIC 1. 50 minut Live coding session, a w nim: 13 | -- MAGIC * Za pomocą Databricks Asset Bundles (DAB), uruchomisz Databricks job z notebookiem z libką w Pythonie z Pydantic (takie tam "hello world"). Wszystko z pomocą uv do zarządzania projektem. 14 | -- MAGIC * Stworzymy UDFa do walidacji rekordów, którego "uzbroimy" w pydantic'a. To główny cel meetupu, którego osiągnięcie będzie naszym "najosobityczniejszym" sukcesem 🥂 15 | -- MAGIC * Może coś jeszcze, ale nie zdradzę teraz 🤷‍♂️ 16 | -- MAGIC 1. 5 minut Q&A / Zbieranie pomysłów na kolejne edycje 17 | 18 | -- COMMAND ---------- 19 | 20 | -- MAGIC %md # 📢 News 21 | 22 | -- COMMAND ---------- 23 | 24 | -- MAGIC %md 25 | -- MAGIC 26 | -- MAGIC ## New Versions 27 | -- MAGIC 28 | -- MAGIC * [uv 0.5.29](https://github.com/astral-sh/uv/releases/tag/0.5.29) 29 | -- MAGIC * [awscli 2.24.0](https://github.com/aws/aws-cli/releases/tag/2.24.0) 30 | 31 | -- COMMAND ---------- 32 | 33 | -- MAGIC %md 34 | -- MAGIC 35 | -- MAGIC # Live Coding Session 36 | -------------------------------------------------------------------------------- /Databricks Asset Bundles/delta_live_tables_demo/src/notebook.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": { 6 | "application/vnd.databricks.v1+cell": { 7 | "cellMetadata": {}, 8 | "inputWidgets": {}, 9 | "nuid": "ee353e42-ff58-4955-9608-12865bd0950e", 10 | "showTitle": false, 11 | "title": "" 12 | } 13 | }, 14 | "source": [ 15 | "# Default notebook\n", 16 | "\n", 17 | "This default notebook is executed using Databricks Workflows as defined in resources/delta_live_tables_demo_job.yml." 
18 | ] 19 | }, 20 | { 21 | "cell_type": "code", 22 | "execution_count": 0, 23 | "metadata": { 24 | "application/vnd.databricks.v1+cell": { 25 | "cellMetadata": { 26 | "byteLimit": 2048000, 27 | "rowLimit": 10000 28 | }, 29 | "inputWidgets": {}, 30 | "nuid": "6bca260b-13d1-448f-8082-30b60a85c9ae", 31 | "showTitle": false, 32 | "title": "" 33 | } 34 | }, 35 | "outputs": [], 36 | "source": [ 37 | "from delta_live_tables_demo import main\n", 38 | "\n", 39 | "main.get_taxis().show(10)" 40 | ] 41 | } 42 | ], 43 | "metadata": { 44 | "application/vnd.databricks.v1+notebook": { 45 | "dashboards": [], 46 | "language": "python", 47 | "notebookMetadata": { 48 | "pythonIndentUnit": 2 49 | }, 50 | "notebookName": "notebook", 51 | "widgets": {} 52 | }, 53 | "kernelspec": { 54 | "display_name": "Python 3", 55 | "language": "python", 56 | "name": "python3" 57 | }, 58 | "language_info": { 59 | "name": "python", 60 | "version": "3.11.4" 61 | } 62 | }, 63 | "nbformat": 4, 64 | "nbformat_minor": 0 65 | } 66 | -------------------------------------------------------------------------------- /Databricks Asset Bundles/delta_live_tables_demo/README.md: -------------------------------------------------------------------------------- 1 | # delta_live_tables_demo 2 | 3 | The 'delta_live_tables_demo' project was generated by using the default-python template. 4 | 5 | ## Getting started 6 | 7 | 1. Install the Databricks CLI from https://docs.databricks.com/dev-tools/cli/databricks-cli.html 8 | 9 | 2. Authenticate to your Databricks workspace: 10 | ``` 11 | $ databricks configure 12 | ``` 13 | 14 | 3. To deploy a development copy of this project, type: 15 | ``` 16 | $ databricks bundle deploy --target dev 17 | ``` 18 | (Note that "dev" is the default target, so the `--target` parameter 19 | is optional here.) 20 | 21 | This deploys everything that's defined for this project. 22 | For example, the default template would deploy a job called 23 | `[dev yourname] delta_live_tables_demo_job` to your workspace. 24 | You can find that job by opening your workspace and clicking on **Workflows**. 25 | 26 | 4. Similarly, to deploy a production copy, type: 27 | ``` 28 | $ databricks bundle deploy --target prod 29 | ``` 30 | 31 | 5. To run a job or pipeline, use the "run" command: 32 | ``` 33 | $ databricks bundle run 34 | ``` 35 | 36 | 6. Optionally, install developer tools such as the Databricks extension for Visual Studio Code from 37 | https://docs.databricks.com/dev-tools/vscode-ext.html. Or read the "getting started" documentation for 38 | **Databricks Connect** for instructions on running the included Python code from a different IDE. 39 | 40 | 7. For documentation on the Databricks asset bundles format used 41 | for this project, and for CI/CD configuration, see 42 | https://docs.databricks.com/dev-tools/bundles/index.html. 
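A typical development loop with this bundle can be chained together as follows; the `delta_live_tables_demo_job` resource key comes from `resources/delta_live_tables_demo_job.yml`, and the exact flags may differ across Databricks CLI versions:

```console
$ databricks bundle validate
$ databricks bundle deploy --target dev
$ databricks bundle run delta_live_tables_demo_job --target dev
```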
43 | -------------------------------------------------------------------------------- /review_me/CREATE TABLE IF NOT EXISTS jacek_laskowski.my_table.py: -------------------------------------------------------------------------------- 1 | # Databricks notebook source 2 | # MAGIC %md # Day 1 Exercise One 3 | # MAGIC 4 | # MAGIC Create a table 5 | 6 | # COMMAND ---------- 7 | 8 | # MAGIC %md ## Define Variable 9 | 10 | # COMMAND ---------- 11 | 12 | # MAGIC %md ## Execute Code (Python) 13 | 14 | # COMMAND ---------- 15 | 16 | dbutils.widgets.text(name='table_name', defaultValue='jacek_laskowski.my_table', label='Table Name') 17 | table_name_param = dbutils.widgets.get('table_name') 18 | 19 | # COMMAND ---------- 20 | 21 | print(f'Table name {table_name_param}') 22 | 23 | # COMMAND ---------- 24 | 25 | # MAGIC %md ## Execute Code (Scala) 26 | 27 | # COMMAND ---------- 28 | 29 | # MAGIC %scala 30 | # MAGIC 31 | # MAGIC val table_name_param = dbutils.widgets.get("table_name") 32 | 33 | # COMMAND ---------- 34 | 35 | # MAGIC %scala 36 | # MAGIC 37 | # MAGIC println(s"Table name: $table_name_param") 38 | 39 | # COMMAND ---------- 40 | 41 | # MAGIC %sql 42 | # MAGIC 43 | # MAGIC SHOW TABLES 44 | 45 | # COMMAND ---------- 46 | 47 | # MAGIC %md ## Exercise / Question 48 | # MAGIC 49 | # MAGIC Change (find the way how to do it) the default schema to be `main`. 50 | 51 | # COMMAND ---------- 52 | 53 | # MAGIC %sql 54 | # MAGIC 55 | # MAGIC CREATE SCHEMA IF NOT EXISTS jacek_laskowski 56 | 57 | # COMMAND ---------- 58 | 59 | # MAGIC %sql 60 | # MAGIC 61 | # MAGIC CREATE TABLE IF NOT EXISTS ${table_name} ( 62 | # MAGIC id LONG, 63 | # MAGIC name STRING 64 | # MAGIC ) 65 | # MAGIC USING delta 66 | 67 | # COMMAND ---------- 68 | 69 | # MAGIC %sql 70 | # MAGIC 71 | # MAGIC SHOW TABLES IN jacek_laskowski 72 | 73 | # COMMAND ---------- 74 | 75 | # MAGIC %sql 76 | # MAGIC 77 | # MAGIC SELECT * FROM ${table_name} 78 | -------------------------------------------------------------------------------- /Photon.py: -------------------------------------------------------------------------------- 1 | # Databricks notebook source 2 | # MAGIC %md # Photon 3 | # MAGIC 4 | # MAGIC This notebook is a list of _things_ (articles, talks, demos, etc.) to learn Photon from. 5 | 6 | # COMMAND ---------- 7 | 8 | # MAGIC %md 9 | # MAGIC 10 | # MAGIC ## Slogans 11 | # MAGIC 12 | # MAGIC [Slogan](https://dictionary.cambridge.org/dictionary/english/slogan) is a _a short and striking or memorable phrase used in advertising._ so it makes much sense to learn Photon by how the people behind this product want us to remember it. 13 | # MAGIC 14 | # MAGIC * [The next generation engine for the Lakehouse](https://www.databricks.com/product/photon) 15 | # MAGIC * [Photon: A High-Performance Query Engine for the Lakehouse](https://www.cidrdb.org/cidr2022/papers/a100-behm.pdf) 16 | # MAGIC * [Photon: A Fast Query Engine for Lakehouse Systems](https://cs.stanford.edu/~matei/papers/2022/sigmod_photon.pdf) 17 | 18 | # COMMAND ---------- 19 | 20 | # MAGIC %md ## Learn More 21 | # MAGIC 22 | # MAGIC ### Reading 23 | # MAGIC 24 | # MAGIC 1. [Photon: A Fast Query Engine for Lakehouse Systems](https://cs.stanford.edu/~matei/papers/2022/sigmod_photon.pdf) (_Photon whitepaper_) 25 | # MAGIC 1. [Photon: A High-Performance Query Engine for the Lakehouse](https://www.cidrdb.org/cidr2022/papers/a100-behm.pdf) 26 | # MAGIC 27 | # MAGIC ### Watching 28 | # MAGIC 29 | # MAGIC 1. 
[Photon for Dummies: How Does this New Execution Engine Actually Work?](https://www.databricks.com/dataaisummit/session/photon-dummies-how-does-new-execution-engine-actually-work/) 30 | # MAGIC 1. [Advancing Spark - The Photon Whitepaper](https://youtu.be/hxvQxI4FksY) 31 | # MAGIC 1. [Photon Technical Deep Dive: How to Think Vectorized](https://youtu.be/pNn5W4ujP3w) 32 | 33 | # COMMAND ---------- 34 | 35 | 36 | -------------------------------------------------------------------------------- /Databricks Asset Bundles/delta_live_tables_demo/resources/delta_live_tables_demo_job.yml: -------------------------------------------------------------------------------- 1 | # The main job for delta_live_tables_demo 2 | resources: 3 | jobs: 4 | delta_live_tables_demo_job: 5 | name: delta_live_tables_demo_job 6 | 7 | schedule: 8 | quartz_cron_expression: '44 37 8 * * ?' 9 | timezone_id: Europe/Amsterdam 10 | 11 | email_notifications: 12 | on_failure: 13 | - jacek@japila.pl 14 | 15 | tasks: 16 | - task_key: notebook_task 17 | job_cluster_key: job_cluster 18 | notebook_task: 19 | notebook_path: ../src/notebook.ipynb 20 | 21 | - task_key: refresh_pipeline 22 | depends_on: 23 | - task_key: notebook_task 24 | pipeline_task: 25 | pipeline_id: ${resources.pipelines.delta_live_tables_demo_pipeline.id} 26 | 27 | - task_key: main_task 28 | depends_on: 29 | - task_key: refresh_pipeline 30 | job_cluster_key: job_cluster 31 | python_wheel_task: 32 | package_name: delta_live_tables_demo 33 | entry_point: main 34 | libraries: 35 | # By default we just include the .whl file generated for the delta_live_tables_demo package. 36 | # See https://docs.databricks.com/dev-tools/bundles/library-dependencies.html 37 | # for more information on how to add other libraries. 38 | - whl: ../dist/*.whl 39 | 40 | job_clusters: 41 | - job_cluster_key: job_cluster 42 | new_cluster: 43 | spark_version: 13.3.x-scala2.12 44 | node_type_id: i3.xlarge 45 | autoscale: 46 | min_workers: 1 47 | max_workers: 4 48 | -------------------------------------------------------------------------------- /Delta Live Tables/Pipeline settings.py: -------------------------------------------------------------------------------- 1 | # Databricks notebook source 2 | # MAGIC %md # Pipeline settings 3 | # MAGIC 4 | # MAGIC Click the **Settings** button in the DLT UI 5 | # MAGIC 6 | # MAGIC There are two main UI settings views: 7 | # MAGIC 8 | # MAGIC 1. **UI** - a human-friendly view 9 | # MAGIC 1. **JSON** 10 | # MAGIC 11 | # MAGIC There is one extra available under the three-dots menu: 12 | # MAGIC 13 | # MAGIC 1. **Pipeline settings YAML** that can be used with Databricks Asset Bundles to source control and apply CI/CD to pipelines. 14 | 15 | # COMMAND ---------- 16 | 17 | # MAGIC %md 18 | # MAGIC ## General 19 | 20 | # COMMAND ---------- 21 | 22 | # MAGIC %md 23 | # MAGIC ## Source code 24 | # MAGIC 25 | # MAGIC ➡️ [Configure source code libraries](https://docs.databricks.com/en/delta-live-tables/settings.html#select-a-cluster-policy) 26 | # MAGIC 27 | # MAGIC Use the file selector in the Delta Live Tables UI to configure the source code defining your pipeline. 28 | # MAGIC 29 | # MAGIC Pipeline source code is defined in Databricks notebooks or in SQL or Python scripts stored in workspace files. 30 | # MAGIC 31 | # MAGIC You can add one or more notebooks or workspace files or a combination of notebooks and workspace files. 
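In a Databricks Asset Bundle, the equivalent source code configuration can be kept under version control as a pipeline resource. A minimal sketch is shown below; the resource key and the notebook/file paths are placeholders for illustration, not the settings of an actual pipeline in this repo:

```yaml
resources:
  pipelines:
    my_dlt_pipeline:
      name: my_dlt_pipeline
      libraries:
        - notebook:
            path: ../src/dlt_pipeline.ipynb
        - file:
            path: ../src/bronze_table.sql
```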
32 | 33 | # COMMAND ---------- 34 | 35 | # MAGIC %md 36 | # MAGIC 37 | # MAGIC ``` 38 | # MAGIC "libraries": [ 39 | # MAGIC { 40 | # MAGIC "notebook": { 41 | # MAGIC "path": "/Repos/jacek@japila.pl/learn-databricks/Delta Live Tables/delta-live-tables-bundle/five_record_table" 42 | # MAGIC } 43 | # MAGIC }, 44 | # MAGIC { 45 | # MAGIC "file": { 46 | # MAGIC "path": "/Repos/jacek@japila.pl/learn-databricks/Delta Live Tables/delta-live-tables-bundle/bronze_table.sql" 47 | # MAGIC } 48 | # MAGIC } 49 | # MAGIC ], 50 | # MAGIC ``` 51 | -------------------------------------------------------------------------------- /demo/uv_workflow/src/notebook.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": { 6 | "application/vnd.databricks.v1+cell": { 7 | "cellMetadata": {}, 8 | "inputWidgets": {}, 9 | "nuid": "ee353e42-ff58-4955-9608-12865bd0950e", 10 | "showTitle": false, 11 | "title": "" 12 | } 13 | }, 14 | "source": [ 15 | "# Default notebook\n", 16 | "\n", 17 | "This default notebook is executed using Databricks Workflows as defined in resources/uv_workflow.job.yml." 18 | ] 19 | }, 20 | { 21 | "cell_type": "code", 22 | "execution_count": 2, 23 | "metadata": {}, 24 | "outputs": [], 25 | "source": [ 26 | "%load_ext autoreload\n", 27 | "%autoreload 2" 28 | ] 29 | }, 30 | { 31 | "cell_type": "code", 32 | "execution_count": 0, 33 | "metadata": { 34 | "application/vnd.databricks.v1+cell": { 35 | "cellMetadata": { 36 | "byteLimit": 2048000, 37 | "rowLimit": 10000 38 | }, 39 | "inputWidgets": {}, 40 | "nuid": "6bca260b-13d1-448f-8082-30b60a85c9ae", 41 | "showTitle": false, 42 | "title": "" 43 | } 44 | }, 45 | "outputs": [], 46 | "source": [ 47 | "from uv_workflow import main\n", 48 | "\n", 49 | "main.get_taxis(spark).show(10)" 50 | ] 51 | } 52 | ], 53 | "metadata": { 54 | "application/vnd.databricks.v1+notebook": { 55 | "dashboards": [], 56 | "language": "python", 57 | "notebookMetadata": { 58 | "pythonIndentUnit": 2 59 | }, 60 | "notebookName": "notebook", 61 | "widgets": {} 62 | }, 63 | "kernelspec": { 64 | "display_name": "Python 3", 65 | "language": "python", 66 | "name": "python3" 67 | }, 68 | "language_info": { 69 | "name": "python", 70 | "version": "3.11.4" 71 | } 72 | }, 73 | "nbformat": 4, 74 | "nbformat_minor": 0 75 | } 76 | -------------------------------------------------------------------------------- /demo/pydantic_workflow/src/notebook.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": { 6 | "application/vnd.databricks.v1+cell": { 7 | "cellMetadata": {}, 8 | "inputWidgets": {}, 9 | "nuid": "ee353e42-ff58-4955-9608-12865bd0950e", 10 | "showTitle": false, 11 | "title": "" 12 | } 13 | }, 14 | "source": [ 15 | "# Default notebook\n", 16 | "\n", 17 | "This default notebook is executed using Databricks Workflows as defined in resources/pydantic_workflow.job.yml." 
18 | ] 19 | }, 20 | { 21 | "cell_type": "code", 22 | "execution_count": 2, 23 | "metadata": {}, 24 | "outputs": [], 25 | "source": [ 26 | "%load_ext autoreload\n", 27 | "%autoreload 2" 28 | ] 29 | }, 30 | { 31 | "cell_type": "code", 32 | "execution_count": 0, 33 | "metadata": { 34 | "application/vnd.databricks.v1+cell": { 35 | "cellMetadata": { 36 | "byteLimit": 2048000, 37 | "rowLimit": 10000 38 | }, 39 | "inputWidgets": {}, 40 | "nuid": "6bca260b-13d1-448f-8082-30b60a85c9ae", 41 | "showTitle": false, 42 | "title": "" 43 | } 44 | }, 45 | "outputs": [], 46 | "source": [ 47 | "from pydantic_workflow import main\n", 48 | "\n", 49 | "main.get_taxis(spark).show(10)" 50 | ] 51 | } 52 | ], 53 | "metadata": { 54 | "application/vnd.databricks.v1+notebook": { 55 | "dashboards": [], 56 | "language": "python", 57 | "notebookMetadata": { 58 | "pythonIndentUnit": 2 59 | }, 60 | "notebookName": "notebook", 61 | "widgets": {} 62 | }, 63 | "kernelspec": { 64 | "display_name": "Python 3", 65 | "language": "python", 66 | "name": "python3" 67 | }, 68 | "language_info": { 69 | "name": "python", 70 | "version": "3.11.4" 71 | } 72 | }, 73 | "nbformat": 4, 74 | "nbformat_minor": 0 75 | } 76 | -------------------------------------------------------------------------------- /Delta Lake/Delta Lake 3.1.0.sql: -------------------------------------------------------------------------------- 1 | -- Databricks notebook source 2 | -- MAGIC %md # Delta Lake 3.1.0 3 | -- MAGIC 4 | -- MAGIC [DeltaLake 3.1.0 RC3](https://github.com/delta-io/delta/releases/tag/v3.1.0rc3) just hit the shelves! 🚀 5 | -- MAGIC 6 | -- MAGIC Learn more in the [LinkedIn post](https://www.linkedin.com/feed/update/urn:li:activity:7157783263820861441?updateEntityUrn=urn%3Ali%3Afs_updateV2%3A%28urn%3Ali%3Aactivity%3A7157783263820861441%2CFEED_DETAIL%2CEMPTY%2CDEFAULT%2Cfalse%29), too. 7 | 8 | -- COMMAND ---------- 9 | 10 | -- MAGIC %md ## Auto Compaction 11 | -- MAGIC 12 | -- MAGIC **Auto compaction** to address the small files problem during table writes. Auto compaction which runs at the end of the write query combines small files within partitions to large files to reduce the metadata size and improve query performance. 13 | -- MAGIC 14 | -- MAGIC ### Learn More 15 | -- MAGIC 16 | -- MAGIC 1. [The official documentation of Delta Lake](https://docs.delta.io/3.1.0/optimizations-oss.html#auto-compaction) 17 | -- MAGIC 1. [The Internals of Delta Lake](https://books.japila.pl/delta-lake-internals/auto-compaction/) 18 | 19 | -- COMMAND ---------- 20 | 21 | -- MAGIC %md ## Liquid Clustering 22 | -- MAGIC 23 | -- MAGIC That's really really huge! 🔥 It is still marked as Experimental, but at least let people have a peek under the hood at how it really works. 24 | -- MAGIC 25 | -- MAGIC From the [announcement](https://github.com/delta-io/delta/releases/tag/v3.1.0rc3): 26 | -- MAGIC 27 | -- MAGIC > (Experimental) Liquid clustering for better table layout Now Delta allows clustering the data in a Delta table for better data skipping. Currently this is an experimental feature. See [documentation](https://docs.delta.io/3.1.0/delta-clustering.html) and [example](https://github.com/delta-io/delta/blob/branch-3.1/examples/scala/src/main/scala/example/Clustering.scala) for how to try out this feature. 
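A minimal way to try liquid clustering out in SQL could look like the following sketch (the table and column names are made up; in Delta Lake 3.1 the data is physically clustered when `OPTIMIZE` runs on the clustered table):

```sql
CREATE TABLE events (
  event_id BIGINT,
  event_time TIMESTAMP,
  city STRING)
USING delta
CLUSTER BY (city);

INSERT INTO events VALUES (0, current_timestamp(), 'Warsaw');

OPTIMIZE events;
```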
28 | -------------------------------------------------------------------------------- /demo/delta-live-tables/README.md: -------------------------------------------------------------------------------- 1 | # Delta Live Tables Pipeline Demo 2 | 3 | ```console 4 | $ tfi 5 | ... 6 | Terraform has been successfully initialized! 7 | ``` 8 | 9 | ```console 10 | tfa -auto-approve 11 | ``` 12 | 13 | Check out the pipeline. This step is completely optional. 14 | 15 | ```console 16 | $ databricks pipelines list | jq '.[] | { name, pipeline_id }' 17 | { 18 | "name": "EXPECT Clause Demo", 19 | "pipeline_id": "a02952e6-7197-44a4-a072-5ea5124d7bce" 20 | } 21 | ``` 22 | 23 | **IMPORTANT** Every push to the repo is not reflected (`git pull`) by the repo after `tfa` so you have to `tfd`. 24 | 25 | Run the pipeline. 26 | 27 | ```console 28 | databricks pipelines start --pipeline-id $(tfo -raw pipeline_id) 29 | ``` 30 | 31 | Wait until the pipeline finishes (until `IDLE` comes up from the following command). 32 | 33 | ```console 34 | while (true) 35 | do 36 | state=$(databricks pipelines get --pipeline-id $(tfo -raw pipeline_id) | jq --raw-output '.state') 37 | if [[ $state =~ "IDLE" ]]; then 38 | echo "Pipeline stopped (state: $state)" 39 | break; 40 | fi 41 | echo "Waiting for the pipeline to stop (state: $state)" 42 | sleep 5 43 | done 44 | ``` 45 | 46 | Switch to the DLT UI. Select (_click_) the `raw_streaming_table` streaming live table and review the **Data quality** section. 47 | 48 | Upload data again and re-run the pipeline. 49 | 50 | ```console 51 | databricks fs cp input-data/2.csv dbfs:$(tfo -raw input_dir) 52 | ``` 53 | 54 | ```console 55 | databricks pipelines start --pipeline-id $(tfo -raw pipeline_id) 56 | ``` 57 | 58 | Review the events delta table (use **Data Quality Checks** cell in [Storage location](../../Delta%20Live%20Tables/Storage%20location.sql) notebook). 59 | 60 | ## Clean Up 61 | 62 | ```console 63 | tfd -auto-approve 64 | ``` 65 | 66 | ```console 67 | databricks fs rm -r dbfs:/FileStore/jacek_laskowski/delta-live-tables-demo-input 68 | ``` 69 | -------------------------------------------------------------------------------- /Databricks Asset Bundles/delta_live_tables_demo/my_project/databricks.yml: -------------------------------------------------------------------------------- 1 | # This is a Databricks asset bundle definition for my_project. 2 | # See https://docs.databricks.com/dev-tools/bundles/index.html for documentation. 3 | bundle: 4 | name: my_project 5 | 6 | include: 7 | - resources/*.yml 8 | 9 | targets: 10 | # The 'dev' target, used for development purposes. 11 | # Whenever a developer deploys using 'dev', they get their own copy. 12 | dev: 13 | # We use 'mode: development' to make sure everything deployed to this target gets a prefix 14 | # like '[dev my_user_name]'. Setting this mode also disables any schedules and 15 | # automatic triggers for jobs and enables the 'development' mode for Delta Live Tables pipelines. 16 | mode: development 17 | default: true 18 | workspace: 19 | host: https://training-partners.cloud.databricks.com 20 | 21 | # Optionally, there could be a 'staging' target here. 22 | # (See Databricks docs on CI/CD at https://docs.databricks.com/dev-tools/bundles/index.html.) 23 | # 24 | # staging: 25 | # workspace: 26 | # host: https://training-partners.cloud.databricks.com 27 | 28 | # The 'prod' target, used for production deployment. 
29 | prod: 30 | # For production deployments, we only have a single copy, so we override the 31 | # workspace.root_path default of 32 | # /Users/${workspace.current_user.userName}/.bundle/${bundle.target}/${bundle.name} 33 | # to a path that is not specific to the current user. 34 | mode: production 35 | workspace: 36 | host: https://training-partners.cloud.databricks.com 37 | root_path: /Shared/.bundle/prod/${bundle.name} 38 | run_as: 39 | # This runs as jacek@japila.pl in production. Alternatively, 40 | # a service principal could be used here using service_principal_name 41 | # (see Databricks documentation). 42 | user_name: jacek@japila.pl 43 | -------------------------------------------------------------------------------- /Generative AI/Llama.py: -------------------------------------------------------------------------------- 1 | # Databricks notebook source 2 | # MAGIC %md # Meta Llama 3 | 4 | # COMMAND ---------- 5 | 6 | # MAGIC %md 7 | # MAGIC 8 | # MAGIC ## Introduction 9 | # MAGIC 10 | # MAGIC * State-of-the-art open source language model 11 | # MAGIC * Available on Databricks using [Foundation Model APIs]($./Foundation Models) 12 | 13 | # COMMAND ---------- 14 | 15 | # MAGIC %md 16 | # MAGIC 17 | # MAGIC ## Llama 3.1 18 | # MAGIC 19 | # MAGIC [Announcing the availability of Llama 3.1 models on the Databricks Data Intelligence Platform](https://www.databricks.com/blog/new-standard-open-source-ai-meta-llama-31-databricks): 20 | # MAGIC 21 | # MAGIC * Llama 3.1 series of open source language models 22 | # MAGIC * Meta Llama 3.1-8B-Instruct 23 | # MAGIC * Meta Llama 3.1-70B-Instruct 24 | # MAGIC * Meta Llama 3.1-405B-Instruct 25 | # MAGIC * New Llama 3.1 models available on Databricks 26 | # MAGIC * Unity Catalog's [system.ai](https://docs.databricks.com/en/generative-ai/pretrained-models.html) catalog 27 | # MAGIC * Served using [Mosaic AI Model Serving]($./Model Serving) 28 | # MAGIC * Used to build production-scale and high-quality GenAI applications 29 | # MAGIC * Databricks customers can use Mosaic AI to serve and fine-tune the Llama 3.1 models 30 | # MAGIC * Connect the models to [Retrieval Augmented Generation (RAG)]($./Retrieval Augmented Generation) and agentic systems 31 | # MAGIC * Synthetic data generation 32 | # MAGIC * Real-time batch inference 33 | # MAGIC * Leverage the models for scalable evaluation 34 | 35 | # COMMAND ---------- 36 | 37 | # MAGIC %md 38 | # MAGIC 39 | # MAGIC ## Get Started 40 | # MAGIC 41 | # MAGIC * Visit the [Mosaic AI Playground](https://docs.databricks.com/en/large-language-models/ai-playground.html) to quickly try Meta Llama 3.1 and other Foundation Models directly from your workspace 42 | # MAGIC * [Get started querying LLMs on Databricks](https://docs.databricks.com/en/large-language-models/llm-serving-intro.html) 43 | -------------------------------------------------------------------------------- /meetups/README.md: -------------------------------------------------------------------------------- 1 | # Meetups 2 | 3 | This directory contains the "agenda" notebooks of the past **Databricks Talks** series of the [Warsaw Data Engineering](https://www.meetup.com/warsaw-data-engineering/) meetup group. 4 | 5 | The meetup group uses [lu.ma](https://lu.ma/warsaw-data-engineering) for meetup announcements. 6 | 7 | 1. [Meetup next](./Meetup_next.ipynb) 8 | 1. [2025_12_11](./Meetup_2025_12_11.ipynb) Developing AI Agents with DSPy and MLflow in Databricks 9 | 1. [2025_10_30](./Meetup_2025_10_30.ipynb) Developing AI Programs with DSPy 10 | 1. 
[2025_10_23](./Meetup_2025_10_23.ipynb) Building Model Context Protocol (MCP) servers in Python cntd. 11 | 1. [2025_10_16](./Meetup_2025_10_16.ipynb) Building Model Context Protocol (MCP) servers in Python 12 | 1. [2025_08_21](./Meetup_2025_08_21.ipynb) Learn Python through functools module (and OpenAI's Python API) 13 | 1. [2025_06_26](./Meetup_2025_06_26.ipynb) MLflow 3.1 and Classic ML Models on Lakeflow Declarative Pipelines 14 | 1. [2025_06_12](./Meetup_2025_06_12.ipynb) Deploy and Query Models in Databricks 15 | 1. [2025_05_29](./Meetup_2025_05_29.ipynb) Model Lifecycle in Databricks Machine Learning 16 | 1. [2025_05_22](./Meetup_2025_05_22.ipynb) Databricks Machine Learning and MLflow Client API 17 | 1. [2025_05_15](./Meetup_2025_05_15.ipynb) Managed MLflow and Databricks Machine Learning 18 | 1. [2025_05_08](./Meetup_2025_05_08.ipynb) MLflow on Databricks 19 | 1. [2025_04_10](./Meetup_2025_04_10.ipynb) Learn MLflow from `mlflow/models/docker_utils.py` 20 | 1. [2025_03_27](./Meetup_2025_03_27.ipynb) MLflow (Local) Serving 21 | 1. [2025_03_13](./2025_03_13.ipynb) Intro to Delta Live Tables (DLT) 22 | 1. [2025_03_06](./2025_03_06.ipynb) Intro to MLflow (with uv) 23 | 1. [2025_02_20](./Meetup_2025_02_20.ipynb) Data Quality in Databricks Workflows (jobs) with Pydantic 24 | 1. [2025_02_06](./Meetup_2025_02_06.sql) Data Quality in Databricks Workflows with Pydantic cntd. 25 | 1. [2025_01_30](./Meetup_2025_01_30.sql) 26 | 1. [2025_01_09](./Meetup_2025_01_09.sql) 27 | -------------------------------------------------------------------------------- /Delta Live Tables/Full Refresh.sql: -------------------------------------------------------------------------------- 1 | -- Databricks notebook source 2 | -- MAGIC %md # Full Refresh 3 | -- MAGIC 4 | -- MAGIC Let's deep dive into **Full Refresh** feature of DLTs. 5 | -- MAGIC 6 | -- MAGIC You can trigger a full refresh of a DLT pipeline using: 7 | -- MAGIC 8 | -- MAGIC * The Pipelines UI under **Start > Full refresh all** 9 | -- MAGIC * Delta Live Tables CLI `databricks pipelines start --full-refresh` 10 | 11 | -- COMMAND ---------- 12 | 13 | -- MAGIC %md ## DESCRIBE HISTORY 14 | 15 | -- COMMAND ---------- 16 | 17 | USE jaceklaskowski_meetup 18 | 19 | -- COMMAND ---------- 20 | 21 | SHOW TABLES; 22 | 23 | -- COMMAND ---------- 24 | 25 | SELECT version, operation, operationParameters, readVersion, isolationLevel, isBlindAppend, operationMetrics, engineInfo 26 | FROM (DESCRIBE HISTORY my_streaming_table) 27 | 28 | -- COMMAND ---------- 29 | 30 | -- MAGIC %md ## Full Refresh All 31 | -- MAGIC 32 | -- MAGIC ```console 33 | -- MAGIC databricks pipelines start --full-refresh --pipeline-id 3a69ffe2-d42a-47b5-8731-84e7ffb3c844 34 | -- MAGIC ``` 35 | 36 | -- COMMAND ---------- 37 | 38 | SELECT * FROM my_streaming_table 39 | 40 | -- COMMAND ---------- 41 | 42 | -- MAGIC %md ## Demo 43 | -- MAGIC 44 | -- MAGIC This demo shows Full refresh all to fix a header issue with Auto Loader in a DLT pipeline. 45 | -- MAGIC 46 | -- MAGIC 1. Without `header` option, CSVs with headers are processed as if they had one record extra (the header) 47 | -- MAGIC 1. Once fixed and Full refresh all, the Streaming table should have proper records 48 | -- MAGIC 49 | -- MAGIC Use my_streaming_table notebook. 50 | 51 | -- COMMAND ---------- 52 | 53 | SHOW TABLES in jaceklaskowski_meetup 54 | 55 | -- COMMAND ---------- 56 | 57 | -- WRONG: Show all records, incl. 
headers 58 | select * from jaceklaskowski_meetup.my_streaming_table 59 | 60 | -- COMMAND ---------- 61 | 62 | -- CORRECT: Show all records with no headers this time 63 | select * from jaceklaskowski_meetup.my_streaming_table 64 | -------------------------------------------------------------------------------- /Databricks Asset Bundles/delta_live_tables_demo/databricks.yml: -------------------------------------------------------------------------------- 1 | # This is a Databricks asset bundle definition for delta_live_tables_demo. 2 | # See https://docs.databricks.com/dev-tools/bundles/index.html for documentation. 3 | bundle: 4 | name: delta_live_tables_demo 5 | 6 | include: 7 | - resources/*.yml 8 | 9 | targets: 10 | # The 'dev' target, used for development purposes. 11 | # Whenever a developer deploys using 'dev', they get their own copy. 12 | meetup: 13 | default: true 14 | # FIXME: A bug? 15 | # workspace: 16 | # profile: default 17 | dev: 18 | # We use 'mode: development' to make sure everything deployed to this target gets a prefix 19 | # like '[dev my_user_name]'. Setting this mode also disables any schedules and 20 | # automatic triggers for jobs and enables the 'development' mode for Delta Live Tables pipelines. 21 | mode: development 22 | workspace: 23 | host: https://training-partners.cloud.databricks.com 24 | 25 | # Optionally, there could be a 'staging' target here. 26 | # (See Databricks docs on CI/CD at https://docs.databricks.com/dev-tools/bundles/index.html.) 27 | # 28 | # staging: 29 | # workspace: 30 | # host: https://training-partners.cloud.databricks.com 31 | 32 | # The 'prod' target, used for production deployment. 33 | prod: 34 | # For production deployments, we only have a single copy, so we override the 35 | # workspace.root_path default of 36 | # /Users/${workspace.current_user.userName}/.bundle/${bundle.target}/${bundle.name} 37 | # to a path that is not specific to the current user. 38 | mode: production 39 | workspace: 40 | host: https://training-partners.cloud.databricks.com 41 | root_path: /Shared/.bundle/prod/${bundle.name} 42 | run_as: 43 | # This runs as jacek@japila.pl in production. Alternatively, 44 | # a service principal could be used here using service_principal_name 45 | # (see Databricks documentation). 46 | user_name: jacek@japila.pl 47 | -------------------------------------------------------------------------------- /Delta Live Tables/Materialization.sql: -------------------------------------------------------------------------------- 1 | -- Databricks notebook source 2 | -- MAGIC %md # Materialization 3 | 4 | -- COMMAND ---------- 5 | 6 | -- MAGIC %md ## Review Me 7 | -- MAGIC 8 | -- MAGIC 1. https://www.databricks.com/glossary/materialized-views 9 | -- MAGIC 1. https://docs.databricks.com/en/sql/user/materialized-views.html 10 | -- MAGIC 1. 
https://www.google.com/search?q=databricks+materialized+view 11 | 12 | -- COMMAND ---------- 13 | 14 | -- MAGIC %md ## CREATE TABLE 15 | -- MAGIC 16 | -- MAGIC [CREATE TABLE](https://docs.databricks.com/en/sql/language-manual/sql-ref-syntax-ddl-create-table-using.html) 17 | -- MAGIC 18 | -- MAGIC ```sql 19 | -- MAGIC { { [CREATE OR] REPLACE TABLE | CREATE [EXTERNAL] TABLE [ IF NOT EXISTS ] } 20 | -- MAGIC table_name 21 | -- MAGIC [ table_specification ] 22 | -- MAGIC [ USING data_source ] 23 | -- MAGIC [ table_clauses ] 24 | -- MAGIC [ AS query ] } 25 | -- MAGIC ``` 26 | 27 | -- COMMAND ---------- 28 | 29 | -- MAGIC %md ## AS query 30 | -- MAGIC 31 | -- MAGIC > This optional clause populates the table using the data from query. When you specify a query you must not also specify a table_specification. The table schema is derived from the query. 32 | -- MAGIC 33 | -- MAGIC > Note that Databricks overwrites the underlying data source with the data of the input query, to make sure the table gets created contains exactly the same data as the input query. 34 | -- MAGIC 35 | -- MAGIC 36 | 37 | -- COMMAND ---------- 38 | 39 | CREATE TABLE IF NOT EXISTS demo_table 40 | AS SELECT * FROM VALUES 1,2,3,4 41 | 42 | -- COMMAND ---------- 43 | 44 | DESCRIBE EXTENDED demo_table 45 | 46 | -- COMMAND ---------- 47 | 48 | -- MAGIC %md ## Materialized Views 49 | -- MAGIC 50 | -- MAGIC [Materialized Views](https://www.databricks.com/glossary/materialized-views): 51 | -- MAGIC 52 | -- MAGIC > A materialized view is a database object that stores the results of a query as a physical table. Unlike regular database views, which are virtual and derive their data from the underlying tables, materialized views contain precomputed data that is incrementally updated on a schedule or on demand. 53 | 54 | -- COMMAND ---------- 55 | 56 | 57 | -------------------------------------------------------------------------------- /Databricks Workflows/for_each_task_demo/README.md: -------------------------------------------------------------------------------- 1 | # For each Task Demo 2 | 3 | The 'for_each_task_demo' project was generated by using the default-python template of [Databricks Asset Bundles](https://docs.databricks.com/en/dev-tools/bundles/index.html). 4 | 5 | ## Run Demo 6 | 7 | Deploy the demo project first. 8 | 9 | ```bash 10 | databricks bundle deploy 11 | ``` 12 | 13 | Run the job. 14 | 15 | ```bash 16 | databricks bundle run for_each_task_demo_job 17 | ``` 18 | 19 | ## Getting started 20 | 21 | 1. Install the Databricks CLI from https://docs.databricks.com/dev-tools/cli/databricks-cli.html 22 | 23 | 2. Authenticate to your Databricks workspace, if you have not done so already: 24 | 25 | ```bash 26 | $ databricks configure 27 | ``` 28 | 29 | 3. To deploy a development copy of this project, type: 30 | ``` 31 | $ databricks bundle deploy --target dev 32 | ``` 33 | (Note that "dev" is the default target, so the `--target` parameter 34 | is optional here.) 35 | 36 | This deploys everything that's defined for this project. 37 | For example, the default template would deploy a job called 38 | `[dev yourname] for_each_task_demo_job` to your workspace. 39 | You can find that job by opening your workspace and clicking on **Workflows**. 40 | 41 | 4. Similarly, to deploy a production copy, type: 42 | ``` 43 | $ databricks bundle deploy --target prod 44 | ``` 45 | 46 | Note that the default job from the template has a schedule that runs every day 47 | (defined in resources/for_each_task_demo.job.yml).
The schedule 48 | is paused when deploying in development mode (see 49 | https://docs.databricks.com/dev-tools/bundles/deployment-modes.html). 50 | 51 | 5. To run a job or pipeline, use the "run" command: 52 | ``` 53 | $ databricks bundle run 54 | ``` 55 | 56 | 6. Optionally, install developer tools such as the Databricks extension for Visual Studio Code from 57 | https://docs.databricks.com/dev-tools/vscode-ext.html. 58 | 59 | 7. For documentation on the Databricks asset bundles format used 60 | for this project, and for CI/CD configuration, see 61 | https://docs.databricks.com/dev-tools/bundles/index.html. 62 | -------------------------------------------------------------------------------- /Python/Pyenv.py: -------------------------------------------------------------------------------- 1 | # Databricks notebook source 2 | # MAGIC %md # Pyenv 3 | # MAGIC 4 | # MAGIC [Simple Python Version Management: pyenv](https://github.com/pyenv/pyenv) 5 | 6 | # COMMAND ---------- 7 | 8 | # MAGIC %md ## Installation 9 | # MAGIC 10 | # MAGIC On macos: 11 | # MAGIC 12 | # MAGIC ```shell 13 | # MAGIC brew update 14 | # MAGIC brew install pyenv 15 | # MAGIC ``` 16 | # MAGIC 17 | # MAGIC or [Installation](https://github.com/pyenv/pyenv#installation) 18 | 19 | # COMMAND ---------- 20 | 21 | # MAGIC %md 22 | # MAGIC 23 | # MAGIC ``` 24 | # MAGIC $ pyenv --version 25 | # MAGIC pyenv 2.3.30 26 | # MAGIC ``` 27 | 28 | # COMMAND ---------- 29 | 30 | # MAGIC %md 31 | # MAGIC 32 | # MAGIC ```shell 33 | # MAGIC $ pyenv install 3.12 34 | # MAGIC python-build: use openssl@3 from homebrew 35 | # MAGIC python-build: use readline from homebrew 36 | # MAGIC Downloading Python-3.12.0.tar.xz... 37 | # MAGIC -> https://www.python.org/ftp/python/3.12.0/Python-3.12.0.tar.xz 38 | # MAGIC Installing Python-3.12.0... 39 | # MAGIC python-build: use tcl-tk from homebrew 40 | # MAGIC python-build: use readline from homebrew 41 | # MAGIC python-build: use ncurses from homebrew 42 | # MAGIC python-build: use zlib from xcode sdk 43 | # MAGIC Installed Python-3.12.0 to /Users/jacek/.pyenv/versions/3.12.0 44 | # MAGIC ``` 45 | # MAGIC 46 | # MAGIC ```shell 47 | # MAGIC $ pyenv virtualenv 3.12 databricks-cli 48 | # MAGIC ``` 49 | # MAGIC 50 | # MAGIC ```shell 51 | # MAGIC $ pyenv activate databricks-cli 52 | # MAGIC ``` 53 | # MAGIC 54 | # MAGIC ```shell 55 | # MAGIC $ python --version 56 | # MAGIC Python 3.12.0 57 | # MAGIC ``` 58 | # MAGIC 59 | # MAGIC ```shell 60 | # MAGIC $ pyenv deactivate 61 | # MAGIC ``` 62 | # MAGIC 63 | # MAGIC ```shell 64 | # MAGIC $ python --version 65 | # MAGIC Python 3.11.6 66 | # MAGIC ``` 67 | # MAGIC 68 | # MAGIC ```shell 69 | # MAGIC $ pyenv local databricks-cli 70 | # MAGIC ``` 71 | # MAGIC 72 | # MAGIC ```shell 73 | # MAGIC $ pyenv local 74 | # MAGIC ``` 75 | # MAGIC 76 | # MAGIC A special version name `system` means to use whatever Python is found on PATH after the shims PATH entry (in other words, whatever would be run if Pyenv shims weren't on PATH). 
77 | # MAGIC 78 | # MAGIC ```shell 79 | # MAGIC $ pyenv global 80 | # MAGIC system 81 | # MAGIC ``` 82 | -------------------------------------------------------------------------------- /Databricks Asset Bundles/delta_live_tables_demo/src/dlt_pipeline.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": { 6 | "application/vnd.databricks.v1+cell": { 7 | "cellMetadata": {}, 8 | "inputWidgets": {}, 9 | "nuid": "9a626959-61c8-4bba-84d2-2a4ecab1f7ec", 10 | "showTitle": false, 11 | "title": "" 12 | } 13 | }, 14 | "source": [ 15 | "# DLT pipeline\n", 16 | "\n", 17 | "This Delta Live Tables (DLT) definition is executed using a pipeline defined in resources/delta_live_tables_demo_pipeline.yml." 18 | ] 19 | }, 20 | { 21 | "cell_type": "code", 22 | "execution_count": 0, 23 | "metadata": { 24 | "application/vnd.databricks.v1+cell": { 25 | "cellMetadata": {}, 26 | "inputWidgets": {}, 27 | "nuid": "9198e987-5606-403d-9f6d-8f14e6a4017f", 28 | "showTitle": false, 29 | "title": "" 30 | } 31 | }, 32 | "outputs": [], 33 | "source": [ 34 | "# Import DLT and src/delta_live_tables_demo\n", 35 | "import dlt\n", 36 | "import sys\n", 37 | "from pyspark.sql.functions import expr\n", 38 | "from delta_live_tables_demo import main" 39 | ] 40 | }, 41 | { 42 | "cell_type": "code", 43 | "execution_count": 0, 44 | "metadata": { 45 | "application/vnd.databricks.v1+cell": { 46 | "cellMetadata": {}, 47 | "inputWidgets": {}, 48 | "nuid": "3fc19dba-61fd-4a89-8f8c-24fee63bfb14", 49 | "showTitle": false, 50 | "title": "" 51 | } 52 | }, 53 | "outputs": [], 54 | "source": [ 55 | "@dlt.view\n", 56 | "def taxi_raw():\n", 57 | " return main.get_taxis()\n", 58 | "\n", 59 | "@dlt.table\n", 60 | "def filtered_taxis():\n", 61 | " return dlt.read(\"taxi_raw\").filter(expr(\"fare_amount < 30\"))" 62 | ] 63 | } 64 | ], 65 | "metadata": { 66 | "application/vnd.databricks.v1+notebook": { 67 | "dashboards": [], 68 | "language": "python", 69 | "notebookMetadata": { 70 | "pythonIndentUnit": 2 71 | }, 72 | "notebookName": "dlt_pipeline", 73 | "widgets": {} 74 | }, 75 | "kernelspec": { 76 | "display_name": "Python 3", 77 | "language": "python", 78 | "name": "python3" 79 | }, 80 | "language_info": { 81 | "name": "python", 82 | "version": "3.11.4" 83 | } 84 | }, 85 | "nbformat": 4, 86 | "nbformat_minor": 0 87 | } 88 | -------------------------------------------------------------------------------- /review_me/python only please.py: -------------------------------------------------------------------------------- 1 | # Databricks notebook source 2 | # MAGIC %md # Create DLT Table (Live Table) using Python API 3 | 4 | # COMMAND ---------- 5 | 6 | dbutils.widgets.text(name='filename', defaultValue='dbfs:/FileStore/books.csv') 7 | filename = dbutils.widgets.get('filename') 8 | 9 | # COMMAND ---------- 10 | 11 | import dlt 12 | 13 | # COMMAND ---------- 14 | 15 | # MAGIC %py 16 | # MAGIC 17 | # MAGIC # A regular PySpark data loading pattern 18 | # MAGIC # dataframe = spark.read.format('csv').option('header', True).load('dbfs:/FileStore/books.csv') 19 | # MAGIC # display(dataframe) 20 | # MAGIC 21 | # MAGIC # What am I supposed to do with the two below 22 | # MAGIC # to create a DLT live table in Python? 
23 | # MAGIC 24 | # MAGIC # @dlt.table Decorator 25 | # MAGIC # The Python table and view functions must return a DataFrame 26 | # MAGIC 27 | # MAGIC from pyspark.sql import DataFrame 28 | # MAGIC 29 | # MAGIC # decorators beg for methods 30 | # MAGIC 31 | # MAGIC # A DLT data loading pattern 32 | # MAGIC 33 | # MAGIC @dlt.table(name='raw_books') 34 | # MAGIC def raw_load_csv() -> DataFrame: 35 | # MAGIC return spark.read.format('csv').option('header', True).load(filename) 36 | 37 | # COMMAND ---------- 38 | 39 | @dlt.table 40 | def silver_book_titles() -> DataFrame: 41 | # The following won't work as we renamed the table name using @dlt.table(name=...) 42 | # return spark.table('live.raw_load_csv').select('title') 43 | 44 | # This is how to access Spark property 45 | column_name = spark.conf.get('column_name') 46 | return spark.table('live.raw_books').select(column_name) 47 | 48 | # COMMAND ---------- 49 | 50 | from pyspark.sql import functions as F 51 | 52 | 53 | @dlt.table 54 | def golden_upper_titles() -> DataFrame: 55 | return spark.table('live.silver_book_titles').select(F.upper('title').alias('upper_title')) 56 | 57 | # COMMAND ---------- 58 | 59 | # MAGIC %sql 60 | # MAGIC 61 | # MAGIC CREATE LIVE TABLE sql_in_python 62 | # MAGIC AS SELECT * FROM range(0, 5) 63 | 64 | # COMMAND ---------- 65 | 66 | # MAGIC %sql 67 | # MAGIC 68 | # MAGIC show tables 69 | 70 | # COMMAND ---------- 71 | 72 | from pyspark.sql import functions as F 73 | spark.table('my_table').select(F.upper('name')).display() 74 | 75 | # COMMAND ---------- 76 | 77 | # MAGIC %sql insert into my_table values (0, 'ania') 78 | 79 | # COMMAND ---------- 80 | 81 | 82 | -------------------------------------------------------------------------------- /PySpark/PySpark.py: -------------------------------------------------------------------------------- 1 | # Databricks notebook source 2 | # MAGIC %md # PySpark 3 | 4 | # COMMAND ---------- 5 | 6 | # MAGIC %md # Create DataFrame From NumPy Array 7 | 8 | # COMMAND ---------- 9 | 10 | # MAGIC %pip install numpy matplotlib scipy 11 | 12 | # COMMAND ---------- 13 | 14 | dbutils.library.restartPython() 15 | 16 | # COMMAND ---------- 17 | 18 | # https://realpython.com/preview/numpy-random-normal/ 19 | import numpy as np 20 | rng = np.random.default_rng() 21 | numbers = rng.normal(size=10_000) 22 | nums = spark.createDataFrame(numbers) 23 | display(nums) 24 | 25 | # COMMAND ---------- 26 | 27 | # MAGIC %md # Standard Functions 28 | # MAGIC 29 | # MAGIC [pyspark.sql.functions](https://spark.apache.org/docs/latest/api/python/reference/pyspark.sql/functions.html) 30 | 31 | # COMMAND ---------- 32 | 33 | # MAGIC %md # Basic Aggregation with pandas UDFs 34 | 35 | # COMMAND ---------- 36 | 37 | import pandas as pd 38 | from pyspark.sql.functions import pandas_udf 39 | 40 | # COMMAND ---------- 41 | 42 | # MAGIC %md 43 | # MAGIC 44 | # MAGIC Learn more about [pandas.Series](https://pandas.pydata.org/docs/reference/api/pandas.Series.html). 
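# MAGIC
# MAGIC As a quick sketch (plain pandas, no Spark needed), this is the kind of element-wise `Series` arithmetic the pandas UDF below relies on — the sample values are made up:
# MAGIC
# MAGIC ```python
# MAGIC import pandas as pd
# MAGIC
# MAGIC vs = pd.Series([-0.0012, 0.0031, 1.25])
# MAGIC # abs -> scale -> round -> modulo, all element-wise, returning a new Series
# MAGIC print((vs.abs() * 1000).round() % 2)
# MAGIC ```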
45 | 46 | # COMMAND ---------- 47 | 48 | @pandas_udf(returnType = "long") 49 | def group_id(vs: pd.Series) -> pd.Series: 50 | return (vs.abs() * 1000).round() % 2 51 | 52 | # COMMAND ---------- 53 | 54 | with_gid = nums.withColumn("gid", group_id(nums.value)) 55 | display(with_gid) 56 | 57 | # COMMAND ---------- 58 | 59 | display(with_gid.groupby("gid").count()) 60 | 61 | # COMMAND ---------- 62 | 63 | @pandas_udf(returnType = "long") 64 | def my_count(s: pd.Series) -> 'long': 65 | return pd.Series(s.count()) 66 | 67 | # COMMAND ---------- 68 | 69 | grouped_nums = with_gid.groupBy("gid") 70 | count_by_gid_agg = my_count("gid").alias("count") 71 | counts_by_gid = grouped_nums.agg(count_by_gid_agg) 72 | 73 | # COMMAND ---------- 74 | 75 | display(counts_by_gid) 76 | 77 | # COMMAND ---------- 78 | 79 | # MAGIC %md # DataFrame Partitions 80 | # MAGIC 81 | # MAGIC [pyspark.sql.functions.spark_partition_id](https://spark.apache.org/docs/latest/api/python/reference/pyspark.sql/api/pyspark.sql.functions.spark_partition_id.html#pyspark.sql.functions.spark_partition_id) 82 | 83 | # COMMAND ---------- 84 | 85 | from pyspark.sql.functions import spark_partition_id 86 | 87 | display(counts_by_gid.withColumn("spark_partition_id", spark_partition_id())) 88 | -------------------------------------------------------------------------------- /Generative AI/llm-rag-chatbot/config.py: -------------------------------------------------------------------------------- 1 | # Databricks notebook source 2 | # MAGIC %md 3 | # MAGIC ## Configuration file 4 | # MAGIC 5 | # MAGIC Please change your catalog and schema here to run the demo on a different catalog. 6 | # MAGIC 7 | # MAGIC 8 | # MAGIC 9 | 10 | # COMMAND ---------- 11 | 12 | VECTOR_SEARCH_ENDPOINT_NAME="dbdemos_vs_endpoint" 13 | 14 | DATABRICKS_SITEMAP_URL = "https://docs.databricks.com/en/doc-sitemap.xml" 15 | 16 | catalog = "main" 17 | 18 | #email = spark.sql('select current_user() as user').collect()[0]['user'] 19 | #username = email.split('@')[0].replace('.', '_') 20 | #dbName = db = f"rag_chatbot_{username}" 21 | dbName = db = "rag_chatbot" 22 | 23 | # COMMAND ---------- 24 | 25 | # MAGIC %md 26 | # MAGIC ### License 27 | # MAGIC This demo installs the following external libraries on top of DBR(ML): 28 | # MAGIC 29 | # MAGIC 30 | # MAGIC | Library | License | 31 | # MAGIC |---------|---------| 32 | # MAGIC | langchain | [MIT](https://github.com/langchain-ai/langchain/blob/master/LICENSE) | 33 | # MAGIC | lxml | [BSD-3](https://pypi.org/project/lxml/) | 34 | # MAGIC | transformers | [Apache 2.0](https://github.com/huggingface/transformers/blob/main/LICENSE) | 35 | # MAGIC | unstructured | [Apache 2.0](https://github.com/Unstructured-IO/unstructured/blob/main/LICENSE.md) | 36 | # MAGIC | llama-index | [MIT](https://github.com/run-llama/llama_index/blob/main/LICENSE) | 37 | # MAGIC | tesseract | [Apache 2.0](https://github.com/tesseract-ocr/tesseract/blob/main/LICENSE) | 38 | # MAGIC | poppler-utils | [MIT](https://github.com/skmetaly/poppler-utils/blob/master/LICENSE) | 39 | # MAGIC | textstat | [MIT](https://pypi.org/project/textstat/) | 40 | # MAGIC | tiktoken | [MIT](https://github.com/openai/tiktoken/blob/main/LICENSE) | 41 | # MAGIC | evaluate | [Apache2](https://pypi.org/project/evaluate/) | 42 | # MAGIC | torch | [BSD-3](https://github.com/intel/torch/blob/master/LICENSE.md) | 43 | # MAGIC | tiktoken | [MIT](https://github.com/openai/tiktoken/blob/main/LICENSE) | 44 | # MAGIC 45 | # MAGIC 46 | # MAGIC 47 | # MAGIC 48 |
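# MAGIC
# MAGIC A sketch of how a demo notebook might consume these values after running this config notebook with `%run` (how `catalog` and `dbName` are combined below is an assumption for illustration):
# MAGIC
# MAGIC ```python
# MAGIC # After %run ./config, the variables defined above are in scope
# MAGIC spark.sql(f"CREATE SCHEMA IF NOT EXISTS {catalog}.{dbName}")
# MAGIC spark.sql(f"USE {catalog}.{dbName}")
# MAGIC print(f"Using {catalog}.{dbName} with Vector Search endpoint {VECTOR_SEARCH_ENDPOINT_NAME}")
# MAGIC ```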
-------------------------------------------------------------------------------- /Apache Spark/Parameterized Queries.sql: -------------------------------------------------------------------------------- 1 | -- Databricks notebook source 2 | -- MAGIC %md 3 | -- MAGIC # Parameterized Queries 4 | -- MAGIC 5 | -- MAGIC [The Internals of Spark SQL](https://books.japila.pl/spark-sql-internals/parameterized-queries/) 6 | 7 | -- COMMAND ---------- 8 | 9 | -- MAGIC %md 10 | -- MAGIC 11 | -- MAGIC ## Parameter markers 12 | -- MAGIC 13 | -- MAGIC [Parameter markers](https://docs.databricks.com/en/sql/language-manual/sql-ref-parameter-marker.html) 14 | 15 | -- COMMAND ---------- 16 | 17 | -- MAGIC %md 18 | -- MAGIC 19 | -- MAGIC The following parameterized query does not seem to work in Databricks (as I hoped) and fails with the exception: 20 | -- MAGIC 21 | -- MAGIC > org.apache.spark.sql.catalyst.ExtendedAnalysisException: [UNBOUND_SQL_PARAMETER] Found the unbound parameter: limitA. Please, fix `args` and provide a mapping of the parameter to either a SQL literal or collection constructor functions such as `map()`, `array()`, `struct()`. SQLSTATE: 42P02; line 4 pos 6; 22 | -- MAGIC 23 | -- MAGIC ```sql 24 | -- MAGIC WITH a AS (SELECT 1 c) 25 | -- MAGIC SELECT * 26 | -- MAGIC FROM a 27 | -- MAGIC LIMIT : limitA 28 | -- MAGIC ``` 29 | 30 | -- COMMAND ---------- 31 | 32 | -- MAGIC %md 33 | -- MAGIC 34 | -- MAGIC ## DECLARE VARIABLE 35 | -- MAGIC 36 | -- MAGIC [DECLARE VARIABLE](https://docs.databricks.com/en/sql/language-manual/sql-ref-syntax-ddl-declare-variable.html) 37 | 38 | -- COMMAND ---------- 39 | 40 | DECLARE OR REPLACE VARIABLE limitA INT DEFAULT 5; 41 | 42 | -- COMMAND ---------- 43 | 44 | -- MAGIC %md 45 | -- MAGIC 46 | -- MAGIC ## SET VARIABLE 47 | -- MAGIC 48 | -- MAGIC [SET VARIABLE](https://docs.databricks.com/en/sql/language-manual/sql-ref-syntax-aux-set-variable.html) 49 | 50 | -- COMMAND ---------- 51 | 52 | SET VARIABLE limitA=10 53 | 54 | -- COMMAND ---------- 55 | 56 | -- MAGIC %md 57 | -- MAGIC 58 | -- MAGIC ## EXECUTE IMMEDIATE 59 | -- MAGIC 60 | -- MAGIC [EXECUTE IMMEDIATE](https://docs.databricks.com/en/sql/language-manual/sql-ref-syntax-aux-execute-immediate.html) 61 | 62 | -- COMMAND ---------- 63 | 64 | DECLARE OR REPLACE sqlStr = 'WITH a AS (SELECT "It works! 🔥" result) 65 | SELECT * 66 | FROM a 67 | LIMIT :limitA'; 68 | 69 | -- COMMAND ---------- 70 | 71 | DECLARE OR REPLACE limitA = 5; 72 | 73 | -- COMMAND ---------- 74 | 75 | EXECUTE IMMEDIATE sqlStr USING (limitA AS limitA); 76 | 77 | -- COMMAND ---------- 78 | 79 | -- MAGIC %md 80 | -- MAGIC 81 | -- MAGIC ## Learn More 82 | -- MAGIC 83 | -- MAGIC 1. 
[Parameterized queries with PySpark](https://www.databricks.com/blog/parameterized-queries-pyspark) 84 | -------------------------------------------------------------------------------- /Generative AI/Databricks Mosaic AI.sql: -------------------------------------------------------------------------------- 1 | -- Databricks notebook source 2 | -- MAGIC %md # Databricks Mosaic AI 3 | 4 | -- COMMAND ---------- 5 | 6 | -- MAGIC %md 7 | -- MAGIC 8 | -- MAGIC [Databricks Mosaic AI](https://www.databricks.com/product/machine-learning) 9 | -- MAGIC 10 | -- MAGIC > Build and deploy production-quality ML and GenAI applications 11 | 12 | -- COMMAND ---------- 13 | 14 | -- MAGIC %md 15 | -- MAGIC 16 | -- MAGIC Provides unified tooling to build, deploy and monitor AI and ML solutions — from building predictive models to the latest GenAI and large language models (LLMs) 17 | 18 | -- COMMAND ---------- 19 | 20 | -- MAGIC %md ## Databricks + MosaicML 21 | -- MAGIC 22 | -- MAGIC [Databricks + MosaicML](https://www.databricks.com/blog/databricks-mosaicml) 23 | -- MAGIC 24 | -- MAGIC * Databricks acquired MosaicML 25 | -- MAGIC * A leading platform for creating and customizing generative AI models for enterprises 26 | -- MAGIC * They keep using the term so I'm gonna repeat the full sentence: _"Democratize data and AI for every enterprise"_ 27 | -- MAGIC * to provide the best-in-class experience for training, customizing, and deploying generative AI applications 28 | -- MAGIC * The three most important developments required to move generative AI into the mainstream for enterprises 29 | -- MAGIC * models are widely available to every company 30 | -- MAGIC * reduce the price of training and customizing large language models 31 | -- MAGIC * training and serving costs 32 | -- MAGIC * Open LLMs: popular [MPT-7B](https://huggingface.co/mosaicml/mpt-7b) and [MPT-30B](https://huggingface.co/mosaicml/mpt-30b) base LLMs 33 | -- MAGIC * AI applications with reasoning abilities and language-based interfaces 34 | -- MAGIC * incorporate the large volumes of custom data: information about business processes, customers, accounts, orders, or other aspects of their business 35 | -- MAGIC * Data privacy and safety 36 | -- MAGIC * little tolerance for hallucinations or incorrect responses 37 | -- MAGIC * deploy safe, secure, and effective AI applications 38 | -- MAGIC * Unifying the AI and data stack 39 | -- MAGIC * model development life cycle 40 | -- MAGIC * Databricks to continue to put data at the center of the AI journey 41 | -- MAGIC * upstream data preparation like cleaning, featurization, and embedding of data for use in models 42 | -- MAGIC * the data and models must be jointly curated 43 | -- MAGIC * efficiently build large AI models on their own data and business processes 44 | -- MAGIC * MosaicML within the Lakehouse AI Platform 45 | -------------------------------------------------------------------------------- /meetups/Meetup_2025_01_30.sql: -------------------------------------------------------------------------------- 1 | -- Databricks notebook source 2 | -- MAGIC %md # Data Quality in Databricks Workflows with Pydantic 3 | -- MAGIC 4 | -- MAGIC ➡️ [Meetup Announcement](https://www.meetup.com/warsaw-data-engineering/events/305877678/) 5 | -- MAGIC 6 | -- MAGIC Agenda: 7 | -- MAGIC 8 | -- MAGIC 1. 5 minutes of warm-up for loose ideas about this and future meetups 9 | -- MAGIC * News (new versions, new features, etc.) 10 | -- MAGIC 1.
A 50-minute live coding session, in which: 11 | -- MAGIC * You will create a new project for a Python library with Pydantic (hello world, etc.) and the one and only uv for project management 12 | -- MAGIC * You will create a Databricks job with a notebook using our Python project above (everything by hand / clicking through the UI / fully manual) 13 | -- MAGIC * Automation with Databricks Asset Bundles (DAB) 14 | -- MAGIC 1. 5 minutes of Q&A / collecting ideas for future editions 15 | 16 | -- COMMAND ---------- 17 | 18 | -- MAGIC %md 19 | -- MAGIC 20 | -- MAGIC ## 🌟 Praise Quote 🌟 21 | -- MAGIC 22 | -- MAGIC > What caught your interest in the Warsaw Data Engineering Meetup that made you decide to join? 23 | -- MAGIC 24 | -- MAGIC > I love studying everything in detail. 25 | -- MAGIC > I'd like to learn more about Apache Spark. 26 | -- MAGIC > I read a lot of articles by Jacek Laskowski and have started reading books on Spark internals. 27 | 28 | -- COMMAND ---------- 29 | 30 | -- MAGIC %md # 📢 News 31 | 32 | -- COMMAND ---------- 33 | 34 | -- MAGIC %md 35 | -- MAGIC 36 | -- MAGIC ## New Versions 37 | -- MAGIC 38 | -- MAGIC * [uv 0.5.25](https://github.com/astral-sh/uv/releases/tag/0.5.25) 39 | -- MAGIC * [Databricks CLI 0.240.0](https://github.com/databricks/cli/releases/tag/v0.240.0) 40 | -- MAGIC * [awscli 2.23.9](https://github.com/aws/aws-cli/releases/tag/2.23.9) 41 | 42 | -- COMMAND ---------- 43 | 44 | -- MAGIC %md 45 | -- MAGIC 46 | -- MAGIC ## Databricks Notebook UI 47 | -- MAGIC 48 | -- MAGIC [Databricks notebook interface and controls](https://docs.databricks.com/en/notebooks/notebook-ui.html) 49 | -- MAGIC 50 | -- MAGIC **Cmd + Shift + P** for [Command palette](https://docs.databricks.com/en/notebooks/notebook-editor.html) with the following: 51 | -- MAGIC 52 | -- MAGIC 1. [Multicursor support](https://docs.databricks.com/en/notebooks/notebook-editor.html#multicursor-support) 🥳 53 | -- MAGIC 1. [Use web terminal and Databricks CLI](https://docs.databricks.com/en/notebooks/notebook-editor.html#use-web-terminal-and-databricks-cli) 🤔 54 | -- MAGIC 1.
Duplicating lines as in Visual Studio Code ❤️ 55 | 56 | -- COMMAND ---------- 57 | 58 | -- MAGIC %md 59 | -- MAGIC 60 | -- MAGIC # Live Coding Session 61 | -------------------------------------------------------------------------------- /Delta Live Tables/Agenda.sql: -------------------------------------------------------------------------------- 1 | -- Databricks notebook source 2 | -- MAGIC %md # Delta Live Tables » Agenda 3 | 4 | -- COMMAND ---------- 5 | 6 | -- MAGIC %md 7 | -- MAGIC 8 | -- MAGIC | # | Module | 9 | -- MAGIC | --- | --- | 10 | -- MAGIC | 0 | [Introduction]($./Delta Live Tables) | 11 | -- MAGIC | 1 | [Delta Live Tables SQL]($./Building Delta Live Tables pipelines with SQL) | 12 | -- MAGIC | 2 | 👍 [Delta Live Tables Python API]($./Delta Live Tables Python) | 13 | -- MAGIC | x | [Pipeline settings]($./Pipeline settings) | 14 | -- MAGIC | 2L | [DLT Lab]($./DLT Lab) | 15 | -- MAGIC | 3 | [Expectations]($./Expectations) | 16 | -- MAGIC | 4 | [Storage location]($./Storage location) | 17 | -- MAGIC | 5 | [Full Refresh]($./Full Refresh) | 18 | -- MAGIC | 6 | [Deep Dive into DLTs]($./Deep Dive into DLTs) | 19 | -- MAGIC | 7 | [CLI]($./Delta Live Tables CLI) | 20 | -- MAGIC | 8 | [Auto Loader and Streaming DLTs]($./Auto Loader and Streaming DLTs) | 21 | 22 | -- COMMAND ---------- 23 | 24 | -- MAGIC %md ## Open Topics / TODOs 25 | -- MAGIC 26 | -- MAGIC [Open Topics / TODOs]($./TODOs) 27 | 28 | -- COMMAND ---------- 29 | 30 | -- MAGIC %md 31 | -- MAGIC 32 | -- MAGIC ## Topics 33 | -- MAGIC 34 | -- MAGIC * How to work with files in Databricks 35 | -- MAGIC * `/FileStore` 36 | -- MAGIC * `dbfs` magic command 37 | -- MAGIC * Parameters in jobs vs DLT pipelines 38 | -- MAGIC * How to parameterize SQL queries (to define parameters at job level) 39 | -- MAGIC * https://docs.databricks.com/en/sql/user/queries/query-parameters.html 40 | 41 | -- COMMAND ---------- 42 | 43 | -- MAGIC %md ## Heads-Up 44 | -- MAGIC 45 | -- MAGIC 1. You can use Python in a SQL notebook for a DLT pipeline yet it won't be rendered in a dataflow (and vice versa) 46 | 47 | -- COMMAND ---------- 48 | 49 | -- MAGIC %md ## Exercise 50 | -- MAGIC 51 | -- MAGIC Based on https://jaceklaskowski.github.io/spark-workshop/exercises/spark-sql-exercise-Using-upper-Standard-Function.html 52 | -- MAGIC 53 | -- MAGIC Create a DLT pipeline that does the following: 54 | -- MAGIC 55 | -- MAGIC 1. FIXME Accepts a parameter - a CSV filename to load 56 | -- MAGIC 1. FIXME Accepts another parameter that is the column name with string values 57 | -- MAGIC 1. Executes `upper` standard function on this string column 58 | -- MAGIC 59 | -- MAGIC FIXMEs = how to pass parameters to a DLT pipeline 60 | -- MAGIC 61 | -- MAGIC In summary: 62 | -- MAGIC 63 | -- MAGIC The dataflow (pipeline) should be two tables 64 | 65 | -- COMMAND ---------- 66 | 67 | -- MAGIC %md ## Exercise (Databricks SQL) 68 | -- MAGIC 69 | -- MAGIC Create a job with a DLT pipeline (that's already created) and a (SQL) query 70 | -------------------------------------------------------------------------------- /Databricks Workflows/01 Conditional Workflows.py: -------------------------------------------------------------------------------- 1 | # Databricks notebook source 2 | # MAGIC %md 3 | # MAGIC 4 | # MAGIC # Conditional Workflows 5 | 6 | # COMMAND ---------- 7 | 8 | # MAGIC %md 9 | # MAGIC 10 | # MAGIC ## Conditional Tasks 11 | # MAGIC 12 | # MAGIC `if/else condition` task is used to run a part of a job DAG based on the results of a boolean expression.
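# MAGIC
# MAGIC A minimal sketch of what such a task could look like in a Jobs API-style payload (the task names, job parameter, and notebook path are hypothetical; the `condition_task` fields follow the Jobs API reference):
# MAGIC
# MAGIC ```python
# MAGIC # Two tasks: an if/else condition and a task that only runs on its "true" branch
# MAGIC tasks = [
# MAGIC     {
# MAGIC         "task_key": "check_env",
# MAGIC         "condition_task": {
# MAGIC             "op": "EQUAL_TO",
# MAGIC             "left": "{{job.parameters.env}}",
# MAGIC             "right": "prod",
# MAGIC         },
# MAGIC     },
# MAGIC     {
# MAGIC         "task_key": "load_prod_data",
# MAGIC         "depends_on": [{"task_key": "check_env", "outcome": "true"}],
# MAGIC         "notebook_task": {"notebook_path": "/Workspace/Shared/load_prod_data"},
# MAGIC     },
# MAGIC ]
# MAGIC ```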
13 | # MAGIC 14 | # MAGIC Adds branching logic to your job 15 | # MAGIC 16 | # MAGIC [Add branching logic to your job with the If/else condition task](https://docs.databricks.com/en/workflows/jobs/conditional-tasks.html#add-branching-logic-to-your-job-with-the-ifelse-condition-task): 17 | # MAGIC 18 | # MAGIC 1. Runs a part of a job DAG based on a boolean expression 19 | # MAGIC 1. The expression consists of a boolean operator and a pair of operands, where the operands might reference job or task state using [job and task parameter variables](https://docs.databricks.com/en/workflows/jobs/parameter-value-references.html) or use [task values](https://docs.databricks.com/en/workflows/jobs/share-task-context.html). 20 | # MAGIC 21 | 22 | # COMMAND ---------- 23 | 24 | # MAGIC %md 25 | # MAGIC 26 | # MAGIC ## Run if dependencies 27 | # MAGIC 28 | # MAGIC [Add the Run if condition of a task](https://docs.databricks.com/en/workflows/jobs/conditional-tasks.html#add-the-run-if-condition-of-a-task): 29 | # MAGIC 30 | # MAGIC 1. Adds conditions to a task 31 | # MAGIC 1. `Run if dependencies` drop-down menu in the task configuration 32 | # MAGIC 1. Condition is evaluated after completing all the task dependencies 33 | 34 | # COMMAND ---------- 35 | 36 | # MAGIC %md 37 | # MAGIC 38 | # MAGIC ## Conditional Execution 39 | # MAGIC 40 | # MAGIC 1. Tasks configured to handle failures or not meeting if/else condition are marked as Excluded. 41 | # MAGIC 1. Excluded tasks are skipped and are treated as successful. 42 | # MAGIC 1. If all task dependencies are excluded, the task is also excluded, regardless of its Run if condition. 43 | # MAGIC 1. If you cancel a task run, the cancellation propagates through downstream tasks, and tasks with a Run if condition that handles failure are run, for example, to verify a cleanup task runs when a task run is canceled. 44 | 45 | # COMMAND ---------- 46 | 47 | # MAGIC %md 48 | # MAGIC 49 | # MAGIC ## Job Run Status 50 | # MAGIC 51 | # MAGIC [How does Databricks Jobs determine job run status?](https://docs.databricks.com/en/workflows/jobs/conditional-tasks.html#how-does-databricks-jobs-determine-job-run-status) 52 | 53 | # COMMAND ---------- 54 | 55 | # MAGIC %md 56 | # MAGIC 57 | # MAGIC ## Learn More 58 | 59 | # COMMAND ---------- 60 | 61 | # MAGIC %md 62 | # MAGIC 63 | # MAGIC 1. 
[Run tasks conditionally in a Databricks job](https://docs.databricks.com/en/workflows/jobs/conditional-tasks.html) 64 | -------------------------------------------------------------------------------- /Apache Spark/Bucketing.sql: -------------------------------------------------------------------------------- 1 | -- Databricks notebook source 2 | -- MAGIC %md # Bucketing 3 | 4 | -- COMMAND ---------- 5 | 6 | -- MAGIC %md 7 | -- MAGIC 8 | -- MAGIC * Not supported by Delta Lake 9 | 10 | -- COMMAND ---------- 11 | 12 | DROP SCHEMA jaceklaskowski CASCADE; 13 | 14 | -- COMMAND ---------- 15 | 16 | -- SCHEMA == DATABASE 17 | CREATE SCHEMA jaceklaskowski; 18 | USE jaceklaskowski; 19 | 20 | -- COMMAND ---------- 21 | 22 | CREATE TABLE bucketed ( 23 | id BIGINT, 24 | name STRING, 25 | type STRING) 26 | USING parquet 27 | CLUSTERED BY (type) INTO 8 BUCKETS 28 | 29 | -- COMMAND ---------- 30 | 31 | -- MAGIC %scala 32 | -- MAGIC 33 | -- MAGIC import org.apache.spark.sql.SaveMode 34 | -- MAGIC spark.range(10e4.toLong).write.format("parquet").mode(SaveMode.Overwrite).saveAsTable("jaceklaskowski.t10e4") 35 | -- MAGIC spark.range(10e6.toLong).write.format("parquet").mode(SaveMode.Overwrite).saveAsTable("jaceklaskowski.t10e6") 36 | 37 | -- COMMAND ---------- 38 | 39 | SHOW TABLES 40 | 41 | -- COMMAND ---------- 42 | 43 | -- MAGIC %scala 44 | -- MAGIC 45 | -- MAGIC sc.setJobDescription("Setup") 46 | -- MAGIC 47 | -- MAGIC spark.conf.set("spark.sql.autoBroadcastJoinThreshold", -1) 48 | -- MAGIC spark.conf.set("spark.sql.adaptive.enabled", false) 49 | -- MAGIC 50 | -- MAGIC // https://docs.databricks.com/optimizations/disk-cache.html 51 | -- MAGIC spark.conf.set("spark.databricks.io.cache.enabled", false) 52 | 53 | -- COMMAND ---------- 54 | 55 | -- MAGIC %scala 56 | -- MAGIC 57 | -- MAGIC sc.setJobDescription("Non-Bucketed Join") 58 | -- MAGIC 59 | -- MAGIC val t4 = spark.table("t10e4") 60 | -- MAGIC val t6 = spark.table("t10e6") 61 | -- MAGIC 62 | -- MAGIC assert(t4.count == 10e4) 63 | -- MAGIC assert(t6.count == 10e6) 64 | -- MAGIC 65 | -- MAGIC // trigger execution of the join query 66 | -- MAGIC t4.join(t6, "id").foreach(_ => ()) 67 | 68 | -- COMMAND ---------- 69 | 70 | -- MAGIC %scala 71 | -- MAGIC 72 | -- MAGIC sc.setJobDescription("Create Bucketed Tables") 73 | -- MAGIC 74 | -- MAGIC import org.apache.spark.sql.SaveMode 75 | -- MAGIC spark.range(10e4.toLong) 76 | -- MAGIC .write 77 | -- MAGIC .format("parquet") 78 | -- MAGIC .bucketBy(4, "id") 79 | -- MAGIC .sortBy("id") 80 | -- MAGIC .mode(SaveMode.Overwrite) 81 | -- MAGIC .saveAsTable("bucketed_4_10e4") 82 | -- MAGIC 83 | -- MAGIC spark.range(10e6.toLong) 84 | -- MAGIC .write 85 | -- MAGIC .format("parquet") 86 | -- MAGIC .bucketBy(4, "id") 87 | -- MAGIC .sortBy("id") 88 | -- MAGIC .mode(SaveMode.Overwrite) 89 | -- MAGIC .saveAsTable("bucketed_4_10e6") 90 | 91 | -- COMMAND ---------- 92 | 93 | -- MAGIC %scala 94 | -- MAGIC 95 | -- MAGIC sc.setJobDescription("Bucketed Join") 96 | -- MAGIC 97 | -- MAGIC val bucketed_4_10e4 = spark.table("bucketed_4_10e4") 98 | -- MAGIC val bucketed_4_10e6 = spark.table("bucketed_4_10e6") 99 | -- MAGIC bucketed_4_10e4.join(bucketed_4_10e6, "id").foreach(_ => ()) 100 | -------------------------------------------------------------------------------- /Generative AI/llm-rag-chatbot/00-RAG-LLM-RAG-Introduction.py: -------------------------------------------------------------------------------- 1 | # Databricks notebook source 2 | # MAGIC %md 3 | # MAGIC # Deploy Your LLM Chatbots With the Data Intelligence Platform 4 | # 
MAGIC 5 | # MAGIC In this tutorial, you will learn how to build your own Chatbot Assistant to help your customers answer questions about Databricks, using Retrieval Augmented Generation (RAG), llama2-70B Foundation Model and Vector Search. 6 | # MAGIC 7 | # MAGIC 8 | # MAGIC 9 | 10 | # COMMAND ---------- 11 | 12 | # MAGIC %md-sandbox 13 | # MAGIC ## Quickstart: Getting started 14 | # MAGIC 15 | # MAGIC 16 | # MAGIC Start here if this is your first time implementing a GenAI application. 17 | # MAGIC 18 | # MAGIC You will learn: 19 | # MAGIC 20 | # MAGIC - How to prepare your document dataset, creating text chunks from documentation pages 21 | # MAGIC - Create your Vector Search index and send queries to find similar documents 22 | # MAGIC - Build your langchain model leveraging Databricks Foundation Model (Llama 2) 23 | # MAGIC - Deploy the chatbot model as Model Serving Endpoint 24 | 25 | # COMMAND ---------- 26 | 27 | # MAGIC %md 28 | # MAGIC Get started: open the [01-quickstart/00-RAG-chatbot-Introduction notebook]($./01-quickstart/00-RAG-chatbot-Introduction). 29 | 30 | # COMMAND ---------- 31 | 32 | # MAGIC %md 33 | # MAGIC ## Advanced: Going further 34 | # MAGIC 35 | # MAGIC Explore this content to discover how to leverage all the Databricks Data Intelligence Platform capabilities for your GenAI Apps. 36 | # MAGIC 37 | # MAGIC You will learn: 38 | # MAGIC 39 | # MAGIC - How to extract information from unstructured documents (pdfs) and create custom chunks 40 | # MAGIC - Leverage Databricks Embedding Foundation Model to compute the chunks embeddings 41 | # MAGIC - Create a Self Managed Vector Search index and send queries to find similar documents 42 | # MAGIC - Build an advanced langchain model leveraging Databricks Foundation Model (Llama 2) 43 | # MAGIC - Evaluate your chatbot model correctness with MLflow 44 | # MAGIC - Deploy your Model Serving Endpoint with Table Inferences to automatically log your model traffic 45 | # MAGIC - Run online llm evaluation and track your metrics with Databricks Monitoring 46 | 47 | # COMMAND ---------- 48 | 49 | # MAGIC %md 50 | # MAGIC Learn more advanced GenAI concepts: [open the 02-advanced/01-PDF-Advanced-Data-Preparation]($./02-advanced/01-PDF-Advanced-Data-Preparation). 51 | -------------------------------------------------------------------------------- /Apache Spark/Parquet Connector.scala: -------------------------------------------------------------------------------- 1 | // Databricks notebook source 2 | // MAGIC %md # Parquet Connector 3 | 4 | // COMMAND ---------- 5 | 6 | // MAGIC %md 7 | // MAGIC 8 | // MAGIC ## Requirements 9 | // MAGIC 10 | // MAGIC 1. A load-save query (loading a parquet dataset, `Dataset.map` over records and saving it out) 11 | // MAGIC 1. Let's call the `Dataset.map` operation `samanta` 12 | // MAGIC 1. A Scala case class as a record 13 | // MAGIC 1. 1 partition 14 | // MAGIC 1. 1GB per record 15 | 16 | // COMMAND ---------- 17 | 18 | // MAGIC %md 19 | // MAGIC 20 | // MAGIC ## Open questions and observations 21 | // MAGIC 22 | // MAGIC 1. Vectorized parquet decoding seems to make query processing faster 23 | // MAGIC 1.
`Dataset.map` vs `Dataset.mapPartitions` 24 | 25 | // COMMAND ---------- 26 | 27 | // MAGIC %md ## Experiment 28 | 29 | // COMMAND ---------- 30 | 31 | val input = "/Users/jacek@japila.pl/1g.parquet" 32 | val output = "/Users/jacek@japila.pl/1g.parquet_output" 33 | 34 | // COMMAND ---------- 35 | 36 | dbutils.fs.rm(dir = input, recurse = true) 37 | dbutils.fs.rm(dir = output, recurse = true) 38 | 39 | // COMMAND ---------- 40 | 41 | // MAGIC %md ### Prepare 1G parquet dataset 42 | 43 | // COMMAND ---------- 44 | 45 | // 46 | 47 | // Each number takes up 4 bytes 48 | // 1 billion numbers gives 4GB 49 | // We just need 1GB (hence division by 4) 50 | 51 | spark.range(1000*1000*1000 / 4).repartition(1).write.format("parquet").mode("overwrite").save(input) 52 | 53 | // COMMAND ---------- 54 | 55 | // MAGIC %fs ls /Users/jacek@japila.pl/1g.parquet/ 56 | 57 | // COMMAND ---------- 58 | 59 | // MAGIC %md ### Run the query 60 | 61 | // COMMAND ---------- 62 | 63 | sc.setJobDescription(s"mapPartition over parquet ($input)") 64 | 65 | // https://docs.databricks.com/optimizations/disk-cache.html 66 | spark.conf.set("spark.databricks.io.cache.enabled", "false") 67 | 68 | import spark.implicits._ 69 | case class MyRecord(id: Long, name: String) 70 | 71 | // FIXME Each task should get 1GB and ~1k such records 72 | 73 | // FIXME What exactly should samanta convert to? 74 | val samanta = (mr: MyRecord) => mr 75 | 76 | spark 77 | .read 78 | .parquet(input) 79 | .as[MyRecord] 80 | // .map(samanta) 81 | .write 82 | .format("parquet") 83 | .save(output) 84 | 85 | // COMMAND ---------- 86 | 87 | // Skip the rest 88 | dbutils.notebook.exit("skip the rest") 89 | 90 | // COMMAND ---------- 91 | 92 | sc.setJobDescription("mapPartition over parquet (data20K)") 93 | 94 | // https://docs.databricks.com/optimizations/disk-cache.html 95 | spark.conf.set("spark.databricks.io.cache.enabled", "false") 96 | 97 | val samanta_jeden_rekord = r => r 98 | val samanta = rs => Iterator.single(rs.length) // rs.map(samanta_jeden_rekord) 99 | 100 | // FIXME 101 | // 1 task / partition 102 | // 1 executor only, the smallest one 103 | // a record weighs 1GB (case class = row) 104 | // investigate row groups 105 | // mapPartitions vs map 106 | val r = spark.read.schema("rating DOUBLE,review STRING").parquet("/databricks-datasets/amazon/data20K").mapPartitions(samanta) 107 | display(r) 108 | 109 | // COMMAND ---------- 110 | 111 | // MAGIC %md 112 | // MAGIC 113 | // MAGIC ## Spark QA 114 | // MAGIC 115 | // MAGIC 1. Introduction to JVM (Łukasz) 116 | -------------------------------------------------------------------------------- /Delta Lake/TRUNCATE TABLE in Delta Lake.sql: -------------------------------------------------------------------------------- 1 | -- Databricks notebook source 2 | -- MAGIC %md # TRUNCATE TABLE in Delta Lake 3 | -- MAGIC 4 | -- MAGIC The not-so-obvious ways 5 | 6 | -- COMMAND ---------- 7 | 8 | -- MAGIC %scala 9 | -- MAGIC 10 | -- MAGIC // `truncateTable` = `TRUNCATE TABLE` was running perfectly fine with Hive 11 | -- MAGIC // 1k tests that did `TRUNCATE TABLE` and `INSERT INTO` 3-10-20 records, did some testing, over and over again.
12 | -- MAGIC 13 | -- MAGIC // With Hive, it took 30 mins 14 | -- MAGIC 15 | -- MAGIC // Switching from `format("hive")` to `format("delta")` 16 | -- MAGIC 17 | -- MAGIC // With Delta Lake, it took 5 hours 18 | -- MAGIC 19 | -- MAGIC // TRUNCATE TABLE is NOT supported by Delta Lake (open-source version / outside Databricks) 20 | -- MAGIC // because...all the tests were executed OUTSIDE Databricks 21 | -- MAGIC 22 | -- MAGIC // Why do you think the time could even increase?! 23 | -- MAGIC // 1. Metadata! What happens when you `DELETE FROM` / `TRUNCATE` => a new version is created (= a disk op) 24 | -- MAGIC 25 | -- MAGIC protected def truncateTable(databaseName: String, tableName: String): Unit = { 26 | -- MAGIC val fullTableName = s"$databaseName.$tableName" 27 | -- MAGIC 28 | -- MAGIC val beforeNs = System.nanoTime() 29 | -- MAGIC 30 | -- MAGIC // Approach #0 31 | -- MAGIC // val approach = "DELETE FROM" 32 | -- MAGIC // spark.sql(s"DELETE FROM $databaseName.$tableName") 33 | -- MAGIC 34 | -- MAGIC // Approach #1 35 | -- MAGIC // https://stackoverflow.com/a/67519402/1305344 36 | -- MAGIC val approach = "limit(0)" 37 | -- MAGIC spark.table(fullTableName).limit(0).write.mode("overwrite").format("delta").saveAsTable(fullTableName) 38 | -- MAGIC 39 | -- MAGIC // Approach #2 40 | -- MAGIC // 10x slower than DELETE FROM 41 | -- MAGIC // https://docs.delta.io/latest/delta-utility.html#remove-files-no-longer-referenced-by-a-delta-table 42 | -- MAGIC // VACUUM RETAIN 0 HOURS 43 | -- MAGIC // .config("spark.databricks.delta.retentionDurationCheck.enabled", "false") 44 | -- MAGIC // .config("spark.databricks.delta.vacuum.parallelDelete.enabled", "true") 45 | -- MAGIC // val approach = "VACUUM RETAIN 0 HOURS" 46 | -- MAGIC // spark.sql(s"VACUUM $fullTableName RETAIN 0 HOURS") 47 | -- MAGIC 48 | -- MAGIC // Approach #3 49 | -- MAGIC // https://spark.apache.org/docs/latest/sql-ref-syntax-ddl-truncate-table.html 50 | -- MAGIC // Not supported in Delta Lake OSS 51 | -- MAGIC // val approach = "TRUNCATE TABLE" 52 | -- MAGIC // spark.sql(s"TRUNCATE TABLE $databaseName.$tableName") 53 | -- MAGIC 54 | -- MAGIC // Approach #4 55 | -- MAGIC // val approach = "DeltaTable API" 56 | -- MAGIC // import io.delta.tables.DeltaTable 57 | -- MAGIC // DeltaTable.forName(fullTableName).delete() 58 | -- MAGIC 59 | -- MAGIC val tookSecs = (System.nanoTime() - beforeNs) / 1e+9 60 | -- MAGIC println(s">>> truncateTable($fullTableName) took ${tookSecs}s (using $approach)") 61 | -- MAGIC } 62 | 63 | -- COMMAND ---------- 64 | 65 | DROP TABLE IF EXISTS jacek_demo 66 | 67 | -- COMMAND ---------- 68 | 69 | CREATE TABLE jacek_demo 70 | AS SELECT 1 71 | 72 | -- COMMAND ---------- 73 | 74 | DESCRIBE HISTORY jacek_demo 75 | 76 | -- COMMAND ---------- 77 | 78 | SELECT * FROM jacek_demo 79 | 80 | -- COMMAND ---------- 81 | 82 | TRUNCATE TABLE jacek_demo 83 | 84 | -- COMMAND ---------- 85 | 86 | DESCRIBE HISTORY jacek_demo 87 | 88 | -- COMMAND ---------- 89 | 90 | SELECT * FROM jacek_demo 91 | -------------------------------------------------------------------------------- /Generative AI/Model Serving.py: -------------------------------------------------------------------------------- 1 | # Databricks notebook source 2 | # MAGIC %md 3 | # MAGIC # Model Serving 4 | 5 | # COMMAND ---------- 6 | 7 | # MAGIC %md 8 | # MAGIC 9 | # MAGIC ## Mosaic AI Model Serving 10 | # MAGIC 11 | # MAGIC [Model serving with Databricks](https://docs.databricks.com/en/machine-learning/model-serving/index.html): 12 | # MAGIC 13 | # MAGIC * Mosaic AI Model 
Serving provides a unified interface to deploy, govern, and query AI models 14 | # MAGIC * Each model served is available using REST API 15 | # MAGIC * Offers a unified REST API and MLflow Deployment API for CRUD and querying tasks 16 | # MAGIC * Provides a highly available and low-latency service for deploying models 17 | # MAGIC * A single UI to manage all your models and their respective serving endpoints 18 | # MAGIC * Ability to extend pre-trained models (e.g., Llama 3.1) with proprietary data to improve quality 19 | # MAGIC * Specialize them for specific business contexts and skills to build higher quality models 20 | # MAGIC * Automatically scales up or down to meet demand changes 21 | # MAGIC * Uses [serverless compute](https://docs.databricks.com/en/getting-started/overview.html#serverless) 22 | # MAGIC * [Pricing](https://www.databricks.com/product/pricing/model-serving) 23 | 24 | # COMMAND ---------- 25 | 26 | # MAGIC %md 27 | # MAGIC 28 | # MAGIC Model serving supports the following language models: 29 | # MAGIC 30 | # MAGIC 1. [Custom models](https://docs.databricks.com/en/machine-learning/model-serving/custom-models.html) 31 | # MAGIC * Python models packaged in the MLflow format 32 | # MAGIC * Registered either in Unity Catalog or in the workspace model registry 33 | # MAGIC * Examples: scikit-learn, XGBoost, PyTorch, Hugging Face transformer models, [agent serving](https://docs.databricks.com/en/generative-ai/deploy-agent.html) 34 | # MAGIC 1. State-of-the-art open models using [Foundation Model APIs]($./Foundation Models) 35 | # MAGIC * [Llama]($./Llama) 36 | # MAGIC * Curated foundation model architectures that support optimized inference 37 | # MAGIC * Base models (e.g., Llama-2-70B-chat, BGE-Large, and Mistral-7B) are available for immediate use with pay-per-token pricing 38 | # MAGIC * Workloads that require performance guarantees and fine-tuned model variants can be deployed with provisioned throughput 39 | # MAGIC 1. [External models](https://docs.databricks.com/en/generative-ai/external-models/index.html) 40 | # MAGIC * Generative AI models hosted outside of Databricks 41 | # MAGIC * Endpoints that serve external models can be centrally governed and customers can establish rate limits and access control for them 42 | # MAGIC * Examples: OpenAI’s GPT-4, Anthropic’s Claude 43 | 44 | # COMMAND ---------- 45 | 46 | # MAGIC %md 47 | # MAGIC 48 | # MAGIC ## SQL Access 49 | # MAGIC 50 | # MAGIC Models are available from SQL using [AI functions](https://docs.databricks.com/en/large-language-models/ai-functions.html) for easy integration into analytics workflows. 51 | 52 | # COMMAND ---------- 53 | 54 | # MAGIC %md 55 | # MAGIC 56 | # MAGIC ## Tutorials 57 | # MAGIC 58 | # MAGIC 1. [Tutorial: Deploy and query a custom model](https://docs.databricks.com/en/machine-learning/model-serving/model-serving-intro.html) on how to serve custom models on Databricks 59 | # MAGIC 1. 
[Get started querying LLMs on Databricks](https://docs.databricks.com/en/large-language-models/llm-serving-intro.html) on how to query a foundation model on Databricks 60 | -------------------------------------------------------------------------------- /Apache Spark/ANTI and SEMI joins in SQL and DataFrame API.py: -------------------------------------------------------------------------------- 1 | # Databricks notebook source 2 | # MAGIC %md 3 | # MAGIC 4 | # MAGIC # ANTI and SEMI joins in SQL and DataFrame API 5 | 6 | # COMMAND ---------- 7 | 8 | # MAGIC %md 9 | # MAGIC 10 | # MAGIC ## Discovery of the Day 11 | # MAGIC 12 | # MAGIC There's no `DataFrame.createOrReplaceView` in PySpark and Scala APIs 😬 13 | # MAGIC 14 | # MAGIC `CREATE VIEW` only. 15 | 16 | # COMMAND ---------- 17 | 18 | # MAGIC %md 19 | # MAGIC 20 | # MAGIC ## SQL Join types explained with 1 picture 21 | # MAGIC 22 | # MAGIC [SQL Join types explained with 1 picture](https://www.securesolutions.no/sql-join-types-explained-with-1-picture/) 23 | # MAGIC 24 | # MAGIC ![Joins](https://www.securesolutions.no/wp-content/uploads/2014/07/joins-1.jpg) 25 | 26 | # COMMAND ---------- 27 | 28 | # MAGIC %md 29 | # MAGIC 30 | # MAGIC ## SQL Joins 31 | # MAGIC 32 | # MAGIC [SQL Joins](https://www.w3schools.com/sql/sql_join.asp) 33 | 34 | # COMMAND ---------- 35 | 36 | # MAGIC %md 37 | # MAGIC 38 | # MAGIC ## JOINs in Spark SQL 39 | # MAGIC 40 | # MAGIC [JOIN](https://spark.apache.org/docs/latest/sql-ref-syntax-qry-select-join.html) 41 | 42 | # COMMAND ---------- 43 | 44 | left = spark.range(3) 45 | 46 | # COMMAND ---------- 47 | 48 | # MAGIC %md ## Left anti join 49 | # MAGIC 50 | # MAGIC ![Left anti join](https://learn.microsoft.com/en-us/power-query/media/merge-queries-left-anti/left-anti-join-operation.png) 51 | 52 | # COMMAND ---------- 53 | 54 | # MAGIC %md ## Left Semi JOIN 55 | # MAGIC 56 | # MAGIC * [Difference Between Anti-Join and Semi-Join](https://www.geeksforgeeks.org/difference-between-anti-join-and-semi-join/) 57 | # MAGIC * [Difference between INNER JOIN and LEFT SEMI JOIN](https://stackoverflow.com/q/21738784/1305344) 58 | 59 | # COMMAND ---------- 60 | 61 | # MAGIC %md 62 | # MAGIC 63 | # MAGIC ## DataFrame API 64 | # MAGIC 65 | # MAGIC * [DataFrame.except](https://spark.apache.org/docs/latest/api/python/reference/pyspark.sql/api/pyspark.sql.DataFrame.except.html) 66 | # MAGIC * [DataFrame.subtract](https://spark.apache.org/docs/latest/api/python/reference/pyspark.sql/api/pyspark.sql.DataFrame.subtract.html) 67 | # MAGIC * [DataFrame.intersect](https://spark.apache.org/docs/latest/api/python/reference/pyspark.sql/api/pyspark.sql.DataFrame.intersect.html) 68 | 69 | # COMMAND ---------- 70 | 71 | left = spark.range(5) 72 | dups_left = left.union(left) 73 | two_threes = spark.createDataFrame([3,3], 'int').withColumnRenamed('value', 'id') 74 | dups_left_with_threes = dups_left.union(two_threes) 75 | right = spark.range(3, 8, 1) 76 | 77 | # COMMAND ---------- 78 | 79 | two_threes.display() 80 | 81 | # COMMAND ---------- 82 | 83 | left.join(right, 'id', 'L_E_f_t_aN_tI').display() 84 | 85 | # COMMAND ---------- 86 | 87 | left.exceptAll(right).display() 88 | 89 | # COMMAND ---------- 90 | 91 | # MAGIC %md 92 | # MAGIC 93 | # MAGIC programmatic vs maths approach 94 | 95 | # COMMAND ---------- 96 | 97 | two_threes.join(right, 'id', 'left_anti').display() 98 | 99 | # COMMAND ---------- 100 | 101 | two_threes.exceptAll(right).display() 102 | 103 | # COMMAND ---------- 104 | 105 | two_threes.subtract(right).display() 106 | 107 | # 
COMMAND ---------- 108 | 109 | # MAGIC %md 110 | # MAGIC 111 | # MAGIC ## Mind Query Plans 112 | # MAGIC 113 | # MAGIC Explore query plans with larger datasets (ideally Delta tables) before claiming that one approach is better than the others 😜 114 | # MAGIC 115 | # MAGIC Danke schön, Paul, for bringing it to my attention! 🥳 116 | -------------------------------------------------------------------------------- /Databricks Workflows/Step 1. Load Raw Data.sql: -------------------------------------------------------------------------------- 1 | -- Databricks notebook source 2 | -- MAGIC %md # Load Raw Data 3 | -- MAGIC 4 | -- MAGIC This notebook uses input parameters (using [Databricks widgets](https://docs.databricks.com/notebooks/widgets.html)). 5 | -- MAGIC 6 | -- MAGIC Name | Default Value | Label 7 | -- MAGIC -----|---------------|------- 8 | -- MAGIC `table_name` | workflows_raw_data | Table Name (Raw Data) 9 | -- MAGIC `database_name` | jaceklaskowski | Database Name 10 | 11 | -- COMMAND ---------- 12 | 13 | -- MAGIC %md 14 | -- MAGIC 15 | -- MAGIC **NOTE**: 16 | -- MAGIC 17 | -- MAGIC > Do not use the `Run all` button to run all the cells. 18 | 19 | -- COMMAND ---------- 20 | 21 | -- MAGIC %md 22 | -- MAGIC 23 | -- MAGIC [Databricks widgets](https://docs.databricks.com/notebooks/widgets.html) describes how to access widget values in Spark SQL. 24 | -- MAGIC 25 | -- MAGIC 26 | -- MAGIC Unfortunately, [notebooks in jobs cannot use widgets](https://docs.databricks.com/notebooks/widgets.html#using-widget-values-in-spark-sql): 27 | -- MAGIC 28 | -- MAGIC > In general, you cannot use widgets (...) if you use Run All or run the notebook as a job. 29 | -- MAGIC 30 | -- MAGIC There are a couple of issues to keep in mind, esp. while doing a demo: 31 | -- MAGIC 32 | -- MAGIC 1. In general, you cannot use widgets to pass arguments between different languages within a notebook 33 | -- MAGIC 1. You can create a widget `arg1` in a Python cell and use it in a SQL or Scala cell only if you run one cell at a time. 34 | -- MAGIC 1. Using widget values between different languages does not work if you use **Run All** or run the notebook as a job 35 | 36 | -- COMMAND ---------- 37 | 38 | -- MAGIC %python 39 | -- MAGIC 40 | -- MAGIC dbutils.jobs.taskValues.help() 41 | 42 | -- COMMAND ---------- 43 | 44 | -- MAGIC %python 45 | -- MAGIC 46 | -- MAGIC dbutils.jobs.taskValues.help("get") 47 | 48 | -- COMMAND ---------- 49 | 50 | -- MAGIC %python 51 | -- MAGIC 52 | -- MAGIC # Creates a text input widget with a given name and default value.
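-- MAGIC # The values set below are read back later in this notebook:
-- MAGIC # SQL cells reference them as ${database_name} and ${raw_table_name}, and Python cells use dbutils.widgets.getArgument(...).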
53 | -- MAGIC # Notebook Widgets are only for Run all (when executed outside a job) 54 | -- MAGIC dbutils.widgets.removeAll() 55 | -- MAGIC dbutils.widgets.text(name = "database_name", defaultValue = "jaceklaskowski", label = "Database Name") 56 | -- MAGIC dbutils.widgets.text(name = "raw_table_name", defaultValue = "workflows_raw_data", label = "Raw Table Name") 57 | -- MAGIC dbutils.widgets.text(name = "silver_table_name", defaultValue = "workflows_transform", label = "Silver Table Name") 58 | -- MAGIC dbutils.widgets.text(name = "gold_table_name", defaultValue = "workflows_aggregates", label = "Gold Table Name") 59 | 60 | -- COMMAND ---------- 61 | 62 | -- The following does not seem to work 63 | -- REMOVE WIDGET table_name; 64 | -- CREATE WIDGET TEXT table_name DEFAULT "workflows_raw_data"; 65 | 66 | -- COMMAND ---------- 67 | 68 | CREATE DATABASE IF NOT EXISTS ${database_name}; 69 | USE ${database_name} 70 | 71 | -- COMMAND ---------- 72 | 73 | SHOW TABLES 74 | 75 | -- COMMAND ---------- 76 | 77 | -- MAGIC %md ## Create Raw Table 78 | -- MAGIC 79 | -- MAGIC Learn more in [CREATE VIEW](https://docs.databricks.com/sql/language-manual/sql-ref-syntax-ddl-create-view.html) 80 | 81 | -- COMMAND ---------- 82 | 83 | -- MAGIC %python 84 | -- MAGIC 85 | -- MAGIC dbutils.widgets.getArgument("raw_table_name") 86 | 87 | -- COMMAND ---------- 88 | 89 | CREATE OR REPLACE VIEW ${raw_table_name} 90 | (id COMMENT 'Unique identification number', name) 91 | COMMENT 'Bronze layer' 92 | AS 93 | SELECT id, name 94 | FROM VALUES (0, "zero"), (1, "one") t(id, name) 95 | 96 | -- COMMAND ---------- 97 | 98 | SELECT * FROM ${raw_table_name} 99 | -------------------------------------------------------------------------------- /Databricks Workflows/02 Modular Orchestration with Run Job Task.py: -------------------------------------------------------------------------------- 1 | # Databricks notebook source 2 | # MAGIC %md # Modular Orchestration with Run Job Task 3 | 4 | # COMMAND ---------- 5 | 6 | # MAGIC %md 7 | # MAGIC 8 | # MAGIC 1. Breaking down large complex workflows (DAGs) into logical chunks or smaller "child" jobs that are defined and managed separately 9 | # MAGIC 1. parent and child jobs 10 | # MAGIC 1. split a DAG up by organizational boundaries 11 | # MAGIC * allowing different teams in an organization to work together on different parts of a workflow 12 | # MAGIC * ownership of parts of the workflow can be better managed, with different teams potentially using different code repositories for the jobs they own 13 | # MAGIC * testing and updates covered by child job ownership 14 | # MAGIC 1. reusability 15 | # MAGIC * define common shared steps in a job once and then reuse that as a child job in different parent workflows 16 | # MAGIC * With parameters, reused tasks can be made more flexible to fit the needs of different parent workflows 17 | # MAGIC 1. Creates a modular workflow 18 | 19 | # COMMAND ---------- 20 | 21 | # MAGIC %md ## Run Job 22 | # MAGIC 23 | # MAGIC 1. A new task type **Run Job** 24 | # MAGIC * Requires a job to trigger 25 | # MAGIC 1. Calls a job to be run by the task 26 | # MAGIC 1. Jobs triggered by a Run Job task use their own cluster configuration 27 | # MAGIC * [Trigger a new job run](https://docs.databricks.com/api/workspace/jobs/runnow) 28 | 29 | # COMMAND ---------- 30 | 31 | # MAGIC %md ## Parameters 32 | # MAGIC 33 | # MAGIC 1. Enter the key and value of each job parameter to pass to a Run Job 34 | # MAGIC 1. 
[Pass context about job runs into job tasks](https://docs.databricks.com/en/workflows/jobs/parameter-value-references.html) 35 | # MAGIC * Click **Browse dynamic values** for a list of available dynamic value references 36 | # MAGIC 1. If job parameters are configured on the job a task belongs to, those parameters are displayed when you add task parameters. 37 | # MAGIC * If job and task parameters share a key, the job parameter takes precedence. 38 | # MAGIC * A warning is shown in the UI if you attempt to add a task parameter with the same key as a job parameter. 39 | # MAGIC * [Add parameters for all job tasks](https://docs.databricks.com/en/workflows/jobs/settings.html#add-parameters-for-all-job-tasks) 40 | 41 | # COMMAND ---------- 42 | 43 | # MAGIC %md ## Task Queueing 44 | # MAGIC 45 | # MAGIC 1. A workspace is limited to 1000 concurrent task runs. A 429 Too Many Requests response is returned when you request a run that cannot start immediately. 46 | # MAGIC 1. The number of jobs a workspace can create in an hour is limited to 10000 (includes “runs submit”). This limit also affects jobs created by the REST API and notebook workflows. 47 | 48 | # COMMAND ---------- 49 | 50 | # MAGIC %md ## Gotchas 51 | # MAGIC 52 | # MAGIC 1. You should not create jobs with circular dependencies or jobs that nest more than three Run Job tasks. 53 | # MAGIC 1. Circular dependencies are Run Job tasks that directly or indirectly trigger each other. 54 | # MAGIC 1. A run is queued when the maximum concurrent Run Job task runs in the workspace is reached (see [What if my job cannot run because of concurrency limits?](https://docs.databricks.com/en/workflows/jobs/create-run-jobs.html#what-if-my-job-cannot-run-because-of-concurrency-limits)) 55 | 56 | # COMMAND ---------- 57 | 58 | # MAGIC %md ## Learn More 59 | # MAGIC 60 | # MAGIC 1. [Modular Orchestration with Databricks Workflows](https://www.databricks.com/blog/modular-orchestration-databricks-workflows) 61 | # MAGIC 1. [Task type options](https://docs.databricks.com/en/workflows/jobs/create-run-jobs.html#task-type-options) 62 | -------------------------------------------------------------------------------- /Generative AI/00 Generative AI.sql: -------------------------------------------------------------------------------- 1 | -- Databricks notebook source 2 | -- MAGIC %md # Generative AI and Large Language Models 3 | -- MAGIC 4 | -- MAGIC Gain more ground in **Generative AI** (#GenAI) and **Large Language Models** (#LLM), topics I know next to nothing about, to become (pick as many as you please; I found these while reading some articles about the subject 😎): 5 | -- MAGIC 6 | -- MAGIC * A passionate GenAI and LLM enthusiast 7 | -- MAGIC * A seasoned GenAI and LLM professional 8 | 9 | -- COMMAND ---------- 10 | 11 | -- MAGIC %md 12 | -- MAGIC 13 | -- MAGIC There are four [Generative AI Architecture Patterns](https://www.databricks.com/product/machine-learning/build-generative-ai) to consider when building a large language model–based solution: 14 | -- MAGIC 15 | -- MAGIC 1. [Prompt Engineering]($./Prompt Engineering) 16 | -- MAGIC 1. [Retrieval Augmented Generation (RAG)]($./Retrieval Augmented Generation) 17 | -- MAGIC 1. Fine-tuning 18 | -- MAGIC 1.
Pretraining 19 | 20 | -- COMMAND ---------- 21 | 22 | -- MAGIC %md 23 | -- MAGIC 24 | -- MAGIC ## Mosaic AI Agent Framework and Agent Evaluation 25 | -- MAGIC 26 | -- MAGIC RAG applications and Agents are the most popular GenAI applications on Databricks (built using [Mosaic AI Agent Framework and Agent Evaluation](https://www.databricks.com/blog/announcing-mosaic-ai-agent-framework-and-agent-evaluation)) 27 | 28 | -- COMMAND ---------- 29 | 30 | -- MAGIC %md ## Use Cases 31 | -- MAGIC 32 | -- MAGIC * Doc Q&A 33 | -- MAGIC * Chatbots 34 | 35 | -- COMMAND ---------- 36 | 37 | -- MAGIC %md ## Other Branches of Machine Learning 38 | -- MAGIC 39 | -- MAGIC * Predictive and Prescriptive Analytics 40 | -- MAGIC * Computer Vision 41 | -- MAGIC * Natural Language Processing 42 | -- MAGIC * Deployment / ML Ops / Cloud 43 | -- MAGIC * Reinforcement Learning 44 | -- MAGIC 45 | 46 | -- COMMAND ---------- 47 | 48 | -- MAGIC %md ## Large Language Models (LLMs) 49 | -- MAGIC 50 | -- MAGIC * ChatGPT 51 | -- MAGIC * [BloombergGPT](https://www.bloomberg.com/company/press/bloomberggpt-50-billion-parameter-llm-tuned-finance/) - Bloomberg’s 50-billion parameter large language model, purpose-built from scratch for finance 52 | -- MAGIC * [M0deler](https://m0deler.com/) 53 | -- MAGIC 54 | 55 | -- COMMAND ---------- 56 | 57 | -- MAGIC %md 58 | -- MAGIC 59 | -- MAGIC Generative AI and Large Language Models (LLMs): 60 | -- MAGIC 61 | -- MAGIC * [OpenAI](https://openai.com/) 62 | -- MAGIC * Local and on-premise models 63 | -- MAGIC * the Rise of Generative AI due to ChatGPT 64 | -- MAGIC * general-purpose chat bots 65 | -- MAGIC * Building products that use LLMs and GenAI 66 | -- MAGIC * Developing apps with GenAI and LLMs 67 | 68 | -- COMMAND ---------- 69 | 70 | -- MAGIC %md ## Prompt Engineering 71 | 72 | -- COMMAND ---------- 73 | 74 | -- MAGIC %md 75 | -- MAGIC 76 | -- MAGIC * Writing ChatGPT prompts 77 | -- MAGIC * the no-code method for writing ChatGPT prompts 78 | 79 | -- COMMAND ---------- 80 | 81 | -- MAGIC %md 82 | -- MAGIC 83 | -- MAGIC ## Databricks Generative AI Fundamentals Learning Plan 84 | -- MAGIC 85 | -- MAGIC [Generative AI Fundamentals](https://www.databricks.com/resources/learn/training/generative-ai-fundamentals) 86 | 87 | -- COMMAND ---------- 88 | 89 | -- MAGIC %md ## Recommended Reading by ChatGPT 3.5 90 | -- MAGIC 91 | -- MAGIC There are the academic papers recommended by [ChatGPT 3.5](https://chat.openai.com/share/3dfac550-eeb6-4740-b68e-52140632edc0) that were instrumental in advancing large language models: 92 | -- MAGIC 93 | -- MAGIC ### Attention Is All You Need 94 | -- MAGIC 95 | -- MAGIC Started my journey into academic paper reading and LLMs from [Attention Is All You Need](https://arxiv.org/abs/1706.03762) by Vaswani et al. (2017) as _"This paper introduced the Transformer architecture, which revolutionized the field of NLP and laid the foundation for large language models like GPT and BERT."_ 96 | -------------------------------------------------------------------------------- /Delta Live Tables/Expectations.sql: -------------------------------------------------------------------------------- 1 | -- Databricks notebook source 2 | -- MAGIC %md # Delta Live Tables Expectations 3 | 4 | -- COMMAND ---------- 5 | 6 | -- MAGIC %md 7 | -- MAGIC 8 | -- MAGIC ## Introduction 9 | -- MAGIC 10 | -- MAGIC [Manage data quality with Delta Live Tables](https://docs.databricks.com/delta-live-tables/expectations.html): 11 | -- MAGIC 12 | -- MAGIC 1. 
**Expectations** define data quality constraints (_requirements_, _assertions_) 13 | -- MAGIC 1. Optional 14 | -- MAGIC 1. Data quality checks on each record passing through a query (before they land in a delta table) 15 | -- MAGIC ``` 16 | -- MAGIC expectation: record => Boolean 17 | -- MAGIC ``` 18 | -- MAGIC 1. Provide insights into data quality for each pipeline update 19 | -- MAGIC 1. Applied to queries using Python decorators or SQL `CONSTRAINT` clauses 20 | 21 | -- COMMAND ---------- 22 | 23 | -- MAGIC %md ## CREATE OR REFRESH Statement 24 | -- MAGIC 25 | -- MAGIC [Delta Live Tables SQL language reference](https://docs.databricks.com/delta-live-tables/sql-ref.html) 26 | -- MAGIC 27 | -- MAGIC ```sql 28 | -- MAGIC CREATE OR REFRESH [TEMPORARY] { STREAMING TABLE | LIVE TABLE } table_name 29 | -- MAGIC [( 30 | -- MAGIC [ 31 | -- MAGIC col_name1 col_type1 [ GENERATED ALWAYS AS generation_expression1 ] [ COMMENT col_comment1 ], 32 | -- MAGIC col_name2 col_type2 [ GENERATED ALWAYS AS generation_expression2 ] [ COMMENT col_comment2 ], 33 | -- MAGIC ... 34 | -- MAGIC ] 35 | -- MAGIC [ 36 | -- MAGIC CONSTRAINT expectation_name_1 EXPECT (expectation_expr1) [ON VIOLATION { FAIL UPDATE | DROP ROW }], 37 | -- MAGIC CONSTRAINT expectation_name_2 EXPECT (expectation_expr2) [ON VIOLATION { FAIL UPDATE | DROP ROW }], 38 | -- MAGIC ... 39 | -- MAGIC ] 40 | -- MAGIC )] 41 | -- MAGIC [USING DELTA] 42 | -- MAGIC [PARTITIONED BY (col_name1, col_name2, ... )] 43 | -- MAGIC [LOCATION path] 44 | -- MAGIC [COMMENT table_comment] 45 | -- MAGIC [TBLPROPERTIES (key1 [ = ] val1, key2 [ = ] val2, ... )] 46 | -- MAGIC AS select_statement 47 | -- MAGIC ``` 48 | 49 | -- COMMAND ---------- 50 | 51 | -- MAGIC %md ## CONSTRAINT Clause 52 | -- MAGIC 53 | -- MAGIC ```sql 54 | -- MAGIC CONSTRAINT expectation_name EXPECT (expectation_expr) [ON VIOLATION { FAIL UPDATE | DROP ROW }] 55 | -- MAGIC ``` 56 | -- MAGIC 57 | -- MAGIC An expectation (`CONSTRAINT`) consists of three properties: 58 | -- MAGIC 59 | -- MAGIC Property | SQL |Meaning 60 | -- MAGIC ---------|-----|------- 61 | -- MAGIC Identifier | `expectation_name` | a unique identifier and allows you to track metrics for the constraint 62 | -- MAGIC Condition | `expectation_expr` | A boolean expression 63 | -- MAGIC Action | `ON VIOLATION` | (optional) What to do when a record fails the expectation (the condition is `false`) 64 | 65 | -- COMMAND ---------- 66 | 67 | -- MAGIC %md 68 | -- MAGIC 69 | -- MAGIC ```sql 70 | -- MAGIC CONSTRAINT expectation_name -- Name / Identifier 71 | -- MAGIC EXPECT (expectation_expr) -- Data Quality Assertion 72 | -- MAGIC [ON VIOLATION { DROP ROW | FAIL UPDATE }] -- Action 73 | -- MAGIC ``` 74 | -- MAGIC 75 | -- MAGIC [SQL properties](https://docs.databricks.com/delta-live-tables/sql-ref.html#sql-properties-1) 76 | -- MAGIC 77 | -- MAGIC Action | Result 78 | -- MAGIC -------|------- 79 | -- MAGIC No `ON VIOLATION` (_warn_) | **(default)** Invalid records are written to the target table; failure is reported as a metric for the dataset. 
(_accept violation_) 80 | -- MAGIC `ON VIOLATION DROP ROW` | Invalid records are dropped (not written to the target table) and a pipeline continues processing; failure is reported as a metrics for the dataset 81 | -- MAGIC `ON VIOLATION FAIL UPDATE` | An invalid record immediately stops pipeline execution; Manual intervention is required before re-processing 82 | 83 | -- COMMAND ---------- 84 | 85 | -- MAGIC %md 86 | -- MAGIC 87 | -- MAGIC [What are Delta Live Tables expectations?](https://docs.databricks.com/delta-live-tables/expectations.html#what-are-delta-live-tables-expectations) 88 | -------------------------------------------------------------------------------- /Generative AI/llm-rag-chatbot/_resources/LICENSE.py: -------------------------------------------------------------------------------- 1 | # Databricks notebook source 2 | # MAGIC %md 3 | # MAGIC ## Licence 4 | 5 | # COMMAND ---------- 6 | 7 | # MAGIC %md 8 | # MAGIC 9 | # MAGIC Copyright (2022) Databricks, Inc. 10 | # MAGIC 11 | # MAGIC This library (the "Software") may not be used except in connection with the Licensee's use of the Databricks Platform Services pursuant 12 | # MAGIC to an Agreement (defined below) between Licensee (defined below) and Databricks, Inc. ("Databricks"). The Object Code version of the 13 | # MAGIC Software shall be deemed part of the Downloadable Services under the Agreement, or if the Agreement does not define Downloadable Services, 14 | # MAGIC Subscription Services, or if neither are defined then the term in such Agreement that refers to the applicable Databricks Platform 15 | # MAGIC Services (as defined below) shall be substituted herein for “Downloadable Services.” Licensee's use of the Software must comply at 16 | # MAGIC all times with any restrictions applicable to the Downlodable Services and Subscription Services, generally, and must be used in 17 | # MAGIC accordance with any applicable documentation. For the avoidance of doubt, the Software constitutes Databricks Confidential Information 18 | # MAGIC under the Agreement. 19 | # MAGIC 20 | # MAGIC Additionally, and notwithstanding anything in the Agreement to the contrary: 21 | # MAGIC * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES 22 | # MAGIC OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE 23 | # MAGIC LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR 24 | # MAGIC IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 25 | # MAGIC * you may view, make limited copies of, and may compile the Source Code version of the Software into an Object Code version of the 26 | # MAGIC Software. For the avoidance of doubt, you may not make derivative works of Software (or make any any changes to the Source Code 27 | # MAGIC version of the unless you have agreed to separate terms with Databricks permitting such modifications (e.g., a contribution license 28 | # MAGIC agreement)). 29 | # MAGIC 30 | # MAGIC If you have not agreed to an Agreement or otherwise do not agree to these terms, you may not use the Software or view, copy or compile 31 | # MAGIC the Source Code of the Software. 32 | # MAGIC 33 | # MAGIC This license terminates automatically upon the termination of the Agreement or Licensee's breach of these terms. 
Additionally, 34 | # MAGIC Databricks may terminate this license at any time on notice. Upon termination, you must permanently delete the Software and all 35 | # MAGIC copies thereof (including the Source Code). 36 | # MAGIC 37 | # MAGIC Agreement: the agreement between Databricks and Licensee governing the use of the Databricks Platform Services, which shall be, with 38 | # MAGIC respect to Databricks, the Databricks Terms of Service located at www.databricks.com/termsofservice, and with respect to Databricks 39 | # MAGIC Community Edition, the Community Edition Terms of Service located at www.databricks.com/ce-termsofuse, in each case unless Licensee 40 | # MAGIC has entered into a separate written agreement with Databricks governing the use of the applicable Databricks Platform Services. 41 | # MAGIC 42 | # MAGIC Databricks Platform Services: the Databricks services or the Databricks Community Edition services, according to where the Software is used. 43 | # MAGIC 44 | # MAGIC Licensee: the user of the Software, or, if the Software is being used on behalf of a company, the company. 45 | # MAGIC 46 | # MAGIC Object Code: is version of the Software produced when an interpreter or a compiler translates the Source Code into recognizable and 47 | # MAGIC executable machine code. 48 | # MAGIC 49 | # MAGIC Source Code: the human readable portion of the Software. 50 | -------------------------------------------------------------------------------- /workshops/Course Agenda 2 Days.py: -------------------------------------------------------------------------------- 1 | # Databricks notebook source 2 | # MAGIC %md # Databricks Workshop » Agenda 3 | # MAGIC 4 | # MAGIC This notebook is a sample agenda of a Databricks workshop that can be used as a guidance for further customization. 5 | # MAGIC 6 | # MAGIC This agenda has been used in a 2-day workshop format for a group of data engineers, architects, data analysts and testers (with some automation skills). 7 | # MAGIC 8 | # MAGIC Duration: 2 days (8 hours / day) 9 | # MAGIC 10 | # MAGIC Recommended number of participants: 8-12 people 11 | 12 | # COMMAND ---------- 13 | 14 | # MAGIC %md ## Agenda 15 | 16 | # COMMAND ---------- 17 | 18 | # MAGIC %md 19 | # MAGIC 20 | # MAGIC The idea is to go over all the different types of artifacts that can be built using "New" menu: 21 | # MAGIC 22 | # MAGIC 1. Notebook 23 | # MAGIC 1. Repo 24 | # MAGIC 1. Data 25 | # MAGIC * File upload 26 | # MAGIC * Add data 27 | # MAGIC 1. Compute 28 | # MAGIC * Cluster 29 | # MAGIC * SQL Warehouse 30 | # MAGIC 1. SQL 31 | # MAGIC * Query 32 | # MAGIC * Dashoboard 33 | # MAGIC * Lakeview Dashboard 34 | # MAGIC * Alert 35 | # MAGIC 1. Data Engineering 36 | # MAGIC * Job 37 | # MAGIC * DLT Pipeline 38 | # MAGIC 1. Machine Learning (not covered) 39 | 40 | # COMMAND ---------- 41 | 42 | # MAGIC %md 43 | # MAGIC 44 | # MAGIC 1. Overview of Databricks Workspace for Data Engineers 45 | # MAGIC 1. Data Engineering with Spark SQL 46 | # MAGIC 1. Managing Data in Delta Lake Tables 47 | # MAGIC * DLT Pipelines 48 | # MAGIC * `dbfs` magic command 49 | # MAGIC 1. Databricks Workflows and Jobs 50 | # MAGIC 1. Developing Data Pipelines using Delta Live Tables 51 | # MAGIC * Converting Spark exercises to Databricks tools' mindset 52 | # MAGIC 1. Databricks SQL for Data Analytics 53 | # MAGIC 1. Unity Catalog 54 | # MAGIC 1. 
Setting up Development Environment 55 | # MAGIC * IntelliJ IDEA / PyCharm 56 | # MAGIC * Visual Studio Code 57 | # MAGIC * Databricks JDBC Driver 58 | # MAGIC * Databricks CLI 59 | # MAGIC * Databricks SQL Connector for Python 60 | # MAGIC 1. (optional) Databricks CI/CD 61 | # MAGIC * REST APIs 62 | # MAGIC * Terraform / Databricks Terraform Provider 63 | 64 | # COMMAND ---------- 65 | 66 | # MAGIC %md ## Preprequisites 67 | 68 | # COMMAND ---------- 69 | 70 | # MAGIC %md 71 | # MAGIC 72 | # MAGIC It is assumed that the course participants have got the following skills: 73 | # MAGIC 74 | # MAGIC 1. Familiarity with Apache Spark and/or PySpark 75 | # MAGIC 1. Familiarity with one of the following programming languages: 76 | # MAGIC * Python 77 | # MAGIC * Scala 78 | # MAGIC * SQL 79 | 80 | # COMMAND ---------- 81 | 82 | # MAGIC %md ## Schedule 83 | 84 | # COMMAND ---------- 85 | 86 | # MAGIC %md 87 | # MAGIC 88 | # MAGIC A class is split into 1-hour blocks with a 12-minute break each 89 | # MAGIC 90 | # MAGIC A day starts at 9am and ends at 4pm to let students have an extra 1 hour at the end of a day to work alone on exercises and have enough room for some cognitive work at its own pace and perhaps even ask questions 91 | # MAGIC 92 | # MAGIC Lunch breaks at 1pm for 1 hour 93 | 94 | # COMMAND ---------- 95 | 96 | # MAGIC %md 97 | # MAGIC 98 | # MAGIC * 9:00 – 9:48 (12’ break) 99 | # MAGIC * 10:00 – 10:48 (12’ break) 100 | # MAGIC * 11:00 – 11:48 (12’ break) 101 | # MAGIC * Lunch break (1h) 102 | # MAGIC * 13:00 – 13:48 (12’ break) 103 | # MAGIC * 14:00 – 14:48 (12’ break) 104 | # MAGIC * 15:00 – 15:48 (12’ break) 105 | # MAGIC * 16:00 – 17:00 a quiet working hour 106 | 107 | # COMMAND ---------- 108 | 109 | # MAGIC %md 110 | # MAGIC 111 | # MAGIC ## Databricks Workspace 112 | 113 | # COMMAND ---------- 114 | 115 | # MAGIC %md 116 | # MAGIC 117 | # MAGIC * A Databricks Workspace with Unity Catalog enabled 118 | # MAGIC * Myself as a workspace admin 119 | # MAGIC * An extra (fake) non-admin user account for testing and demo 120 | # MAGIC * DLT pipelines and workflows (jobs) 121 | -------------------------------------------------------------------------------- /demo/delta-live-tables/my_streaming_table.sql: -------------------------------------------------------------------------------- 1 | -- Databricks notebook source 2 | -- MAGIC %md # my_streaming_table DLT Table 3 | 4 | -- COMMAND ---------- 5 | 6 | -- MAGIC %md 7 | -- MAGIC 8 | -- MAGIC ## Why SQL Considered Superior (over Python) 9 | -- MAGIC 10 | -- MAGIC Unlike in Python, [Delta Live Tables SQL](https://docs.databricks.com/workflows/delta-live-tables/delta-live-tables-sql-ref.html) allow for: 11 | -- MAGIC 12 | -- MAGIC 1. Executing DLT notebooks for syntax analysis (using `Run all`) 13 | -- MAGIC 1. Markdown 😍 14 | -- MAGIC 15 | -- MAGIC You can still use SQL notebooks with Python notebooks in a single DLT pipeline. 16 | 17 | -- COMMAND ---------- 18 | 19 | -- MAGIC %md 20 | -- MAGIC 21 | -- MAGIC When executing the notebook in a DLT pipeline you will get this WARN message: 22 | -- MAGIC 23 | -- MAGIC ``` 24 | -- MAGIC Magic commands (e.g. %py, %sql and %run) are not supported with the exception of 25 | -- MAGIC %pip within a Python notebook. Cells containing magic commands are ignored. 
26 | -- MAGIC Unsupported magic commands were found in the following notebooks 27 | -- MAGIC 28 | -- MAGIC /Repos/jacek@japila.pl/learn-databricks/Delta Live Tables/auto_loader: %fs 29 | -- MAGIC ``` 30 | 31 | -- COMMAND ---------- 32 | 33 | -- MAGIC %md 34 | -- MAGIC 35 | -- MAGIC Let's start with an example non-DLT query with `cloud_files` TVF (Auto Loader). 36 | -- MAGIC 37 | -- MAGIC It won't work. 38 | -- MAGIC 39 | -- MAGIC `cloud_files` creates a streaming table while CTAS does not define one. 40 | 41 | -- COMMAND ---------- 42 | 43 | -- MAGIC %md 44 | -- MAGIC 45 | -- MAGIC The following won't work as is in a DLT notebook. 46 | -- MAGIC 47 | -- MAGIC ``` 48 | -- MAGIC Unable to process top-level query. DLT currently only accepts 'CREATE TEMPORARY LIVE VIEW', 'CREATE OR REFRESH LIVE TABLE', 'APPLY CHANGES INTO', and 'SET' statements. 49 | -- MAGIC ``` 50 | -- MAGIC 51 | -- MAGIC Don't forget to comment it out before executing the notebook in a DLT pipeline. 52 | 53 | -- COMMAND ---------- 54 | 55 | -- SELECT * FROM cloud_files("/databricks-datasets/retail-org/customers/", "csv") 56 | 57 | -- COMMAND ---------- 58 | 59 | -- MAGIC %md ## Schema Inference 60 | 61 | -- COMMAND ---------- 62 | 63 | -- MAGIC %md 64 | -- MAGIC 65 | -- MAGIC [Auto Loader](https://docs.databricks.com/workflows/delta-live-tables/delta-live-tables-data-sources.html#auto-loader): 66 | -- MAGIC 67 | -- MAGIC * You can use supported format options with Auto Loader. 68 | -- MAGIC * Use `map()` function to pass options to `cloud_files()` 69 | 70 | -- COMMAND ---------- 71 | 72 | -- MAGIC %md 73 | -- MAGIC 74 | -- MAGIC ```sql 75 | -- MAGIC CREATE OR REFRESH STREAMING LIVE TABLE 76 | -- MAGIC AS SELECT * 77 | -- MAGIC FROM cloud_files( 78 | -- MAGIC "", 79 | -- MAGIC "", 80 | -- MAGIC map( 81 | -- MAGIC "", "", " 98 | -- MAGIC AS SELECT * 99 | -- MAGIC FROM cloud_files( 100 | -- MAGIC "", 101 | -- MAGIC "", 102 | -- MAGIC map("schema", "title STRING, id INT, revisionId INT, revisionTimestamp TIMESTAMP, revisionUsername STRING, revisionUsernameId INT, text STRING") 103 | -- MAGIC ) 104 | -- MAGIC ``` 105 | 106 | -- COMMAND ---------- 107 | 108 | -- no "header", "true" by default 109 | CREATE OR REFRESH STREAMING LIVE TABLE raw_streaming_table( 110 | CONSTRAINT names_at_least_5_char_long EXPECT (name IS NOT NULL AND len(name) >= 5), 111 | CONSTRAINT ids_only_even EXPECT (id % 2 = 0) 112 | ) 113 | AS SELECT * FROM 114 | cloud_files( 115 | "${cloud_files_input_path}", 116 | "csv", 117 | map( 118 | "schema", "id INT, name STRING" 119 | ) 120 | ) 121 | -------------------------------------------------------------------------------- /Delta Live Tables/Storage location.sql: -------------------------------------------------------------------------------- 1 | -- Databricks notebook source 2 | -- MAGIC %md # Storage location 3 | 4 | -- COMMAND ---------- 5 | 6 | -- MAGIC %md 7 | -- MAGIC 8 | -- MAGIC **Storage location** can be specified explicitly by a user while creating a Delta Live Table pipeline or assigned automatically by the runtime. 9 | -- MAGIC 10 | -- MAGIC It can only be specified once and for the whole lifecycle of a DLT pipeline. It cannot be changed ever. 11 | -- MAGIC 12 | -- MAGIC If auto-assigned by the runtime, the storage location is under `dbfs:/pipelines` directory (in a directory with the same name as the pipeline ID). 13 | -- MAGIC 14 | -- MAGIC You can find out about the **Storage location** of a DLT pipeline in the **Pipeline settings > Destination** section in the UI. 
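-- COMMAND ----------

-- MAGIC %md
-- MAGIC A minimal sketch (assuming the auto-assigned `dbfs:/pipelines` root is in use) of finding the storage locations programmatically, in the spirit of the Data Quality Checks section further down:

-- COMMAND ----------

-- MAGIC %python
-- MAGIC
-- MAGIC # Each subdirectory under dbfs:/pipelines is named after a pipeline ID
-- MAGIC # and is that pipeline's auto-assigned storage location
-- MAGIC for entry in dbutils.fs.ls("dbfs:/pipelines/"):
-- MAGIC     print(entry.path)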
15 | 16 | -- COMMAND ---------- 17 | 18 | -- MAGIC %fs ls dbfs:/pipelines/960da65b-c9df-4cb9-9456-1005ffe103a9/ 19 | 20 | -- COMMAND ---------- 21 | 22 | -- MAGIC %md 23 | -- MAGIC 24 | -- MAGIC You can also find out about the Storage location of a DLT pipeline using [Delta Live Table API](https://docs.databricks.com/workflows/delta-live-tables/delta-live-tables-api-guide.html) directly or higher-level [Delta Live Tables CLI](https://docs.databricks.com/dev-tools/cli/dlt-cli.html) (`databricks pipelines`). 25 | -- MAGIC 26 | -- MAGIC ```console 27 | -- MAGIC $ databricks pipelines get --pipeline-id 960da65b-c9df-4cb9-9456-1005ffe103a9 | jq '.spec.storage' 28 | -- MAGIC "dbfs:/pipelines/960da65b-c9df-4cb9-9456-1005ffe103a9" 29 | -- MAGIC ``` 30 | 31 | -- COMMAND ---------- 32 | 33 | -- MAGIC %md 34 | -- MAGIC 35 | -- MAGIC [Databricks recommends always storing checkpoint and schema evolution information in storage locations managed by Unity Catalog](https://docs.databricks.com/ingestion/auto-loader/unity-catalog.html#specifying-locations-for-auto-loader-resources-for-unity-catalog) 36 | 37 | -- COMMAND ---------- 38 | 39 | -- MAGIC %md ## Autoloader Directory 40 | -- MAGIC 41 | -- MAGIC Contains schema evolution information 42 | 43 | -- COMMAND ---------- 44 | 45 | -- MAGIC %fs ls dbfs:/pipelines/960da65b-c9df-4cb9-9456-1005ffe103a9/autoloader 46 | 47 | -- COMMAND ---------- 48 | 49 | -- MAGIC %fs ls dbfs:/pipelines/960da65b-c9df-4cb9-9456-1005ffe103a9/autoloader/schema_1493166085_/_schemas 50 | 51 | -- COMMAND ---------- 52 | 53 | -- MAGIC %fs head dbfs:/pipelines/960da65b-c9df-4cb9-9456-1005ffe103a9/autoloader/schema_1493166085_/_schemas/0 54 | 55 | -- COMMAND ---------- 56 | 57 | -- MAGIC %md ## System Directory 58 | 59 | -- COMMAND ---------- 60 | 61 | -- MAGIC %fs ls dbfs:/pipelines/960da65b-c9df-4cb9-9456-1005ffe103a9/system/ 62 | 63 | -- COMMAND ---------- 64 | 65 | -- MAGIC %fs ls dbfs:/pipelines/960da65b-c9df-4cb9-9456-1005ffe103a9/system/events 66 | 67 | -- COMMAND ---------- 68 | 69 | -- MAGIC %md ## Events Delta Table 70 | 71 | -- COMMAND ---------- 72 | 73 | SELECT * FROM delta.`dbfs:/pipelines/a02952e6-7197-44a4-a072-5ea5124d7bce/system/events` 74 | 75 | -- COMMAND ---------- 76 | 77 | -- MAGIC %md ## Data Quality Checks 78 | 79 | -- COMMAND ---------- 80 | 81 | -- MAGIC %python 82 | -- MAGIC 83 | -- MAGIC pipelines = spark.createDataFrame(data = dbutils.fs.ls("dbfs:/pipelines/")) 84 | -- MAGIC path = pipelines.orderBy(pipelines["modificationTime"].desc()).select("path").head().path 85 | -- MAGIC spark.conf.set("pipeline.path", path) 86 | -- MAGIC print(spark.conf.get('pipeline.path')) 87 | 88 | -- COMMAND ---------- 89 | 90 | SELECT '${pipeline.path}' path 91 | 92 | -- COMMAND ---------- 93 | 94 | DESCRIBE delta.`${pipeline.path}/system/events` 95 | 96 | -- COMMAND ---------- 97 | 98 | -- MAGIC %md 99 | -- MAGIC 100 | -- MAGIC Inspired by [this article](https://www.linkedin.com/pulse/delta-live-tables-how-build-pipeline-run-data-quality-mathias-weber/) and feeling a bit adventurous to use some advanced "tools": 101 | -- MAGIC 102 | -- MAGIC * [Common Table Expression (CTE)](https://spark.apache.org/docs/latest/sql-ref-syntax-qry-select-cte.html) 103 | -- MAGIC * [JSON path expression](https://docs.databricks.com/sql/language-manual/sql-ref-json-path-expression.html) 104 | 105 | -- COMMAND ---------- 106 | 107 | WITH data_quality AS ( 108 | WITH details AS ( 109 | SELECT 110 | id update_id, 111 | details:flow_progress:data_quality:expectations 112 | FROM 
delta.`dbfs:/pipelines/05740fff-c03e-4366-8061-2680f9e9ce48/system/events` 113 | WHERE event_type = 'flow_progress' 114 | ) 115 | SELECT 116 | update_id, 117 | explode(from_json(expectations, "array>")) expectations 118 | FROM details 119 | WHERE expectations IS NOT NULL 120 | ) 121 | SELECT update_id, expectations.* FROM data_quality 122 | -------------------------------------------------------------------------------- /workshops/Databricks Workshop Day 3.sql: -------------------------------------------------------------------------------- 1 | -- Databricks notebook source 2 | -- MAGIC %md # Databricks Workshop Day 3 3 | -- MAGIC 4 | -- MAGIC Duration: 4.5 hours (9:30-14:00) 5 | 6 | -- COMMAND ---------- 7 | 8 | -- MAGIC %md ## Schedule 9 | 10 | -- COMMAND ---------- 11 | 12 | -- MAGIC %md 13 | -- MAGIC 14 | -- MAGIC * The class starts at 9:30 15 | -- MAGIC * A class is split into 1-hour blocks with a 12-minute break each 16 | -- MAGIC * Breaks at the end of an hour 17 | -- MAGIC * However, the first 20' break is at 10:30 (till 10:50) 18 | 19 | -- COMMAND ---------- 20 | 21 | -- MAGIC %md ## Agenda 22 | 23 | -- COMMAND ---------- 24 | 25 | -- MAGIC %md 26 | -- MAGIC 27 | -- MAGIC 1. (spark) [The Internals of Structured Query Execution](https://jaceklaskowski.github.io/spark-workshop/slides/spark-sql-internals-of-structured-query-execution.html) 28 | -- MAGIC * how to read query plans and what to pay attention to 29 | -- MAGIC * a general overview 30 | -- MAGIC * analysis using the load into silver and MOLka as an example 31 | -- MAGIC * [Exercise: How to add days (as values of a column) to date?](https://jaceklaskowski.github.io/spark-workshop/exercises/sql/How-to-add-days-as-values-of-a-column-to-date.html) 32 | -- MAGIC * [Exercise: split function with variable delimiter per row](https://jaceklaskowski.github.io/spark-workshop/exercises/sql/split-function-with-variable-delimiter-per-row.html) 33 | -- MAGIC 1. (spark) Narrow and Wide Transformations 34 | -- MAGIC * [Basic Aggregation](https://jaceklaskowski.github.io/spark-workshop/slides/spark-sql-basic-aggregation.html#/home) 35 | -- MAGIC * [Exercise: Finding Ids of Rows with Word in Array Column](https://jaceklaskowski.github.io/spark-workshop/exercises/sql/Finding-Ids-of-Rows-with-Word-in-Array-Column.html) 36 | -- MAGIC * [Windowed Aggregation](https://jaceklaskowski.github.io/spark-workshop/slides/spark-sql-windowed-aggregation.html#/home) 37 | -- MAGIC * [Exercise: Finding 1st and 2nd Bestsellers Per Genre](https://jaceklaskowski.github.io/spark-workshop/exercises/sql/Finding-1st-and-2nd-Bestsellers-Per-Genre.html) 38 | -- MAGIC * [Exercise: Calculating Gap Between Current And Highest Salaries Per Department](https://jaceklaskowski.github.io/spark-workshop/exercises/sql/Calculating-Gap-Between-Current-And-Highest-Salaries-Per-Department.html) 39 | -- MAGIC * [Exercise: Calculating Difference Between Consecutive Rows Per Window](https://jaceklaskowski.github.io/spark-workshop/exercises/sql/Calculating-Difference-Between-Consecutive-Rows-Per-Window.html) 40 | -- MAGIC * [Demo: Dynamic Partition Pruning](https://books.japila.pl/spark-sql-internals/demo/dynamic-partition-pruning/) 41 | -- MAGIC 1.
(spark) Data Shuffle 42 | -- MAGIC * how to check whether shuffles show up in a query plan 43 | -- MAGIC * in which situations they occur and how we can counteract them 44 | -- MAGIC * we can take a look at the execution plan of our load 45 | -- MAGIC * when it is worth using [Bucketing](https://books.japila.pl/spark-sql-internals/bucketing/) 46 | -- MAGIC * [Demo: ObjectHashAggregateExec and Sort-Based Fallback Tasks](https://books.japila.pl/spark-sql-internals/demo/objecthashaggregateexec-sort-based-fallback-tasks/) 47 | -- MAGIC * [Demo: Spilling](https://books.japila.pl/spark-sql-internals/demo/spilling/) 48 | -- MAGIC 1. (spark) [Joins](https://jaceklaskowski.github.io/spark-workshop/slides/spark-sql-joins.html) 49 | -- MAGIC * joins, hints, etc. (broadcasts), and how to deal with joins of large tables 50 | -- MAGIC * range joins -> whether to use them, how to use them, all the partitioning options, etc. 51 | -- MAGIC * [Bloom Filter Join](https://books.japila.pl/spark-sql-internals/bloom-filter-join/) 52 | -- MAGIC * [Runtime Filtering](https://books.japila.pl/spark-sql-internals/runtime-filtering/) 53 | -- MAGIC * [Exercise: Finding Most Populated Cities Per Country](https://jaceklaskowski.github.io/spark-workshop/exercises/sql/Finding-Most-Populated-Cities-Per-Country.html) 54 | -- MAGIC * [Exercise: Selecting the most important rows per assigned priority](https://jaceklaskowski.github.io/spark-workshop/exercises/sql/selecting-the-most-important-rows-per-assigned-priority.html) 55 | -- MAGIC 1. [Adaptive Query Execution (AQE)](https://books.japila.pl/spark-sql-internals/adaptive-query-execution/) 56 | -- MAGIC 1. (delta lake/databricks) [Change Data Feed / Change Data Capture](https://books.japila.pl/delta-lake-internals/change-data-feed/) 57 | -- MAGIC * "Pure" Delta Lake (not Delta Live Tables) 58 | -- MAGIC * Medallion Architecture 59 | -- MAGIC * The Gold layer based on CDF of (a JOIN query of) tables from the Silver layer 60 | -- MAGIC * [Use Delta Lake change data feed on Databricks](https://docs.databricks.com/en/delta/delta-change-data-feed.html) 61 | -- MAGIC * [Demo: Change Data Feed](https://books.japila.pl/delta-lake-internals/demo/change-data-feed/) 62 | -- MAGIC * [Notebook example: Propagate changes with Delta change data feed](https://docs.databricks.com/en/delta/delta-change-data-feed.html#notebook-example-propagate-changes-with-delta-change-data-feed) 63 | -- MAGIC 64 | -------------------------------------------------------------------------------- /Delta Live Tables/Building Delta Live Tables pipelines with SQL.sql: -------------------------------------------------------------------------------- 1 | -- Databricks notebook source 2 | -- MAGIC %md 3 | -- MAGIC 4 | -- MAGIC # Building Data Pipelines with Delta Live Tables using SQL 5 | -- MAGIC 6 | -- MAGIC [Meetup](https://www.meetup.com/warsaw-data-engineering/events/291905799/) 7 | -- MAGIC 8 | -- MAGIC Delta Live Tables extends functionality of Apache Spark's Structured Streaming and allows you to write just a few lines of declarative Python or SQL to deploy a production-quality data pipeline (from [Tutorial: ingesting data with Databricks Auto Loader](https://docs.databricks.com/ingestion/auto-loader/index.html#tutorial-ingesting-data-with-databricks-auto-loader)).
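-- MAGIC
-- MAGIC As a taste of how declarative this gets, here is a minimal sketch (the table name is made up) in the spirit of the `raw_streaming_table` example elsewhere in this repo:
-- MAGIC
-- MAGIC ```sql
-- MAGIC CREATE OR REFRESH STREAMING LIVE TABLE raw_customers
-- MAGIC COMMENT 'Bronze layer ingested incrementally with Auto Loader'
-- MAGIC AS SELECT *
-- MAGIC FROM cloud_files("/databricks-datasets/retail-org/customers/", "csv")
-- MAGIC ```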
9 | -- MAGIC 10 | -- MAGIC Learn more in [Delta Live Tables SQL language reference](https://docs.databricks.com/workflows/delta-live-tables/delta-live-tables-sql-ref.html) 11 | 12 | -- COMMAND ---------- 13 | 14 | -- MAGIC %md ## Introduction 15 | -- MAGIC 16 | -- MAGIC * Delta Live Tables supports only SQL and Python (You cannot use JVM libraries in a DLT pipeline) 17 | 18 | -- COMMAND ---------- 19 | 20 | -- MAGIC %md 21 | -- MAGIC 22 | -- MAGIC ## CREATE LIVE TABLE 23 | -- MAGIC 24 | -- MAGIC * `CREATE OR REFRESH [TEMPORARY] { STREAMING LIVE TABLE | LIVE TABLE } table_name` 25 | -- MAGIC * This Delta Live Tables query is syntactically valid, but you must create a pipeline in order to define and populate your table. 26 | 27 | -- COMMAND ---------- 28 | 29 | -- MAGIC %md ## CREATE TEMPORARY LIVE VIEW 30 | -- MAGIC 31 | -- MAGIC * `CREATE TEMPORARY [STREAMING] LIVE VIEW view_name` 32 | 33 | -- COMMAND ---------- 34 | 35 | -- MAGIC %md ## TEMPORARY 36 | -- MAGIC 37 | -- MAGIC [SQL properties](https://docs.databricks.com/workflows/delta-live-tables/delta-live-tables-sql-ref.html#sql-properties-1): 38 | -- MAGIC 39 | -- MAGIC * `TEMPORARY` creates a temporary table or view. No metadata is persisted for this table. 40 | -- MAGIC * Use TEMPORARY marker to prevent publishing of intermediate tables that are not intended for external consumption (discussed later in this notebook) 41 | 42 | -- COMMAND ---------- 43 | 44 | -- MAGIC %md ## STREAMING 45 | -- MAGIC 46 | -- MAGIC * Creates a table or view that reads an input dataset as a stream 47 | -- MAGIC * Input dataset must be a streaming data source, e.g. [Auto Loader](https://docs.databricks.com/ingestion/auto-loader/index.html) (discussed lated in this notebook) or a `STREAMING LIVE` table or view. 48 | 49 | -- COMMAND ---------- 50 | 51 | -- MAGIC %md 52 | -- MAGIC 53 | -- MAGIC ## Identity Columns 54 | -- MAGIC 55 | -- MAGIC `CREATE LIVE TABLE` supports `GENERATED ALWAYS AS` clause (see [CREATE TABLE SQL reference](https://docs.databricks.com/sql/language-manual/sql-ref-syntax-ddl-create-table-using.html)). 56 | -- MAGIC 57 | -- MAGIC `GENERATED ALWAYS AS IDENTITY` clause can only be used for columns with BIGINT data type. 58 | 59 | -- COMMAND ---------- 60 | 61 | -- MAGIC %md ## Table properties 62 | -- MAGIC 63 | -- MAGIC [Table properties](https://docs.databricks.com/workflows/delta-live-tables/delta-live-tables-sql-ref.html#tbl-properties) 64 | 65 | -- COMMAND ---------- 66 | 67 | -- MAGIC %md 68 | -- MAGIC 69 | -- MAGIC ## Publish data from Delta Live Tables pipelines 70 | -- MAGIC 71 | -- MAGIC [Publish data from Delta Live Tables pipelines](https://docs.databricks.com/workflows/delta-live-tables/delta-live-tables-publish.html): 72 | -- MAGIC 73 | -- MAGIC 1. make the output data of your pipeline discoverable and available to query by publishing datasets to the Databricks metastore. 74 | -- MAGIC 1. enter a database name in the Target field when you create a pipeline 75 | -- MAGIC 1. No support for publishing tables to Unity Catalog. Delta Live Tables supports publishing tables only to the workspace-level Hive metastore. 76 | -- MAGIC 1. only tables and associated metadata are published. Views are not published to the metastore (because they are temporary by definition). 77 | -- MAGIC 1. Use `TEMPORARY` marker to prevent publishing of intermediate tables that are not intended for external consumption 78 | -- MAGIC ```sql 79 | -- MAGIC CREATE TEMPORARY LIVE TABLE temp_table 80 | -- MAGIC AS SELECT ... 
; 81 | -- MAGIC ``` 82 | 83 | -- COMMAND ---------- 84 | 85 | --- Pipeline settings > Destination > Target schema 86 | SHOW TABLES IN jaceklaskowski_dlts; 87 | 88 | -- COMMAND ---------- 89 | 90 | select * from jaceklaskowski_dlts.dlt_one; 91 | 92 | -- COMMAND ---------- 93 | 94 | DESCRIBE EXTENDED jaceklaskowski_dlts.dlt_one; 95 | 96 | -- COMMAND ---------- 97 | 98 | describe history jaceklaskowski_dlts.dlt_one; 99 | 100 | -- COMMAND ---------- 101 | 102 | -- MAGIC %md 103 | -- MAGIC 104 | -- MAGIC ## The End 105 | 106 | -- COMMAND ---------- 107 | 108 | -- MAGIC %md 109 | -- MAGIC 110 | -- MAGIC ## Pipeline updates 111 | -- MAGIC 112 | -- MAGIC [Pipeline updates](https://docs.databricks.com/workflows/delta-live-tables/delta-live-tables-concepts.html#dlt-concepts-updates) 113 | -- MAGIC 114 | -- MAGIC An **update** does the following: 115 | -- MAGIC 116 | -- MAGIC 1. Starts a cluster with the correct configuration. 117 | -- MAGIC 1. Discovers all the tables and views defined, and checks for any analysis errors such as invalid column names, missing dependencies, and syntax errors. 118 | -- MAGIC 1. Creates or updates tables and views with the most recent data available. 119 | 120 | -- COMMAND ---------- 121 | 122 | -- MAGIC %md 123 | -- MAGIC 124 | -- MAGIC ## Delta Live Tables FAQ 125 | -- MAGIC 126 | -- MAGIC [Delta Live Tables frequently asked questions](https://docs.databricks.com/workflows/delta-live-tables/delta-live-tables-faqs-issues.html) 127 | -------------------------------------------------------------------------------- /meetups/MLflow on Databricks.py: -------------------------------------------------------------------------------- 1 | # Databricks notebook source 2 | # MAGIC %md 3 | # MAGIC 4 | # MAGIC https://learn.microsoft.com/en-us/azure/databricks/machine-learning/manage-model-lifecycle/ 5 | 6 | # COMMAND ---------- 7 | 8 | # MAGIC %pip install --upgrade "mlflow-skinny[databricks]" 9 | 10 | # COMMAND ---------- 11 | 12 | # MAGIC %restart_python 13 | 14 | # COMMAND ---------- 15 | 16 | import mlflow 17 | mlflow.get_registry_uri() 18 | 19 | # COMMAND ---------- 20 | 21 | from sklearn import datasets 22 | from sklearn.ensemble import RandomForestClassifier 23 | 24 | # Train a sklearn model on the iris dataset 25 | X, y = datasets.load_iris(return_X_y=True, as_frame=True) 26 | clf = RandomForestClassifier(max_depth=7) 27 | clf.fit(X, y) 28 | 29 | # Note that the UC model name follows the pattern 30 | # .., corresponding to 31 | # the catalog, schema, and registered model name 32 | # in Unity Catalog under which to create the version 33 | # The registered model will be created if it doesn't already exist 34 | autolog_run = mlflow.last_active_run() 35 | print(autolog_run) 36 | 37 | # COMMAND ---------- 38 | 39 | model_run = mlflow.active_run() 40 | 41 | # COMMAND ---------- 42 | 43 | print(model_run.info) 44 | 45 | # COMMAND ---------- 46 | 47 | model_uri = "runs:/{}/model".format(model_run.info.run_id) 48 | mlflow.register_model(model_uri, "iris_model") 49 | 50 | # COMMAND ---------- 51 | 52 | # MAGIC %md 53 | # MAGIC 54 | # MAGIC # Notes 55 | # MAGIC 56 | # MAGIC Recommended Training: [Use MLflow in Azure Databricks](https://learn.microsoft.com/en-us/training/modules/mlflow-azure-databricks/) 57 | # MAGIC 58 | # MAGIC > Prerequisites 59 | # MAGIC > 60 | # MAGIC > Before starting this module, you should be familiar with Azure Databricks and the **machine learning model training process**. 
61 | # MAGIC 62 | # MAGIC This "machine learning model training process" is very important. 63 | 64 | # COMMAND ---------- 65 | 66 | # MAGIC %md 67 | # MAGIC # Run experiments with MLflow 68 | # MAGIC 69 | # MAGIC [Run experiments with MLflow](https://learn.microsoft.com/en-us/training/modules/mlflow-azure-databricks/3-run-experiments) 70 | 71 | # COMMAND ---------- 72 | 73 | # MAGIC %md 74 | # MAGIC 75 | # MAGIC **MLflow experiments** allow data scientists to track training runs in a collection called an **experiment**. 76 | # MAGIC 77 | # MAGIC **Experiment runs** are useful for the following: 78 | # MAGIC 79 | # MAGIC 1. Compare changes over time. 80 | # MAGIC 1. Compare the relative performance of models with different hyperparameter values. 81 | 82 | # COMMAND ---------- 83 | 84 | # MAGIC %md 85 | # MAGIC 86 | # MAGIC Follow up in [Running an experiment](https://learn.microsoft.com/en-us/training/modules/mlflow-azure-databricks/3-run-experiments) 87 | 88 | # COMMAND ---------- 89 | 90 | # MAGIC %md 91 | # MAGIC # MLflow 92 | 93 | # COMMAND ---------- 94 | 95 | # MAGIC %md 96 | # MAGIC 97 | # MAGIC From "Introduction" in [Use MLflow in Azure Databricks](https://learn.microsoft.com/en-us/training/modules/mlflow-azure-databricks/1-introduction) training: 98 | # MAGIC 99 | # MAGIC 1. MLflow is an open source platform for end-to-end machine learning operations. 100 | # MAGIC 1. Using MLflow, data scientists can track model training experiments; logging parameters, metrics, and other assets. 101 | # MAGIC 1. Machine learning engineers can use MLflow to deploy and manage models, enabling applications to consume the models and use them to inference predictions for new data. 102 | 103 | # COMMAND ---------- 104 | 105 | # MAGIC %md 106 | # MAGIC ## Capabilities of MLflow 107 | # MAGIC 108 | # MAGIC There are four components to MLflow: 109 | # MAGIC 110 | # MAGIC 1. MLflow Tracking 111 | # MAGIC 1. MLflow Projects 112 | # MAGIC 1. MLflow Models 113 | # MAGIC 1. MLflow Model Registry 114 | 115 | # COMMAND ---------- 116 | 117 | # MAGIC %md 118 | # MAGIC ## MLflow Tracking 119 | # MAGIC 120 | # MAGIC * **MLflow Tracking** allows data scientists to work with experiments in which they process and analyze data or train machine learning models. 121 | # MAGIC * For each run in an experiment, a data scientist can log parameter values, versions of libraries used, model evaluation metrics, and generated output files; including images of data visualizations and model files. 122 | # MAGIC * This ability to log important details about experiment runs makes it possible to audit and compare the results of prior model training executions. 123 | 124 | # COMMAND ---------- 125 | 126 | # MAGIC %md 127 | # MAGIC 128 | # MAGIC ## MLflow Projects 129 | # MAGIC 130 | # MAGIC 1. An MLflow Project is a way of packaging up code for consistent deployment and reproducibility of results. 131 | # MAGIC 1. MLflow supports several environments for projects, including the use of Conda and Docker to define consistent Python code execution environments. 132 | 133 | # COMMAND ---------- 134 | 135 | # MAGIC %md 136 | # MAGIC 137 | # MAGIC ## MLflow Models 138 | # MAGIC 139 | # MAGIC 1. MLflow offers a standardized format for packaging models for distribution. 140 | # MAGIC 1. This standardized model format allows MLflow to work with models generated from several popular libraries, including Scikit-Learn, PyTorch, MLlib, and others. 
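# MAGIC
# MAGIC A minimal sketch (reusing the `clf` model and `X` features from the cells above) of what this packaging looks like in code:
# MAGIC
# MAGIC ```python
# MAGIC import mlflow
# MAGIC import mlflow.sklearn
# MAGIC
# MAGIC # Log the sklearn model in the standardized MLflow Model format
# MAGIC with mlflow.start_run() as run:
# MAGIC     mlflow.sklearn.log_model(clf, "model")
# MAGIC
# MAGIC # Load it back through the generic pyfunc flavor and score new data
# MAGIC loaded = mlflow.pyfunc.load_model(f"runs:/{run.info.run_id}/model")
# MAGIC predictions = loaded.predict(X)
# MAGIC ```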
141 | # MAGIC 142 | # MAGIC Learn more in [MLflow Models](https://mlflow.org/docs/latest/model) 143 | 144 | # COMMAND ---------- 145 | 146 | # MAGIC %md 147 | # MAGIC ## MLflow Model Registry 148 | # MAGIC 149 | # MAGIC 1. The MLflow Model Registry allows data scientists to register trained models. 150 | # MAGIC 1. MLflow Models and MLflow Projects use the MLflow Model Registry to enable machine learning engineers to deploy and serve models for client applications to consume. 151 | -------------------------------------------------------------------------------- /Delta Lake/Generated Columns.sql: -------------------------------------------------------------------------------- 1 | -- Databricks notebook source 2 | -- MAGIC %md # Generated Columns 3 | -- MAGIC 4 | -- MAGIC [CREATE TABLE USING](https://docs.databricks.com/sql/language-manual/sql-ref-syntax-ddl-create-table-using.html) 5 | 6 | -- COMMAND ---------- 7 | 8 | CREATE SCHEMA IF NOT EXISTS jacek_laskowski; 9 | USE jacek_laskowski; 10 | 11 | -- COMMAND ---------- 12 | 13 | CREATE OR REPLACE TABLE generated_columns ( 14 | id BIGINT GENERATED ALWAYS AS IDENTITY, 15 | name STRING, 16 | five_by_default INT GENERATED ALWAYS AS (5)) 17 | USING delta 18 | 19 | -- COMMAND ---------- 20 | 21 | -- MAGIC %md # SHOW CREATE TABLE 22 | -- MAGIC 23 | -- MAGIC `SHOW CREATE TABLE` seems the only way to find out generated columns. 24 | 25 | -- COMMAND ---------- 26 | 27 | SHOW CREATE TABLE generated_columns 28 | 29 | -- COMMAND ---------- 30 | 31 | DESC TABLE EXTENDED generated_columns id 32 | 33 | -- COMMAND ---------- 34 | 35 | -- MAGIC %md # Column Metadata 36 | 37 | -- COMMAND ---------- 38 | 39 | -- MAGIC %scala 40 | -- MAGIC 41 | -- MAGIC import org.apache.spark.sql.connector.catalog.TableCatalog 42 | -- MAGIC import org.apache.spark.sql.connector.catalog.Identifier 43 | -- MAGIC val table = spark.sessionState.catalogManager.currentCatalog.asInstanceOf[TableCatalog].loadTable(Identifier.of(Array("default"), "generated_columns")) 44 | -- MAGIC 45 | -- MAGIC import com.databricks.sql.transaction.tahoe.catalog.DeltaTableV2 46 | -- MAGIC table.asInstanceOf[DeltaTableV2].snapshot.tableDataSchema.map(_.metadata).foreach(println) 47 | 48 | -- COMMAND ---------- 49 | 50 | -- MAGIC %md # Using High-Level API 51 | -- MAGIC 52 | -- MAGIC Using the Developer API seems fruitless as column metadata (where generated column expressions are stored) is cleared up :( 53 | 54 | -- COMMAND ---------- 55 | 56 | -- MAGIC %scala 57 | -- MAGIC 58 | -- MAGIC import io.delta.tables.DeltaTable 59 | -- MAGIC val dt = DeltaTable.forName("generated_columns") 60 | -- MAGIC display(dt.toDF.schema.map(c => (c.name, c.dataType.sql, c.metadata.json)).toDF("name", "dataType", "metadata")) 61 | 62 | -- COMMAND ---------- 63 | 64 | -- MAGIC %scala 65 | -- MAGIC 66 | -- MAGIC val table = spark.sharedState.externalCatalog.getTable("default", "generated_columns") 67 | -- MAGIC import org.apache.spark.sql.functions.from_json 68 | -- MAGIC import org.apache.spark.sql.types._ 69 | -- MAGIC val metadata_schema = StructType.fromDDL("metadata map") 70 | -- MAGIC val schemaDf = table.schema 71 | -- MAGIC .map(c => (c.name, c.dataType.sql, c.metadata.json)) 72 | -- MAGIC .toDF("name", "dataType", "metadata") 73 | -- MAGIC // FIXME How to display metadata as JSON using metadata_schema 74 | -- MAGIC //.withColumn("metadata", from_json($"metadata", metadata_schema)) 75 | -- MAGIC display(schemaDf) 76 | 77 | -- COMMAND ---------- 78 | 79 | -- MAGIC %md # GENERATED AS IDENTITY Clause 80 | -- MAGIC 81 
| -- MAGIC ```antlr 82 | -- MAGIC GENERATED { ALWAYS | BY DEFAULT } AS IDENTITY [ ( [ START WITH start ] [ INCREMENT BY step ] ) ] 83 | -- MAGIC ``` 84 | -- MAGIC 85 | -- MAGIC When you write to the table, and do not provide values for the identity column, it will be automatically assigned a unique and statistically increasing (or decreasing if step is negative) value. This clause is only supported for Delta Lake tables. This clause can only be used for columns with BIGINT data type. 86 | -- MAGIC 87 | -- MAGIC Assigned values are unique but are not guaranteed to be contiguous. Both parameters are optional, and the default value is 1. step cannot be 0. 88 | -- MAGIC 89 | -- MAGIC If the automatically assigned values are beyond the range of the identity column type, the query will fail. 90 | 91 | -- COMMAND ---------- 92 | 93 | -- MAGIC %md ## BY DEFAULT 94 | 95 | -- COMMAND ---------- 96 | 97 | CREATE TABLE ident ( 98 | name string, 99 | id BIGINT GENERATED BY DEFAULT AS IDENTITY (START WITH 0 INCREMENT BY 1) 100 | ); 101 | INSERT INTO ident (name) VALUES ('Juli'), ('Mark'); 102 | INSERT INTO ident (name, id) VALUES ('Dave', 5); 103 | SELECT * FROM ident; 104 | 105 | -- COMMAND ---------- 106 | 107 | INSERT INTO ident (name) VALUES ('Jacek'); 108 | 109 | -- COMMAND ---------- 110 | 111 | select * from ident; 112 | 113 | -- COMMAND ---------- 114 | 115 | INSERT INTO ident (name) VALUES ('Agata'); 116 | 117 | -- COMMAND ---------- 118 | 119 | select * from ident ORDER BY id; 120 | 121 | -- COMMAND ---------- 122 | 123 | -- MAGIC %md What's going to be `ID` value now? 124 | 125 | -- COMMAND ---------- 126 | 127 | INSERT INTO ident (name) VALUES ("Guess what's the ID?"); 128 | SELECT * FROM ident; 129 | 130 | -- COMMAND ---------- 131 | 132 | -- MAGIC %md ## ALWAYS 133 | 134 | -- COMMAND ---------- 135 | 136 | -- MAGIC %md 137 | -- MAGIC 138 | -- MAGIC * When ALWAYS is used, you cannot provide your own values for the identity column. 139 | 140 | -- COMMAND ---------- 141 | 142 | -- MAGIC %md # GENERATED vs DEFAULT 143 | 144 | -- COMMAND ---------- 145 | 146 | -- MAGIC %md 147 | -- MAGIC 148 | -- MAGIC `GENERATED ALWAYS AS` 149 | -- MAGIC 150 | -- MAGIC `GENERATED AS IDENTITY` 151 | -- MAGIC 152 | -- MAGIC * This clause is only supported for Delta Lake tables 153 | 154 | -- COMMAND ---------- 155 | 156 | -- MAGIC %md 157 | -- MAGIC 158 | -- MAGIC `DEFAULT default_expression` 159 | -- MAGIC 160 | -- MAGIC * Defines a `DEFAULT` value for the column which is used on `INSERT`, `UPDATE`, and `MERGE ... INSERT` when the column is not specified. 161 | -- MAGIC * Supported for CSV, JSON, PARQUET, and ORC sources 162 | -- MAGIC * If no default is specified `DEFAULT NULL` is applied for nullable columns. 
163 | -- MAGIC * `default_expression` may be composed of literals, and built-in SQL functions or operators 164 | -------------------------------------------------------------------------------- /Data Visualization/Data Visualization on Databricks.py: -------------------------------------------------------------------------------- 1 | # Databricks notebook source 2 | # MAGIC %md # Data Visualization on Databricks 3 | 4 | # COMMAND ---------- 5 | 6 | # MAGIC %md 7 | # MAGIC 8 | # MAGIC Inspired by the course [Introduction to Python for Data Science and Data Engineering](https://www.databricks.com/training/catalog/introduction-to-python-for-data-science-and-data-engineering-969) 9 | 10 | # COMMAND ---------- 11 | 12 | # MAGIC %md 13 | # MAGIC 14 | # MAGIC ## pandas 15 | 16 | # COMMAND ---------- 17 | 18 | # MAGIC %md 19 | # MAGIC 20 | # MAGIC ### pandas.core.frame.DataFrame 21 | 22 | # COMMAND ---------- 23 | 24 | import pandas as pd 25 | 26 | # COMMAND ---------- 27 | 28 | pd.__version__ 29 | 30 | # COMMAND ---------- 31 | 32 | # MAGIC %pip install --upgrade pandas 33 | 34 | # COMMAND ---------- 35 | 36 | dbutils.library.restartPython() 37 | 38 | # COMMAND ---------- 39 | 40 | import pandas as pd 41 | pd.__version__ 42 | 43 | # COMMAND ---------- 44 | 45 | # data = a list of lists 46 | data = [ 47 | [0, 'she', 'She Senior Dev'], 48 | [1, 'he', 'He Junior Dev'], 49 | [2, 'them', 'Them Python Dev'], 50 | ] 51 | columns = ['id', 'name', 'role'] 52 | 53 | # COMMAND ---------- 54 | 55 | pandas_dataframe = pd.DataFrame(data=data, columns=columns) 56 | 57 | # COMMAND ---------- 58 | 59 | type(pandas_dataframe) 60 | 61 | # COMMAND ---------- 62 | 63 | display(pandas_dataframe) 64 | 65 | # COMMAND ---------- 66 | 67 | # MAGIC %md 68 | # MAGIC 69 | # MAGIC ### pandas.core.series.Series 70 | 71 | # COMMAND ---------- 72 | 73 | type(pandas_dataframe['id']) 74 | 75 | # COMMAND ---------- 76 | 77 | display(pandas_dataframe['id']) 78 | 79 | # COMMAND ---------- 80 | 81 | # Good ol' map in Functional Programming (FP) 82 | pandas_dataframe['id'] + 1 * 2 83 | 84 | # COMMAND ---------- 85 | 86 | # MAGIC %md 87 | # MAGIC 88 | # MAGIC ## Spark SQL 89 | 90 | # COMMAND ---------- 91 | 92 | # MAGIC %md 93 | # MAGIC 94 | # MAGIC ### DataFrame 95 | 96 | # COMMAND ---------- 97 | 98 | # Guess what happens in PySpark with no data provided for a cell 99 | schema='id long, name string, role string' 100 | pyspark_dataframe = spark.createDataFrame(data=data, schema=schema) 101 | 102 | # COMMAND ---------- 103 | 104 | type(pyspark_dataframe) 105 | 106 | # COMMAND ---------- 107 | 108 | # MAGIC %md 109 | # MAGIC 110 | # MAGIC ## Databricks Built-in Visualizations 111 | 112 | # COMMAND ---------- 113 | 114 | display(pyspark_dataframe) 115 | 116 | # COMMAND ---------- 117 | 118 | display(pyspark_dataframe) 119 | 120 | # COMMAND ---------- 121 | 122 | # MAGIC %md 123 | # MAGIC 124 | # MAGIC ## Python Visualization Libraries 125 | 126 | # COMMAND ---------- 127 | 128 | # MAGIC %md 129 | # MAGIC 130 | # MAGIC On Day 2 of the [Introduction to Python for Data Science and Data Engineering](https://www.databricks.com/training/catalog/introduction-to-python-for-data-science-and-data-engineering-969) course, Databricks introduces [pandas.DataFrame.hist](https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.hist.html). 131 | # MAGIC 132 | # MAGIC > A histogram is a representation of the distribution of data. 
This function calls `matplotlib.pyplot.hist()`, on each series in the DataFrame, resulting in one histogram per column. 133 | 134 | # COMMAND ---------- 135 | 136 | # MAGIC %md 137 | # MAGIC 138 | # MAGIC In the highly-applauded book by the author of pandas [Python for Data Analysis: Data Wrangling with pandas, NumPy, and Jupyter](https://www.amazon.com/Python-Data-Analysis-Wrangling-Jupyter-dp-109810403X/dp/109810403X), one of the take-aways is: 139 | # MAGIC 140 | # MAGIC > Create informative visualizations with matplotlib 141 | 142 | # COMMAND ---------- 143 | 144 | # MAGIC %md 145 | # MAGIC 146 | # MAGIC ## Matplotlib 147 | # MAGIC 148 | # MAGIC [Matplotlib: Visualization with Python](https://matplotlib.org/): 149 | # MAGIC 150 | # MAGIC > **Matplotlib** is a comprehensive library for creating static, animated, and interactive visualizations in Python. Matplotlib makes easy things easy and hard things possible. 151 | 152 | # COMMAND ---------- 153 | 154 | # MAGIC %md 155 | # MAGIC 156 | # MAGIC ## pandas.DataFrame.hist 157 | 158 | # COMMAND ---------- 159 | 160 | pandas_dataframe['id'].hist() 161 | 162 | # COMMAND ---------- 163 | 164 | # MAGIC %md 165 | # MAGIC 166 | # MAGIC ## Seaborn 167 | # MAGIC 168 | # MAGIC [seaborn: statistical data visualization](https://seaborn.pydata.org/): 169 | # MAGIC 170 | # MAGIC > **Seaborn** is a Python data visualization library based on [matplotlib](https://matplotlib.org/). It provides a high-level interface for drawing attractive and informative statistical graphics. 171 | 172 | # COMMAND ---------- 173 | 174 | # MAGIC %md 175 | # MAGIC 176 | # MAGIC ## Visualizations in Databricks notebooks 177 | # MAGIC 178 | # MAGIC Based on Databricks' [Visualizations in Databricks notebooks](https://docs.databricks.com/en/visualizations/index.html): 179 | # MAGIC 180 | # MAGIC * Databricks has built-in support for charts and visualizations in both Databricks SQL and in notebooks 181 | # MAGIC * To create a visualization, click `+` above the result and select Visualization 182 | # MAGIC * If you hover over the top right of a chart in the visualization editor, a Plotly toolbar appears with operations such as select, zoom, and pan 183 | # MAGIC * Click the downward pointing arrow at the right of the tab name for the following operations on a visualization: 184 | # MAGIC * Download 185 | # MAGIC * Remove 186 | # MAGIC * Duplicate 187 | # MAGIC * Rename 188 | # MAGIC * Add to dashboard 189 | # MAGIC * You can change the name of a visualization by clicking directly and editing the name in place 190 | # MAGIC * You can edit a visualization 191 | 192 | # COMMAND ---------- 193 | 194 | bikes = spark.read.csv("/databricks-datasets/bikeSharing/data-001/day.csv", header="true", inferSchema="true") 195 | display(bikes) 196 | --------------------------------------------------------------------------------