├── demo ├── uv_workflow │ ├── .python-version │ ├── src │ │ ├── uv_workflow │ │ │ ├── __init__.py │ │ │ └── main.py │ │ └── notebook.ipynb │ ├── .gitignore │ ├── uv.lock │ ├── tests │ │ └── main_test.py │ ├── README.md │ ├── pyproject.toml │ ├── databricks.yml │ └── resources │ │ └── uv_workflow.job.yml ├── pydantic_workflow │ ├── .python-version │ ├── src │ │ ├── pydantic_workflow │ │ │ ├── __init__.py │ │ │ ├── trip.py │ │ │ └── main.py │ │ └── notebook.ipynb │ ├── .gitignore │ ├── tests │ │ └── test_trip.py │ ├── resources │ │ └── pydantic_workflow.job.yml │ ├── README.md │ ├── databricks.yml │ └── pyproject.toml ├── delta-live-tables │ ├── input-data │ │ ├── 1.csv │ │ └── 2.csv │ ├── read_file.py │ ├── dlt_two.sql │ ├── main.tf │ ├── variables.tf │ ├── outputs.tf │ ├── dlt_one.sql │ ├── resources.tf │ ├── .gitignore │ ├── .terraform.lock.hcl │ ├── README.md │ └── my_streaming_table.sql ├── dlt-config-file.json ├── bestsellers │ ├── Book Ranks.sql │ ├── books.py │ └── pipeline_clone.py └── README.md ├── review_me ├── Databricks Runtime.py ├── dbfs-intro.py ├── DLT Step 1.sql ├── My Very First Notebook.py ├── Python example.py ├── CREATE TABLE IF NOT EXISTS jacek_laskowski.my_table.py └── python only please.py ├── Databricks SQL ├── Databricks SQL.py ├── Alerts.py ├── Dashboards.py ├── Agenda.py └── Queries.py ├── Databricks Asset Bundles ├── delta_live_tables_demo │ ├── src │ │ ├── delta_live_tables_demo │ │ │ ├── __init__.py │ │ │ └── main.py │ │ ├── notebook.ipynb │ │ └── dlt_pipeline.ipynb │ ├── pytest.ini │ ├── my_project │ │ ├── resources │ │ │ └── .gitkeep │ │ ├── .gitignore │ │ ├── scratch │ │ │ └── README.md │ │ ├── fixtures │ │ │ └── .gitkeep │ │ ├── README.md │ │ └── databricks.yml │ ├── scratch │ │ ├── README.md │ │ └── exploration.ipynb │ ├── resources │ │ ├── delta_live_tables_demo_pipeline.yml │ │ └── delta_live_tables_demo_job.yml │ ├── tests │ │ └── main_test.py │ ├── fixtures │ │ └── .gitkeep │ ├── requirements-dev.txt │ ├── setup.py │ ├── README.md │ └── databricks.yml ├── Tips and Tricks.py └── Job and Task Parameters.py ├── meetups ├── uv_workflow_job.png ├── poll-meetup-stationary.png ├── databricks_ml_model_versions.png ├── mlflow-pipeline-predictions.png ├── databricks_ml_registered_models.png ├── pydantic_workflow_main_task_python_wheel.png ├── Meetup_2025_01_09.sql ├── Meetup_2025_02_06.sql ├── README.md ├── Meetup_2025_01_30.sql └── MLflow on Databricks.py ├── Delta Live Tables ├── delta-live-tables-bundle │ ├── bronze_table.sql │ └── five_record_table.py ├── The Latest and Greatest meetup.sql ├── TODOs.py ├── Delta Live Tables Python.py ├── Deep Dive into DLTs.sql ├── DLT Lab.py ├── Pipeline settings.py ├── Full Refresh.sql ├── Materialization.sql ├── Agenda.sql ├── Expectations.sql ├── Storage location.sql └── Building Delta Live Tables pipelines with SQL.sql ├── Databricks Workflows ├── workflows-run-job-task.png ├── for_each_task_concurrency.png ├── for_each_task_demo │ ├── databricks.yml │ ├── src │ │ ├── Nested_Task.py │ │ └── Load_googlesheets_csv.py │ ├── resources │ │ └── for_each_task_job.yml │ └── README.md ├── Step 2. Transform.sql ├── Step 3. Build Aggregates.sql ├── 01 Conditional Workflows.py ├── Step 1. 
Load Raw Data.sql └── 02 Modular Orchestration with Run Job Task.py ├── Databricks Machine Learning ├── databricks-mlflow.png ├── databricks-mlflow-components.png ├── databricks-mlflow-model-tracking.png ├── databricks-machine-learning-lifecycle.png ├── Machine_Learning_Model_Deployment_course_completed.png └── databricks-machine-learning-model-deployment-lesson.png ├── .gitignore ├── Generative AI ├── dbdemos.py ├── Prompt Engineering.py ├── llm-rag-chatbot │ ├── _resources │ │ ├── README.py │ │ ├── NOTICE.py │ │ └── LICENSE.py │ ├── config.py │ └── 00-RAG-LLM-RAG-Introduction.py ├── AI Playground.py ├── SentenceTransformers.sql ├── Diffusion Models.sql ├── Foundation Models.py ├── Generative Pretrained Transformer.sql ├── Retrieval Augmented Generation.sql ├── Llama.py ├── Databricks Mosaic AI.sql ├── Model Serving.py └── 00 Generative AI.sql ├── Apache Spark ├── SparkSession across Python and Scala.py ├── Parameterized Queries.sql ├── Bucketing.sql ├── Parquet Connector.scala └── ANTI and SEMI joins in SQL and DataFrame API.py ├── workshops ├── Questions.py ├── Databricks Workshop Half-Day 5a.sql ├── Course Agenda 2 Days.py └── Databricks Workshop Day 3.sql ├── Development Tools └── Notebooks.py ├── README.md ├── PySpark ├── pyspark-jupyter-poetry │ ├── pyproject.toml │ ├── README.md │ └── install-pyspark.md └── PySpark.py ├── terraform ├── .terraform.lock.hcl ├── .gitignore └── pipeline.tf ├── Administration └── Databricks Administration.sql ├── Table-Valued Functions.sql ├── Delta Lake ├── Merge.sql ├── DESCRIBE HISTORY.sql ├── Delta Lake 3.1.0.sql ├── TRUNCATE TABLE in Delta Lake.sql └── Generated Columns.sql ├── Photon.py ├── Python └── Pyenv.py └── Data Visualization └── Data Visualization on Databricks.py /demo/uv_workflow/.python-version: -------------------------------------------------------------------------------- 1 | 3.10 2 | -------------------------------------------------------------------------------- /demo/pydantic_workflow/.python-version: -------------------------------------------------------------------------------- 1 | 3.12.3 2 | -------------------------------------------------------------------------------- /demo/delta-live-tables/input-data/1.csv: -------------------------------------------------------------------------------- 1 | id,name 2 | 0,zero 3 | 1,un -------------------------------------------------------------------------------- /demo/dlt-config-file.json: -------------------------------------------------------------------------------- 1 | { 2 | "project_name": "dlt_demo" 3 | } -------------------------------------------------------------------------------- /demo/uv_workflow/src/uv_workflow/__init__.py: -------------------------------------------------------------------------------- 1 | __version__ = "0.0.1" 2 | -------------------------------------------------------------------------------- /review_me/Databricks Runtime.py: -------------------------------------------------------------------------------- 1 | # Databricks notebook source 2 | 3 | -------------------------------------------------------------------------------- /demo/delta-live-tables/input-data/2.csv: -------------------------------------------------------------------------------- 1 | id,name 2 | 2,deux 3 | 3,trois 4 | 4,quatre -------------------------------------------------------------------------------- /demo/pydantic_workflow/src/pydantic_workflow/__init__.py: -------------------------------------------------------------------------------- 1 | __version__ = "0.0.1" 2 | 
-------------------------------------------------------------------------------- /Databricks SQL/Databricks SQL.py: -------------------------------------------------------------------------------- 1 | # Databricks notebook source 2 | # MAGIC %md # Databricks SQL 3 | -------------------------------------------------------------------------------- /Databricks SQL/Alerts.py: -------------------------------------------------------------------------------- 1 | # Databricks notebook source 2 | # MAGIC %md # Databricks SQL » Alerts 3 | -------------------------------------------------------------------------------- /Databricks SQL/Dashboards.py: -------------------------------------------------------------------------------- 1 | # Databricks notebook source 2 | # MAGIC %md # Databricks SQL » Dashboards 3 | -------------------------------------------------------------------------------- /Databricks Asset Bundles/delta_live_tables_demo/src/delta_live_tables_demo/__init__.py: -------------------------------------------------------------------------------- 1 | __version__ = "0.0.1" 2 | -------------------------------------------------------------------------------- /Databricks Asset Bundles/delta_live_tables_demo/pytest.ini: -------------------------------------------------------------------------------- 1 | [pytest] 2 | testpaths = tests 3 | pythonpath = src 4 | -------------------------------------------------------------------------------- /meetups/uv_workflow_job.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jaceklaskowski/learn-databricks/HEAD/meetups/uv_workflow_job.png -------------------------------------------------------------------------------- /Delta Live Tables/delta-live-tables-bundle/bronze_table.sql: -------------------------------------------------------------------------------- 1 | CREATE MATERIALIZED VIEW bronze_table 2 | AS SELECT * FROM range(5) -------------------------------------------------------------------------------- /meetups/poll-meetup-stationary.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jaceklaskowski/learn-databricks/HEAD/meetups/poll-meetup-stationary.png -------------------------------------------------------------------------------- /demo/uv_workflow/.gitignore: -------------------------------------------------------------------------------- 1 | .databricks/ 2 | build/ 3 | dist/ 4 | __pycache__/ 5 | *.egg-info 6 | .venv/ 7 | scratch/** 8 | !scratch/README.md 9 | -------------------------------------------------------------------------------- /meetups/databricks_ml_model_versions.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jaceklaskowski/learn-databricks/HEAD/meetups/databricks_ml_model_versions.png -------------------------------------------------------------------------------- /meetups/mlflow-pipeline-predictions.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jaceklaskowski/learn-databricks/HEAD/meetups/mlflow-pipeline-predictions.png -------------------------------------------------------------------------------- /demo/pydantic_workflow/.gitignore: -------------------------------------------------------------------------------- 1 | .databricks/ 2 | build/ 3 | dist/ 4 | __pycache__/ 5 | *.egg-info 6 | .venv/ 7 | scratch/** 8 | !scratch/README.md 9 | 
-------------------------------------------------------------------------------- /meetups/databricks_ml_registered_models.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jaceklaskowski/learn-databricks/HEAD/meetups/databricks_ml_registered_models.png -------------------------------------------------------------------------------- /Databricks Asset Bundles/delta_live_tables_demo/my_project/resources/.gitkeep: -------------------------------------------------------------------------------- 1 | This folder is reserved for Databricks Asset Bundles resource definitions. 2 | -------------------------------------------------------------------------------- /Databricks Workflows/workflows-run-job-task.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jaceklaskowski/learn-databricks/HEAD/Databricks Workflows/workflows-run-job-task.png -------------------------------------------------------------------------------- /demo/delta-live-tables/read_file.py: -------------------------------------------------------------------------------- 1 | # Databricks notebook source 2 | import dlt 3 | 4 | @dlt.table 5 | def five_records(): 6 | return spark.range(5) 7 | 8 | -------------------------------------------------------------------------------- /Databricks Machine Learning/databricks-mlflow.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jaceklaskowski/learn-databricks/HEAD/Databricks Machine Learning/databricks-mlflow.png -------------------------------------------------------------------------------- /Databricks Workflows/for_each_task_concurrency.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jaceklaskowski/learn-databricks/HEAD/Databricks Workflows/for_each_task_concurrency.png -------------------------------------------------------------------------------- /demo/uv_workflow/uv.lock: -------------------------------------------------------------------------------- 1 | version = 1 2 | requires-python = ">=3.10" 3 | 4 | [[package]] 5 | name = "uv-workflow" 6 | version = "1.0.0" 7 | source = { editable = "." 
} 8 | -------------------------------------------------------------------------------- /meetups/pydantic_workflow_main_task_python_wheel.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jaceklaskowski/learn-databricks/HEAD/meetups/pydantic_workflow_main_task_python_wheel.png -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .databricks/ 2 | build/ 3 | dist/ 4 | __pycache__/ 5 | *.egg-info 6 | .venv/ 7 | scratch/** 8 | !scratch/README.md 9 | .ipynb_checkpoints/ 10 | .vscode/ 11 | -------------------------------------------------------------------------------- /Databricks Machine Learning/databricks-mlflow-components.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jaceklaskowski/learn-databricks/HEAD/Databricks Machine Learning/databricks-mlflow-components.png -------------------------------------------------------------------------------- /demo/uv_workflow/tests/main_test.py: -------------------------------------------------------------------------------- 1 | from uv_workflow.main import get_taxis, get_spark 2 | 3 | 4 | def test_main(): 5 | taxis = get_taxis(get_spark()) 6 | assert taxis.count() > 5 7 | -------------------------------------------------------------------------------- /Databricks Machine Learning/databricks-mlflow-model-tracking.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jaceklaskowski/learn-databricks/HEAD/Databricks Machine Learning/databricks-mlflow-model-tracking.png -------------------------------------------------------------------------------- /Databricks Asset Bundles/delta_live_tables_demo/my_project/.gitignore: -------------------------------------------------------------------------------- 1 | 2 | .databricks/ 3 | build/ 4 | dist/ 5 | __pycache__/ 6 | *.egg-info 7 | .venv/ 8 | scratch/** 9 | !scratch/README.md 10 | -------------------------------------------------------------------------------- /Databricks Machine Learning/databricks-machine-learning-lifecycle.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jaceklaskowski/learn-databricks/HEAD/Databricks Machine Learning/databricks-machine-learning-lifecycle.png -------------------------------------------------------------------------------- /Generative AI/dbdemos.py: -------------------------------------------------------------------------------- 1 | # Databricks notebook source 2 | # MAGIC %pip install dbdemos 3 | 4 | # COMMAND ---------- 5 | 6 | import dbdemos 7 | dbdemos.install('llm-rag-chatbot', catalog='main', schema='rag_chatbot') 8 | -------------------------------------------------------------------------------- /Databricks Machine Learning/Machine_Learning_Model_Deployment_course_completed.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jaceklaskowski/learn-databricks/HEAD/Databricks Machine Learning/Machine_Learning_Model_Deployment_course_completed.png -------------------------------------------------------------------------------- /Databricks Machine Learning/databricks-machine-learning-model-deployment-lesson.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/jaceklaskowski/learn-databricks/HEAD/Databricks Machine Learning/databricks-machine-learning-model-deployment-lesson.png -------------------------------------------------------------------------------- /demo/delta-live-tables/dlt_two.sql: -------------------------------------------------------------------------------- 1 | -- Databricks notebook source 2 | CREATE OR REFRESH LIVE TABLE dlt_two 3 | COMMENT "live table dlt_two" 4 | AS 5 | SELECT * FROM live.dlt_one 6 | 7 | -- COMMAND ---------- 8 | 9 | 10 | -------------------------------------------------------------------------------- /Databricks Asset Bundles/delta_live_tables_demo/scratch/README.md: -------------------------------------------------------------------------------- 1 | # scratch 2 | 3 | This folder is reserved for personal, exploratory notebooks. 4 | By default these are not committed to Git, as 'scratch' is listed in .gitignore. 5 | -------------------------------------------------------------------------------- /Databricks Asset Bundles/delta_live_tables_demo/my_project/scratch/README.md: -------------------------------------------------------------------------------- 1 | # scratch 2 | 3 | This folder is reserved for personal, exploratory notebooks. 4 | By default these are not committed to Git, as 'scratch' is listed in .gitignore. 5 | -------------------------------------------------------------------------------- /demo/uv_workflow/README.md: -------------------------------------------------------------------------------- 1 | # uv_workflow 2 | 3 | The 'uv_workflow' project was generated by using the default-python template. 4 | 5 | > [!TIP] 6 | > 7 | > Use [Meetup_2025_01_09](../../meetups/Meetup_2025_01_09.sql) notebook for guidance. 8 | -------------------------------------------------------------------------------- /Apache Spark/SparkSession across Python and Scala.py: -------------------------------------------------------------------------------- 1 | # Databricks notebook source 2 | # MAGIC %scala 3 | # MAGIC 4 | # MAGIC spark.range(5).createTempView("kevin_view_scala") 5 | 6 | # COMMAND ---------- 7 | 8 | spark.table('kevin_view_scala') 9 | -------------------------------------------------------------------------------- /demo/delta-live-tables/main.tf: -------------------------------------------------------------------------------- 1 | terraform { 2 | required_providers { 3 | databricks = { 4 | source = "databricks/databricks" 5 | version = "1.13.0" 6 | } 7 | } 8 | required_version = ">= 1.4.0" 9 | } 10 | 11 | provider "databricks" {} 12 | -------------------------------------------------------------------------------- /demo/delta-live-tables/variables.tf: -------------------------------------------------------------------------------- 1 | variable "input_dir" { 2 | description = "The input directory for Auto Loader to load CSV files from" 3 | type = string 4 | nullable = false 5 | default = "/FileStore/jacek_laskowski/delta-live-tables-demo-input" 6 | } 7 | -------------------------------------------------------------------------------- /demo/bestsellers/Book Ranks.sql: -------------------------------------------------------------------------------- 1 | -- Databricks notebook source 2 | -- CTAS = Create Table As Select 3 | -- CTE = Common Table Expressions (WITH) 4 | CREATE LIVE TABLE book_ranks 5 | AS SELECT 6 | *, 7 | RANK() over ( 8 | PARTITION BY genre 9 | ORDER BY 10 | quantity DESC 11 | ) as book_rank 12 | FROM LIVE.books 13 | 
-------------------------------------------------------------------------------- /demo/pydantic_workflow/tests/test_trip.py: -------------------------------------------------------------------------------- 1 | from pydantic_workflow.trip import Trip 2 | 3 | import pytest 4 | 5 | 6 | def test_valid_trip(): 7 | Trip(id=10, pickup_zip=10103, dropoff_zip=10110) 8 | 9 | 10 | def test_invalid_trip(): 11 | with pytest.raises(ValueError): 12 | Trip(id=10, pickup_zip=10023, dropoff_zip=10023) -------------------------------------------------------------------------------- /Databricks Asset Bundles/delta_live_tables_demo/src/delta_live_tables_demo/main.py: -------------------------------------------------------------------------------- 1 | from pyspark.sql import SparkSession 2 | 3 | def get_taxis(): 4 | spark = SparkSession.builder.getOrCreate() 5 | return spark.read.table("samples.nyctaxi.trips") 6 | 7 | def main(): 8 | get_taxis().show(5) 9 | 10 | if __name__ == '__main__': 11 | main() 12 | -------------------------------------------------------------------------------- /demo/README.md: -------------------------------------------------------------------------------- 1 | # Demo 2 | 3 | You can find different demo in this directory: 4 | 5 | 1. [uv_workflow](./uv_workflow/) - Use [uv](https://docs.astral.sh/uv/) to manage a Python project that uses [Databricks Asset Bundles](https://docs.databricks.com/en/dev-tools/bundles/index.html) to manage a Databricks job (that uses the Python library). 6 | 1. _others_ 7 | -------------------------------------------------------------------------------- /Generative AI/Prompt Engineering.py: -------------------------------------------------------------------------------- 1 | # Databricks notebook source 2 | # MAGIC %md 3 | # MAGIC 4 | # MAGIC # Prompt Engineering for Generative AI 5 | 6 | # COMMAND ---------- 7 | 8 | # MAGIC %md 9 | # MAGIC 10 | # MAGIC ## Resources 11 | # MAGIC 12 | # MAGIC * [developers.google.com](https://developers.google.com/machine-learning/resources/prompt-eng) 13 | -------------------------------------------------------------------------------- /demo/uv_workflow/pyproject.toml: -------------------------------------------------------------------------------- 1 | [project] 2 | name = "uv-workflow" 3 | version = "1.0.0" 4 | description = "Add your description here" 5 | readme = "README.md" 6 | requires-python = ">=3.10" 7 | dependencies = [] 8 | 9 | # uv init --lib 10 | # Uses src directory for sources 11 | [build-system] 12 | requires = ["hatchling"] 13 | build-backend = "hatchling.build" -------------------------------------------------------------------------------- /workshops/Questions.py: -------------------------------------------------------------------------------- 1 | # Databricks notebook source 2 | # MAGIC %md # Questions 3 | # MAGIC 4 | # MAGIC Open questions to explore 5 | 6 | # COMMAND ---------- 7 | 8 | # MAGIC %md 9 | # MAGIC 10 | # MAGIC 1. Job Notification Templates 11 | # MAGIC 1. Auto Optimize 12 | # MAGIC 1. Auto Compact 13 | # MAGIC 1. 
Can `CommitHook`s help executing `OPTIMIZE` at write time 14 | -------------------------------------------------------------------------------- /demo/delta-live-tables/outputs.tf: -------------------------------------------------------------------------------- 1 | output "storage" { 2 | description = "Storage location" 3 | value = databricks_pipeline.this.storage 4 | } 5 | 6 | output "pipeline_id" { 7 | description = "Pipeline ID" 8 | value = databricks_pipeline.this.id 9 | } 10 | 11 | output "input_dir" { 12 | description = "Input directory to load CSV data from" 13 | value = var.input_dir 14 | } 15 | -------------------------------------------------------------------------------- /Development Tools/Notebooks.py: -------------------------------------------------------------------------------- 1 | # Databricks notebook source 2 | # MAGIC %md # Notebooks 3 | 4 | # COMMAND ---------- 5 | 6 | # MAGIC %md 7 | # MAGIC 8 | # MAGIC ## Command Palette 9 | # MAGIC 10 | # MAGIC * Read [Command palette](https://docs.databricks.com/en/notebooks/notebooks-code.html#command-palette) 11 | # MAGIC * Use `Cmd + Shift + P` on macOS (or `Ctrl + Shift + P` on Windows) 🔥 12 | -------------------------------------------------------------------------------- /Databricks SQL/Agenda.py: -------------------------------------------------------------------------------- 1 | # Databricks notebook source 2 | # MAGIC %md # Databricks SQL » Agenda 3 | 4 | # COMMAND ---------- 5 | 6 | # MAGIC %md 7 | # MAGIC 8 | # MAGIC | # | Module | 9 | # MAGIC | --- | --- | 10 | # MAGIC | 0 | [Introduction]($./Databricks SQL) | 11 | # MAGIC | 1 | [Queries]($./Queries) | 12 | # MAGIC | 2 | [Dashboards]($./Dashboards) | 13 | # MAGIC | 3 | [Alerts]($./Alerts) | 14 | -------------------------------------------------------------------------------- /demo/pydantic_workflow/resources/pydantic_workflow.job.yml: -------------------------------------------------------------------------------- 1 | resources: 2 | jobs: 3 | pydantic_workflow_job: 4 | name: pydantic_workflow_job 5 | tasks: 6 | - task_key: taxi_qc 7 | existing_cluster_id: 1128-165651-khbd6ndl 8 | notebook_task: 9 | notebook_path: ../src/taxi_qc.ipynb 10 | libraries: 11 | - whl: ../dist/*.whl 12 | -------------------------------------------------------------------------------- /Generative AI/llm-rag-chatbot/_resources/README.py: -------------------------------------------------------------------------------- 1 | # Databricks notebook source 2 | # MAGIC %md 3 | # MAGIC ## DBDemos asset 4 | # MAGIC 5 | # MAGIC The notebooks available under `_/resources` are technical resources. 6 | # MAGIC 7 | # MAGIC Do not edit these notebooks or try to run them directly. These notebooks will load data / run some setup. They are indirectly called from the main notebook (`%run ./_resources/.....`) 8 | -------------------------------------------------------------------------------- /Delta Live Tables/The Latest and Greatest meetup.sql: -------------------------------------------------------------------------------- 1 | -- Databricks notebook source 2 | -- MAGIC %md # The Latest and Greatest in Delta Live Tables 3 | 4 | -- COMMAND ---------- 5 | 6 | -- MAGIC %md 7 | -- MAGIC 8 | -- MAGIC 1. Graph (the default view) vs **List** 🔥 9 | -- MAGIC 1. **Flows** tab in the details of a live table 10 | -- MAGIC 1. 
[Validate update](https://docs.databricks.com/en/delta-live-tables/updates.html#validate-update) 11 | -------------------------------------------------------------------------------- /Delta Live Tables/delta-live-tables-bundle/five_record_table.py: -------------------------------------------------------------------------------- 1 | # Databricks notebook source 2 | # MAGIC %md 3 | # MAGIC 4 | # MAGIC Define a Delta Live Tables dataset 5 | # MAGIC 6 | # MAGIC It must return either a Spark or Koalas DataFrame. 7 | 8 | # COMMAND ---------- 9 | 10 | import dlt 11 | from pyspark.sql import DataFrame 12 | 13 | @dlt.table() 14 | def five_record_table() -> DataFrame: 15 | print('Hello world') 16 | return spark.range(5) 17 | -------------------------------------------------------------------------------- /Databricks Workflows/for_each_task_demo/databricks.yml: -------------------------------------------------------------------------------- 1 | # This is a Databricks asset bundle definition for for_each_task_demo. 2 | # See https://docs.databricks.com/dev-tools/bundles/index.html for documentation. 3 | bundle: 4 | name: for_each_task_demo 5 | 6 | include: 7 | - resources/for_each_task_job.yml 8 | 9 | variables: 10 | cluster_id: 11 | default: 0401-114149-an94vdde 12 | 13 | targets: 14 | dev: 15 | mode: development 16 | default: true 17 | -------------------------------------------------------------------------------- /Generative AI/AI Playground.py: -------------------------------------------------------------------------------- 1 | # Databricks notebook source 2 | # MAGIC %md 3 | # MAGIC # AI Playground 4 | 5 | # COMMAND ---------- 6 | 7 | # MAGIC %md 8 | # MAGIC 9 | # MAGIC [Chat with supported LLMs using AI Playground](https://docs.databricks.com/en/large-language-models/ai-playground.html): 10 | # MAGIC 11 | # MAGIC * Interact with supported LLMs 12 | # MAGIC * A chat-like environment to test, prompt, and compare LLMs 13 | # MAGIC * Available in your Databricks workspace 14 | -------------------------------------------------------------------------------- /demo/pydantic_workflow/README.md: -------------------------------------------------------------------------------- 1 | # pydantic_workflow 2 | 3 | The 'pydantic_workflow' project was generated by using the default-python template. 4 | 5 | ## Run 6 | 7 | ```bash 8 | databricks bundle deploy && \ 9 | databricks bundle run pydantic_workflow_job 10 | ``` 11 | 12 | ## Clean Up 13 | 14 | ```bash 15 | databricks bundle destroy --auto-approve 16 | ``` 17 | 18 | ## Learn More 19 | 20 | * [What are Databricks Asset Bundles?](https://docs.databricks.com/aws/en/dev-tools/bundles). 
21 | -------------------------------------------------------------------------------- /demo/delta-live-tables/dlt_one.sql: -------------------------------------------------------------------------------- 1 | -- Databricks notebook source 2 | --- Almost like CTAS 3 | CREATE OR REFRESH LIVE TABLE dlt_one 4 | ( 5 | id INTEGER COMMENT 'Identifier', 6 | auto_generated BIGINT GENERATED ALWAYS AS IDENTITY (START WITH 0 INCREMENT BY 1) COMMENT 'Auto-generated using GENERATED ALWAYS AS IDENTITY' 7 | ) 8 | COMMENT "live table dlt_one with ${jacek.pipeline.message}" 9 | AS 10 | SELECT 11 | INT((rand() * ID) * 100) AS id 12 | FROM VALUES 13 | (1), 14 | (2), 15 | (3) t(id) 16 | -------------------------------------------------------------------------------- /Databricks Asset Bundles/delta_live_tables_demo/resources/delta_live_tables_demo_pipeline.yml: -------------------------------------------------------------------------------- 1 | # The main pipeline for delta_live_tables_demo 2 | resources: 3 | pipelines: 4 | delta_live_tables_demo_pipeline: 5 | name: delta_live_tables_demo_pipeline 6 | target: delta_live_tables_demo_${bundle.environment} 7 | libraries: 8 | - notebook: 9 | path: ../src/dlt_pipeline.ipynb 10 | meetup_pipeline: 11 | name: dlt_pipeline_v${bundle.git.commit} 12 | libraries: 13 | - notebook: 14 | path: ../src/dlt_pipeline.ipynb 15 | -------------------------------------------------------------------------------- /Databricks Asset Bundles/delta_live_tables_demo/tests/main_test.py: -------------------------------------------------------------------------------- 1 | from databricks.connect import DatabricksSession 2 | from pyspark.sql import SparkSession 3 | from delta_live_tables_demo import main 4 | 5 | # Create a new Databricks Connect session. If this fails, 6 | # check that you have configured Databricks Connect correctly. 7 | # See https://docs.databricks.com/dev-tools/databricks-connect.html. 8 | 9 | SparkSession.builder = DatabricksSession.builder 10 | SparkSession.builder.getOrCreate() 11 | 12 | def test_main(): 13 | taxis = main.get_taxis() 14 | assert taxis.count() > 5 15 | -------------------------------------------------------------------------------- /Generative AI/SentenceTransformers.sql: -------------------------------------------------------------------------------- 1 | -- Databricks notebook source 2 | -- MAGIC %md # SentenceTransformers 3 | 4 | -- COMMAND ---------- 5 | 6 | -- MAGIC %md 7 | -- MAGIC 8 | -- MAGIC [SentenceTransformers](https://www.sbert.net/) 9 | -- MAGIC 10 | -- MAGIC Sentence, text and image embeddings 11 | -- MAGIC 12 | -- MAGIC Compute sentence / text embeddings for more than 100 languages. 13 | -- MAGIC 14 | -- MAGIC Embeddings can then be compared e.g. with cosine-similarity to find sentences with a similar meaning. This can be useful for semantic textual similar, semantic search, or paraphrase mining. 15 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Learn Databricks Lakehouse Platform 2 | 3 | This repository contains notebooks to learn [Databricks Lakehouse Platform](https://www.databricks.com/product/data-lakehouse), featuring but not limited to: 4 | 5 | 1. [Delta Live Tables](Delta%20Live%20Tables/) 6 | 1. [Workflow Jobs](Workflow%20Jobs/) 7 | 1. [Unity Catalog](Unity%20Catalog/) 8 | 9 | ## Databricks Workshop 10 | 11 | Loose notes about the topics of future Databricks workshops. 12 | 13 | 1. 
Developer Settings (under `/settings/user/developer/`) 14 | * review available options 15 | * what are the other URLs? (under `/settings/user/`) 16 | 17 | -------------------------------------------------------------------------------- /Databricks Workflows/for_each_task_demo/src/Nested_Task.py: -------------------------------------------------------------------------------- 1 | # Databricks notebook source 2 | # MAGIC %md 3 | # MAGIC 4 | # MAGIC # Nested Task 5 | 6 | # COMMAND ---------- 7 | 8 | # MAGIC %md 9 | # MAGIC 10 | # MAGIC This notebook is a **nested task** used as part of a For each task in the For each Task demo. 11 | # MAGIC 12 | # MAGIC 1. The nested task is the task to run for each iteration of the For each task. 13 | # MAGIC 1. Can be one of the standard Databricks Jobs task types. 14 | # MAGIC 1. Cannot be another For each task. 15 | 16 | # COMMAND ---------- 17 | 18 | single_csv_line = dbutils.widgets.get("single_csv_line") 19 | print(single_csv_line) 20 | -------------------------------------------------------------------------------- /Generative AI/Diffusion Models.sql: -------------------------------------------------------------------------------- 1 | -- Databricks notebook source 2 | -- MAGIC %md 3 | -- MAGIC 4 | -- MAGIC # Diffusion Models 5 | -- MAGIC 6 | -- MAGIC * [Wikipedia](https://en.wikipedia.org/wiki/Diffusion_model) 7 | -- MAGIC * Models like [DALL·E](https://openai.com/dall-e-3) to generate images (**image generation**) 8 | -- MAGIC * fine-tuned to create objects with specific desired properties 9 | -- MAGIC * training phase 10 | 11 | -- COMMAND ---------- 12 | 13 | -- MAGIC %md 14 | -- MAGIC 15 | -- MAGIC ## DALL-E 3 16 | -- MAGIC 17 | -- MAGIC * [Home page](https://openai.com/dall-e-3) 18 | -- MAGIC * [Research paper](https://cdn.openai.com/papers/dall-e-3.pdf) 19 | -------------------------------------------------------------------------------- /Delta Live Tables/TODOs.py: -------------------------------------------------------------------------------- 1 | # Databricks notebook source 2 | # MAGIC %md # Open Topics / TODOs 3 | # MAGIC 4 | # MAGIC The following is a list of things to explore in more details. 5 | 6 | # COMMAND ---------- 7 | 8 | # MAGIC %md 9 | # MAGIC 10 | # MAGIC 1. CDF Demo 11 | # MAGIC 2. STREAMING clause 12 | # MAGIC 3. Continuous execution pipeline mode 13 | 14 | # COMMAND ---------- 15 | 16 | # MAGIC %md 17 | # MAGIC 18 | # MAGIC 1. What's the diff between abfss vs adls? 19 | # MAGIC 1. Different uses of [libraries](https://docs.databricks.com/api/azure/workspace/pipelines/create) in a DLT pipeline (esp. 
`file` and `jar` JSON fields) 20 | 21 | # COMMAND ---------- 22 | 23 | 24 | -------------------------------------------------------------------------------- /PySpark/pyspark-jupyter-poetry/pyproject.toml: -------------------------------------------------------------------------------- 1 | [tool.poetry] 2 | name = "jupyter-spark" 3 | version = "0.1.0" 4 | description = "" 5 | authors = ["Jacek Laskowski "] 6 | readme = "README.md" 7 | 8 | [tool.poetry.dependencies] 9 | python = "^3.12" 10 | pyspark = { version = "^3.5.2", extras = [ "connect,sql,ml" ] } 11 | # Required for Python 3.12 12 | # Nothing to do with PySpark 13 | # https://stackoverflow.com/q/77233855/1305344 14 | setuptools = "^74.1.2" 15 | jupyterlab = "^4.2.5" 16 | faker = "^28.4.1" 17 | pandas = "^2.2.2" 18 | pyarrow = "^17.0.0" 19 | 20 | [build-system] 21 | requires = ["poetry-core"] 22 | build-backend = "poetry.core.masonry.api" 23 | -------------------------------------------------------------------------------- /demo/pydantic_workflow/src/pydantic_workflow/trip.py: -------------------------------------------------------------------------------- 1 | from typing_extensions import Self 2 | 3 | from pydantic import BaseModel, model_validator 4 | 5 | from datetime import datetime 6 | 7 | 8 | class Trip(BaseModel): 9 | tpep_pickup_datetime: datetime = datetime.now() 10 | tpep_dropoff_datetime: datetime = datetime.now() 11 | trip_distance: float = -1.0 12 | fare_amount: float = -1.0 13 | pickup_zip: int = -1 14 | dropoff_zip: int = -1 15 | 16 | @model_validator(mode='after') 17 | def enforce_different_zips(self) -> Self: 18 | if self.pickup_zip == self.dropoff_zip: 19 | raise ValueError('pickup_zip and dropoff_zip must be different') 20 | return self 21 | -------------------------------------------------------------------------------- /Generative AI/Foundation Models.py: -------------------------------------------------------------------------------- 1 | # Databricks notebook source 2 | # MAGIC %md 3 | # MAGIC # Foundation Models 4 | 5 | # COMMAND ---------- 6 | 7 | # MAGIC %md 8 | # MAGIC 9 | # MAGIC ## Databricks Foundation Model APIs 10 | # MAGIC 11 | # MAGIC [Databricks Foundation Model APIs](https://docs.databricks.com/en/machine-learning/foundation-models/index.html): 12 | # MAGIC 13 | # MAGIC * Includes requirements for use, supported models, and limitations 14 | 15 | # COMMAND ---------- 16 | 17 | # MAGIC %md 18 | # MAGIC 19 | # MAGIC [Get started querying LLMs on Databricks](https://docs.databricks.com/en/large-language-models/llm-serving-intro.html): 20 | # MAGIC 21 | # MAGIC * Use Foundation Model APIs to serve and query LLMs on Databricks 22 | -------------------------------------------------------------------------------- /demo/uv_workflow/src/uv_workflow/main.py: -------------------------------------------------------------------------------- 1 | from pyspark.sql import SparkSession, DataFrame 2 | 3 | def get_taxis(spark: SparkSession) -> DataFrame: 4 | return spark.read.table("samples.nyctaxi.trips") 5 | 6 | 7 | # Create a new Databricks Connect session. If this fails, 8 | # check that you have configured Databricks Connect correctly. 9 | # See https://docs.databricks.com/dev-tools/databricks-connect.html. 
10 | def get_spark() -> SparkSession: 11 | try: 12 | from databricks.connect import DatabricksSession 13 | return DatabricksSession.builder.getOrCreate() 14 | except ImportError: 15 | return SparkSession.builder.getOrCreate() 16 | 17 | def main(): 18 | get_taxis(get_spark()).show(5) 19 | 20 | if __name__ == '__main__': 21 | main() 22 | -------------------------------------------------------------------------------- /review_me/dbfs-intro.py: -------------------------------------------------------------------------------- 1 | # Databricks notebook source 2 | # MAGIC %py 3 | # MAGIC 4 | # MAGIC # this is NOT a Spark code (pyspark) 5 | # MAGIC # We're about to load people dataset 6 | # MAGIC spark.read.format('csv').load('/people.csv') 7 | 8 | # COMMAND ---------- 9 | 10 | # MAGIC %fs ls dbfs:/FileStore/books.csv 11 | 12 | # COMMAND ---------- 13 | 14 | # MAGIC %md ## cat 15 | 16 | # COMMAND ---------- 17 | 18 | # MAGIC %fs cat dbfs:/FileStore/books.csv 19 | 20 | # COMMAND ---------- 21 | 22 | dbutils.fs.head('dbfs:/FileStore/books.csv') 23 | 24 | # COMMAND ---------- 25 | 26 | # MAGIC %fs head dbfs:/FileStore/books.csv 27 | 28 | # COMMAND ---------- 29 | 30 | dbutils.help() 31 | 32 | # COMMAND ---------- 33 | 34 | # MAGIC %sh ls /dbfs/FileStore 35 | -------------------------------------------------------------------------------- /terraform/.terraform.lock.hcl: -------------------------------------------------------------------------------- 1 | # This file is maintained automatically by "terraform init". 2 | # Manual edits may be lost in future updates. 3 | 4 | provider "registry.terraform.io/databricks/databricks" { 5 | version = "1.19.0" 6 | hashes = [ 7 | "h1:uk8gR88qcyVvkvDoXTHkTnT8g+S7QgvV3w1H7osVLMU=", 8 | "zh:1e2bbfd4af2cf0369a51baea67d5884e87f180ea56542aa70a470022bffed7f9", 9 | "zh:4057f818461060bb85bf2282d88a0caccc16bee89f8da471743282bfd9dffa6e", 10 | "zh:50c24b1c3744861262d23112b8a8f74902b43c8c406f79cd3ada84452daaeb79", 11 | "zh:91047db0b13cb849424eeb5050c86510005569c38df52b9794363cc82f18d407", 12 | "zh:9d22bbc77f3b790d3e4732406a1982bd0654debdabd317f81dbe38a860fc174d", 13 | "zh:d4587098bc487ea2c4161a4b7aee94b7bf8df2cf50b6ffd44047a0b4663064bf", 14 | ] 15 | } 16 | -------------------------------------------------------------------------------- /demo/pydantic_workflow/src/pydantic_workflow/main.py: -------------------------------------------------------------------------------- 1 | from pyspark.sql import SparkSession, DataFrame 2 | 3 | 4 | def get_taxis(spark: SparkSession) -> DataFrame: 5 | return spark.read.table("samples.nyctaxi.trips") 6 | 7 | 8 | # Create a new Databricks Connect session. If this fails, 9 | # check that you have configured Databricks Connect correctly. 10 | # See https://docs.databricks.com/dev-tools/databricks-connect.html. 11 | def get_spark() -> SparkSession: 12 | try: 13 | from databricks.connect import DatabricksSession 14 | 15 | return DatabricksSession.builder.getOrCreate() 16 | except ImportError: 17 | return SparkSession.builder.getOrCreate() 18 | 19 | 20 | def main(): 21 | get_taxis(get_spark()).show(5) 22 | 23 | 24 | if __name__ == "__main__": 25 | main() 26 | -------------------------------------------------------------------------------- /demo/uv_workflow/databricks.yml: -------------------------------------------------------------------------------- 1 | # This is a Databricks asset bundle definition for uv_workflow. 2 | # See https://docs.databricks.com/dev-tools/bundles/index.html for documentation. 
3 | bundle: 4 | name: uv_workflow 5 | 6 | include: 7 | - resources/*.yml 8 | 9 | workspace: 10 | host: https://curriculum-dev.cloud.databricks.com 11 | 12 | artifacts: 13 | uv_built_wheel: 14 | type: whl 15 | build: uv build --wheel 16 | path: . 17 | 18 | targets: 19 | dev: 20 | # The default target uses 'mode: development' to create a development copy. 21 | # - Deployed resources get prefixed with '[dev my_user_name]' 22 | # - Any job schedules and triggers are paused by default. 23 | # See also https://docs.databricks.com/dev-tools/bundles/deployment-modes.html. 24 | mode: development 25 | default: true 26 | -------------------------------------------------------------------------------- /Databricks Asset Bundles/delta_live_tables_demo/fixtures/.gitkeep: -------------------------------------------------------------------------------- 1 | # Fixtures 2 | 3 | This folder is reserved for fixtures, such as CSV files. 4 | 5 | Below is an example of how to load fixtures as a data frame: 6 | 7 | ``` 8 | import pandas as pd 9 | import os 10 | 11 | def get_absolute_path(*relative_parts): 12 | if 'dbutils' in globals(): 13 | base_dir = os.path.dirname(dbutils.notebook.entry_point.getDbutils().notebook().getContext().notebookPath().get()) # type: ignore 14 | path = os.path.normpath(os.path.join(base_dir, *relative_parts)) 15 | return path if path.startswith("/Workspace") else "/Workspace" + path 16 | else: 17 | return os.path.join(*relative_parts) 18 | 19 | csv_file = get_absolute_path("..", "fixtures", "mycsv.csv") 20 | df = pd.read_csv(csv_file) 21 | display(df) 22 | ``` 23 | -------------------------------------------------------------------------------- /demo/pydantic_workflow/databricks.yml: -------------------------------------------------------------------------------- 1 | # This is a Databricks asset bundle definition for pydantic_workflow. 2 | # See https://docs.databricks.com/dev-tools/bundles/index.html for documentation. 3 | bundle: 4 | name: pydantic_workflow 5 | 6 | include: 7 | - resources/*.yml 8 | 9 | artifacts: 10 | pydantic_workflow_wheel: 11 | type: whl 12 | build: uv build --wheel 13 | path: . 14 | 15 | workspace: 16 | host: https://curriculum-dev.cloud.databricks.com 17 | 18 | targets: 19 | dev: 20 | # The default target uses 'mode: development' to create a development copy. 21 | # - Deployed resources get prefixed with '[dev my_user_name]' 22 | # - Any job schedules and triggers are paused by default. 23 | # See also https://docs.databricks.com/dev-tools/bundles/deployment-modes.html. 24 | mode: development 25 | -------------------------------------------------------------------------------- /Databricks Asset Bundles/delta_live_tables_demo/my_project/fixtures/.gitkeep: -------------------------------------------------------------------------------- 1 | # Fixtures 2 | 3 | This folder is reserved for fixtures, such as CSV files. 
4 | 5 | Below is an example of how to load fixtures as a data frame: 6 | 7 | ``` 8 | import pandas as pd 9 | import os 10 | 11 | def get_absolute_path(*relative_parts): 12 | if 'dbutils' in globals(): 13 | base_dir = os.path.dirname(dbutils.notebook.entry_point.getDbutils().notebook().getContext().notebookPath().get()) # type: ignore 14 | path = os.path.normpath(os.path.join(base_dir, *relative_parts)) 15 | return path if path.startswith("/Workspace") else "/Workspace" + path 16 | else: 17 | return os.path.join(*relative_parts) 18 | 19 | csv_file = get_absolute_path("..", "fixtures", "mycsv.csv") 20 | df = pd.read_csv(csv_file) 21 | display(df) 22 | ``` 23 | -------------------------------------------------------------------------------- /Generative AI/Generative Pretrained Transformer.sql: -------------------------------------------------------------------------------- 1 | -- Databricks notebook source 2 | -- MAGIC %md # Generative Pretrained Transformer (GPT) 3 | 4 | -- COMMAND ---------- 5 | 6 | -- MAGIC %md ## GPT-4 7 | 8 | -- COMMAND ---------- 9 | 10 | -- MAGIC %md 11 | -- MAGIC 12 | -- MAGIC [GPT-4 API general availability and deprecation of older models in the Completions API](https://openai.com/blog/gpt-4-api-general-availability): 13 | -- MAGIC 14 | -- MAGIC * GPT-4 API is now available to all paying OpenAI API customers 15 | -- MAGIC * GPT-3.5 Turbo, DALL·E and Whisper APIs are also GA 16 | -- MAGIC * a deprecation plan for older models of the Completions API 17 | -- MAGIC 18 | -- MAGIC > **Note** 19 | -- MAGIC > 20 | -- MAGIC > The OpenAI API does not provide an isolated environment therefore is likely not suitable for enterprise production applications. 21 | -- MAGIC 22 | -------------------------------------------------------------------------------- /demo/bestsellers/books.py: -------------------------------------------------------------------------------- 1 | # Databricks notebook source 2 | # MAGIC %md # Exercise: Finding 1st and 2nd Bestsellers Per Genre 3 | 4 | # COMMAND ---------- 5 | 6 | # MAGIC %md 7 | # MAGIC 8 | # MAGIC This is a DLT pipeline for [Exercise: Finding 1st and 2nd Bestsellers Per Genre](https://jaceklaskowski.github.io/spark-workshop/exercises/sql/Finding-1st-and-2nd-Bestsellers-Per-Genre.html). 9 | 10 | # COMMAND ---------- 11 | 12 | # MAGIC %md 13 | # MAGIC 14 | # MAGIC 1. 
Create a live table (with a raw data) = `books` table 15 | 16 | # COMMAND ---------- 17 | 18 | source_path = 'dbfs:/FileStore/books.csv' 19 | 20 | # COMMAND ---------- 21 | 22 | import dlt 23 | from pyspark.sql import DataFrame 24 | 25 | @dlt.table 26 | def books() -> DataFrame: 27 | return spark \ 28 | .read \ 29 | .option('header', True) \ 30 | .option('inferSchema', True) \ 31 | .csv(source_path) 32 | -------------------------------------------------------------------------------- /Databricks Workflows/for_each_task_demo/resources/for_each_task_job.yml: -------------------------------------------------------------------------------- 1 | resources: 2 | jobs: 3 | for_each_task_demo_job: 4 | name: For Each Task Demo Job 5 | tasks: 6 | - task_key: Load_googlesheets_csv 7 | notebook_task: 8 | notebook_path: ../src/Load_googlesheets_csv.py 9 | existing_cluster_id: ${var.cluster_id} 10 | - task_key: for_each_task 11 | depends_on: 12 | - task_key: Load_googlesheets_csv 13 | for_each_task: 14 | inputs: "{{tasks.Load_googlesheets_csv.values.gsheets}}" 15 | concurrency: 100 16 | task: 17 | task_key: for_each_task_iteration 18 | notebook_task: 19 | notebook_path: ../src/Nested_Task.py 20 | base_parameters: 21 | single_csv_line: "{{input}}" 22 | existing_cluster_id: ${var.cluster_id} 23 | -------------------------------------------------------------------------------- /PySpark/pyspark-jupyter-poetry/README.md: -------------------------------------------------------------------------------- 1 | # PySpark with Jupyter 2 | 3 | This project shows how to run Apache Spark (PySpark) with Jupyter. 4 | 5 | Run PySpark as follows: 6 | 7 | ```bash 8 | poetry run pyspark 9 | ``` 10 | 11 | Start Spark Connect server. 12 | 13 | ```bash 14 | ./sbin/start-connect-server.sh 15 | ``` 16 | 17 | Review the logs of this Spark Connect server. 18 | 19 | ```bash 20 | tail -f /Users/jacek/dev/oss/spark/logs/spark-jacek-org.apache.spark.sql.connect.service.SparkConnectServer-1-Jaceks-Mac-mini.local.out 21 | ``` 22 | 23 | At the end of the logs, you should see the following INFO message that says the URL of this Spark Connect instance. 24 | 25 | ```text 26 | ... 
27 | [main] INFO org.apache.spark.sql.connect.service.SparkConnectServer:60 - Spark Connect server started at: 0:0:0:0:0:0:0:0:15002 28 | ``` 29 | 30 | Run PySpark within Jupyter as follows: 31 | 32 | ```bash 33 | poetry run jupyter lab 34 | ``` 35 | -------------------------------------------------------------------------------- /Administration/Databricks Administration.sql: -------------------------------------------------------------------------------- 1 | -- Databricks notebook source 2 | -- MAGIC %md # Databricks Administration 3 | -- MAGIC 4 | -- MAGIC Commands and tricks to manage Databricks clusters 5 | 6 | -- COMMAND ---------- 7 | 8 | -- MAGIC %scala 9 | -- MAGIC 10 | -- MAGIC println(s""" 11 | -- MAGIC |Spark version: ${sc.version} 12 | -- MAGIC |runtime_commit: ${org.apache.spark.BuildInfo.gitHash} 13 | -- MAGIC |universe_commit: ${com.databricks.BuildInfo.gitHash} 14 | -- MAGIC """.stripMargin) 15 | 16 | -- COMMAND ---------- 17 | 18 | -- MAGIC %py 19 | -- MAGIC 20 | -- MAGIC import os 21 | -- MAGIC os.environ 22 | 23 | -- COMMAND ---------- 24 | 25 | -- MAGIC %sh ls /databricks/spark 26 | 27 | -- COMMAND ---------- 28 | 29 | -- MAGIC %sh cat /databricks/spark/VERSION 30 | 31 | -- COMMAND ---------- 32 | 33 | -- MAGIC %sh ls /databricks/spark/conf 34 | 35 | -- COMMAND ---------- 36 | 37 | -- MAGIC %sh cat /databricks/spark/conf/spark-env.sh 38 | -------------------------------------------------------------------------------- /demo/delta-live-tables/resources.tf: -------------------------------------------------------------------------------- 1 | # https://registry.terraform.io/providers/databricks/databricks/latest/docs/resources/dbfs_file 2 | # Deploy a single file for the demo to have something to consume 3 | resource "databricks_dbfs_file" "this" { 4 | source = "${path.module}/input-data/1.csv" 5 | path = "${var.input_dir}/1.csv" 6 | } 7 | 8 | # https://registry.terraform.io/providers/databricks/databricks/latest/docs/resources/repo 9 | resource "databricks_repo" "learn_databricks" { 10 | path = "/Repos/jacek@japila.pl/delta-live-tables-demo" 11 | url = "https://github.com/jaceklaskowski/learn-databricks" 12 | } 13 | 14 | resource "databricks_pipeline" "this" { 15 | name = "EXPECT Clause Demo" 16 | development = true 17 | library { 18 | notebook { 19 | path = "${databricks_repo.learn_databricks.path}/demo/delta-live-tables/my_streaming_table" 20 | } 21 | } 22 | configuration = { 23 | cloud_files_input_path = var.input_dir 24 | } 25 | } 26 | -------------------------------------------------------------------------------- /demo/pydantic_workflow/pyproject.toml: -------------------------------------------------------------------------------- 1 | [project] 2 | name = "pydantic-workflow" 3 | version = "0.1.0" 4 | description = "Add your description here" 5 | readme = "README.md" 6 | # Databricks Runtime 16.2 7 | # https://docs.databricks.com/aws/en/release-notes/runtime/16.2#system-environment 8 | requires-python = "==3.12.3" 9 | dependencies = [ 10 | "pydantic>=2.10.6", 11 | ] 12 | 13 | # A command definition (entry point) 14 | # uv run pydantic_workflow 15 | # https://docs.astral.sh/uv/concepts/projects/init/#packaged-applications 16 | [project.scripts] 17 | pydantic_workflow = "pydantic_workflow.main:main" 18 | 19 | # uv run --dev ... 
20 | # https://docs.astral.sh/uv/concepts/projects/dependencies/#dependency-groups 21 | [dependency-groups] 22 | dev = [ 23 | "pyspark>=3.5.4", 24 | "pytest>=8.3.4", 25 | ] 26 | 27 | # https://docs.astral.sh/uv/concepts/projects/init/#packaged-applications 28 | [build-system] 29 | requires = ["hatchling"] 30 | build-backend = "hatchling.build" -------------------------------------------------------------------------------- /terraform/.gitignore: -------------------------------------------------------------------------------- 1 | # Local .terraform directories 2 | **/.terraform/* 3 | 4 | # .tfstate files 5 | *.tfstate 6 | *.tfstate.* 7 | 8 | # Crash log files 9 | crash.log 10 | crash.*.log 11 | 12 | # Exclude all .tfvars files, which are likely to contain sensitive data, such as 13 | # password, private keys, and other secrets. These should not be part of version 14 | # control as they are data points which are potentially sensitive and subject 15 | # to change depending on the environment. 16 | *.tfvars 17 | *.tfvars.json 18 | 19 | # Ignore override files as they are usually used to override resources locally and so 20 | # are not checked in 21 | override.tf 22 | override.tf.json 23 | *_override.tf 24 | *_override.tf.json 25 | 26 | # Include override files you do wish to add to version control using negated pattern 27 | # !example_override.tf 28 | 29 | # Include tfplan files to ignore the plan output of command: terraform plan -out=tfplan 30 | # example: *tfplan* 31 | 32 | # Ignore CLI configuration files 33 | .terraformrc 34 | terraform.rc 35 | -------------------------------------------------------------------------------- /Databricks Asset Bundles/Tips and Tricks.py: -------------------------------------------------------------------------------- 1 | # Databricks notebook source 2 | # MAGIC %md 3 | # MAGIC 4 | # MAGIC # Tips and Tricks 5 | 6 | # COMMAND ---------- 7 | 8 | # MAGIC %md 9 | # MAGIC 10 | # MAGIC ## List Resources 11 | # MAGIC 12 | # MAGIC There's no command line option to list the resources managed in a DAB project. 13 | # MAGIC 14 | # MAGIC Use [jq](https://jqlang.github.io/jq/) and [keys](https://jqlang.github.io/jq/manual/#keys-keys_unsorted). 15 | # MAGIC 16 | # MAGIC [How to get key names from JSON using jq](https://stackoverflow.com/q/23118341/1305344) 17 | 18 | # COMMAND ---------- 19 | 20 | # MAGIC %md 21 | # MAGIC 22 | # MAGIC ``` console 23 | # MAGIC $ databricks bundle validate --output json | jq '.resources | keys' 24 | # MAGIC [ 25 | # MAGIC "jobs" 26 | # MAGIC ] 27 | # MAGIC ``` 28 | 29 | # COMMAND ---------- 30 | 31 | # MAGIC %md 32 | # MAGIC 33 | # MAGIC ``` console 34 | # MAGIC $ databricks bundle validate --output json | jq '.resources.jobs | keys' 35 | # MAGIC [ 36 | # MAGIC "my_job" 37 | # MAGIC ] 38 | # MAGIC ``` 39 | -------------------------------------------------------------------------------- /demo/delta-live-tables/.gitignore: -------------------------------------------------------------------------------- 1 | # Local .terraform directories 2 | **/.terraform/* 3 | 4 | # .tfstate files 5 | *.tfstate 6 | *.tfstate.* 7 | 8 | # Crash log files 9 | crash.log 10 | crash.*.log 11 | 12 | # Exclude all .tfvars files, which are likely to contain sensitive data, such as 13 | # password, private keys, and other secrets. These should not be part of version 14 | # control as they are data points which are potentially sensitive and subject 15 | # to change depending on the environment. 
16 | *.tfvars 17 | *.tfvars.json 18 | 19 | # Ignore override files as they are usually used to override resources locally and so 20 | # are not checked in 21 | override.tf 22 | override.tf.json 23 | *_override.tf 24 | *_override.tf.json 25 | 26 | # Include override files you do wish to add to version control using negated pattern 27 | # !example_override.tf 28 | 29 | # Include tfplan files to ignore the plan output of command: terraform plan -out=tfplan 30 | # example: *tfplan* 31 | 32 | # Ignore CLI configuration files 33 | .terraformrc 34 | terraform.rc 35 | -------------------------------------------------------------------------------- /Databricks Asset Bundles/delta_live_tables_demo/requirements-dev.txt: -------------------------------------------------------------------------------- 1 | ## requirements-dev.txt: dependencies for local development. 2 | ## 3 | ## For defining dependencies used by jobs in Databricks Workflows, see 4 | ## https://docs.databricks.com/dev-tools/bundles/library-dependencies.html 5 | 6 | ## pytest is the default package used for testing 7 | pytest 8 | 9 | ## databricks-connect can be used to run parts of this project locally. 10 | ## See https://docs.databricks.com/dev-tools/databricks-connect.html. 11 | ## 12 | ## databricks-connect is automatically installed if you're using Databricks 13 | ## extension for Visual Studio Code 14 | ## (https://docs.databricks.com/dev-tools/vscode-ext/dev-tasks/databricks-connect.html). 15 | ## 16 | ## To manually install databricks-connect, either follow the instructions 17 | ## at https://docs.databricks.com/dev-tools/databricks-connect.html 18 | ## to install the package system-wide. Or uncomment the line below to install a 19 | ## version of db-connect that corresponds to the Databricks Runtime version used 20 | ## for this project. 21 | # 22 | # databricks-connect>=13.3,<13.4 23 | -------------------------------------------------------------------------------- /Table-Valued Functions.sql: -------------------------------------------------------------------------------- 1 | -- Databricks notebook source 2 | -- MAGIC %md # Table-Valued Functions 3 | -- MAGIC 4 | -- MAGIC [The Internals of Spark SQL](https://books.japila.pl/spark-sql-internals/table-valued-functions/) 5 | 6 | -- COMMAND ---------- 7 | 8 | -- MAGIC %python 9 | -- MAGIC 10 | -- MAGIC import os 11 | -- MAGIC print('DATABRICKS_RUNTIME_VERSION:', os.environ.get('DATABRICKS_RUNTIME_VERSION', '(undefined)')) 12 | 13 | -- COMMAND ---------- 14 | 15 | -- MAGIC %scala 16 | -- MAGIC 17 | -- MAGIC import org.apache.spark.sql.catalyst.analysis.TableFunctionRegistry 18 | -- MAGIC display(TableFunctionRegistry.builtin.listFunction.map(_.funcName).sorted.toDF("Table-Valued Function")) 19 | 20 | -- COMMAND ---------- 21 | 22 | -- MAGIC %fs mkdirs /tmp/jacek-laskowski 23 | 24 | -- COMMAND ---------- 25 | 26 | select * from read_files("/tmp/jacek-laskowski") 27 | 28 | -- COMMAND ---------- 29 | 30 | -- MAGIC %md 31 | -- MAGIC 32 | -- MAGIC ## Databricks TVFs 33 | -- MAGIC 34 | -- MAGIC [Alphabetical list of built-in functions](https://docs.databricks.com/en/sql/language-manual/sql-ref-functions-builtin-alpha.html) 35 | -------------------------------------------------------------------------------- /demo/delta-live-tables/.terraform.lock.hcl: -------------------------------------------------------------------------------- 1 | # This file is maintained automatically by "terraform init". 2 | # Manual edits may be lost in future updates. 
3 | 4 | provider "registry.terraform.io/databricks/databricks" { 5 | version = "1.13.0" 6 | constraints = "1.13.0" 7 | hashes = [ 8 | "h1:ga1T48DAVVP1ifvG2261xDYZ9d6zBlJc8OlkaTCHlyU=", 9 | "zh:1f043e67b45749a0c58638b62b5859e367931e72f3b91c8e12151fe6222ab672", 10 | "zh:32db31687e501c55412c97b998bfd853f21ef302a2b562abefc9bc2a3bbdabf2", 11 | "zh:7d092ac4a1e079c482ecbe541518dff756adcf7f80cac9f7e8152ee93673bcae", 12 | "zh:90a1700cbe727597c2e1fc7f27b1e41531a16dbbfad3fd7e1a31de270b4e80d5", 13 | "zh:9338406075c45eb732a55927479c114759b6fb24a819e8c744317b57ec04af41", 14 | "zh:b17faafdcc8e69d890037683538eb3a8d92279efd2bc1c9a4b3e1351981218f5", 15 | "zh:e0c603a36facc6724ef512a946e5793fb2e021bbecc68ae7993a4f5f6bcd4d92", 16 | "zh:ec5b3ab15be55e1da39c04afe02ab6b9b37f0c9135284c6df22358dc7d253316", 17 | "zh:ef236b53004957e6f3ef19892ce046d90cddc260cbf54acc14ea736d7f365f61", 18 | "zh:f841ea8f17d65dd8a51d6ff7728f17ccb2aaa04e43e6baa65235cc8960b77a4f", 19 | ] 20 | } 21 | -------------------------------------------------------------------------------- /review_me/DLT Step 1.sql: -------------------------------------------------------------------------------- 1 | -- Databricks notebook source 2 | CREATE LIVE TABLE five_row_table 3 | AS SELECT * FROM range(0, 5) 4 | 5 | -- COMMAND ---------- 6 | 7 | CREATE LIVE TABLE all_rows_multiplied_by_5 8 | AS SELECT id * 5 id FROM live.five_row_table 9 | 10 | -- COMMAND ---------- 11 | 12 | -- MAGIC %py 13 | -- MAGIC 14 | -- MAGIC # A regular PySpark data loading pattern 15 | -- MAGIC # dataframe = spark.read.format('csv').option('header', True).load('dbfs:/FileStore/books.csv') 16 | -- MAGIC # display(dataframe) 17 | -- MAGIC 18 | -- MAGIC # What am I supposed to do with the two below 19 | -- MAGIC # to create a DLT live table in Python? 20 | -- MAGIC 21 | -- MAGIC # @dlt.table Decorator 22 | -- MAGIC # The Python table and view functions must return a DataFrame 23 | -- MAGIC 24 | -- MAGIC from pyspark.sql import DataFrame 25 | -- MAGIC import dlt 26 | -- MAGIC 27 | -- MAGIC # decorators beg for methods 28 | -- MAGIC 29 | -- MAGIC # A DLT data loading pattern 30 | -- MAGIC 31 | -- MAGIC @dlt.table 32 | -- MAGIC def python_in_sql() -> DataFrame: 33 | -- MAGIC return spark.read.format('csv').option('header', True).load('dbfs:/FileStore/books.csv') 34 | -------------------------------------------------------------------------------- /Databricks Workflows/Step 2. Transform.sql: -------------------------------------------------------------------------------- 1 | -- Databricks notebook source 2 | -- MAGIC %md # Transform 3 | 4 | -- COMMAND ---------- 5 | 6 | -- MAGIC %python 7 | -- MAGIC 8 | -- MAGIC # Creates a text input widget with a given name and default value. 
9 | -- MAGIC # Notebook Widgets are only for Run all (when executed outside a job) 10 | -- MAGIC dbutils.widgets.removeAll() 11 | -- MAGIC dbutils.widgets.text(name = "database_name", defaultValue = "jaceklaskowski", label = "Database Name") 12 | -- MAGIC dbutils.widgets.text(name = "raw_table_name", defaultValue = "workflows_raw_data", label = "Raw Table Name") 13 | -- MAGIC dbutils.widgets.text(name = "silver_table_name", defaultValue = "workflows_transform", label = "Silver Table Name") 14 | -- MAGIC dbutils.widgets.text(name = "gold_table_name", defaultValue = "workflows_aggregates", label = "Gold Table Name") 15 | 16 | -- COMMAND ---------- 17 | 18 | USE ${database_name} 19 | 20 | -- COMMAND ---------- 21 | 22 | CREATE OR REPLACE VIEW ${silver_table_name} 23 | COMMENT 'Silver layer' 24 | AS 25 | SELECT id, upper(name) name 26 | FROM ${raw_table_name} 27 | 28 | -- COMMAND ---------- 29 | 30 | SHOW VIEWS 31 | -------------------------------------------------------------------------------- /PySpark/pyspark-jupyter-poetry/install-pyspark.md: -------------------------------------------------------------------------------- 1 | # Install PySpark 2 | 3 | 1. The official [Apache Spark docs](https://spark.apache.org/docs/latest/api/python/getting_started/install.html) 4 | 1. [Getting Started](https://spark.apache.org/docs/latest/api/python/getting_started/index.html) 5 | 6 | ## Spark Connect 7 | 8 | [Spark Connect](https://spark.apache.org/docs/latest/api/python/getting_started/quickstart_connect.html) 9 | 10 | Start Spark Connect server. 11 | 12 | ```bash 13 | ./sbin/start-connect-server.sh 14 | ``` 15 | 16 | Open http://localhost:4040/connect/ to review the Spark Connect server application UI. 17 | 18 | ```bash 19 | poetry run pyspark --remote sc://localhost:15002 20 | ``` 21 | 22 | In the other terminal, `tail -f` the logs to learn more about the connection. 23 | 24 | ```bash 25 | tail -f /Users/jacek/dev/oss/spark/logs/spark-jacek-org.apache.spark.sql.connect.service.SparkConnectServer-1-Jaceks-Mac-mini.local.out 26 | ``` 27 | 28 | Refresh the Spark Connect server application UI. 29 | 30 | ## JupyterLab 31 | 32 | [JupyterLab: A Next-Generation Notebook Interface](https://jupyter.org/) 33 | 34 | ```bash 35 | poetry run jupyter lab 36 | ``` 37 | -------------------------------------------------------------------------------- /workshops/Databricks Workshop Half-Day 5a.sql: -------------------------------------------------------------------------------- 1 | -- Databricks notebook source 2 | -- MAGIC %md # Databricks Workshop Half-Day 5a 3 | -- MAGIC 4 | -- MAGIC Duration: 2 hours 15 minutes (9:45-12:00) 5 | 6 | -- COMMAND ---------- 7 | 8 | -- MAGIC %md ## Schedule 9 | 10 | -- COMMAND ---------- 11 | 12 | -- MAGIC %md 13 | -- MAGIC 14 | -- MAGIC * The class starts at 9:45 15 | -- MAGIC * A class is split into 1-hour blocks with a 12-minute break each 16 | -- MAGIC * Breaks at the end of an "hour" 17 | -- MAGIC * 09:45 - 10:25 18 | -- MAGIC * 10:45 - 12:00 19 | 20 | -- COMMAND ---------- 21 | 22 | -- MAGIC %md ## Agenda 23 | 24 | -- COMMAND ---------- 25 | 26 | -- MAGIC %md 27 | -- MAGIC 28 | -- MAGIC 1. Databricks Workflows 29 | -- MAGIC 1. Modular Orchestration with Run Job Task 30 | -- MAGIC 1. Conditional Workflows 31 | -- MAGIC 1. Delta Lake (_unlikely and leaving as an option_) 32 | -- MAGIC 1. DESCRIBE HISTORY Command 33 | -- MAGIC 1. 
[REORG TABLE Command](https://books.japila.pl/delta-lake-internals/commands/reorg/) 34 | -------------------------------------------------------------------------------- /Delta Live Tables/Delta Live Tables Python.py: -------------------------------------------------------------------------------- 1 | # Databricks notebook source 2 | # MAGIC %md 3 | # MAGIC 4 | # MAGIC # Delta Live Tables Python API 5 | 6 | # COMMAND ---------- 7 | 8 | # MAGIC %md ## dlt Module 9 | # MAGIC 10 | # MAGIC Delta Live Tables Python functions are defined in the `dlt` module 11 | # MAGIC 12 | # MAGIC ```py 13 | # MAGIC import dlt 14 | # MAGIC ``` 15 | 16 | # COMMAND ---------- 17 | 18 | # MAGIC %md ## @dlt.table Decorator 19 | # MAGIC 20 | # MAGIC Used to define tables (incl. streaming tables) 21 | 22 | # COMMAND ---------- 23 | 24 | # MAGIC %md ## @dlt.view Decorator 25 | 26 | # COMMAND ---------- 27 | 28 | # MAGIC %md 29 | # MAGIC 30 | # MAGIC ## How Dataflow Graph is Rendered 31 | # MAGIC 32 | # MAGIC * The Python `table` and `view` methods must return either a Spark or Koalas `DataFrame` 33 | # MAGIC * DataFrame transformations are executed **after** the full dataflow graph has been resolved 34 | # MAGIC * Non-`table` or `view` functions are executed once at the graph initialization phase 35 | 36 | # COMMAND ---------- 37 | 38 | # MAGIC %md ## Learn More 39 | # MAGIC 40 | # MAGIC * [Delta Live Tables Python language reference](https://docs.databricks.com/en/delta-live-tables/python-ref.html) 41 | -------------------------------------------------------------------------------- /terraform/pipeline.tf: -------------------------------------------------------------------------------- 1 | terraform { 2 | # Each argument in the required_providers block enables one provider 3 | # The key determines the provider's local name (its unique identifier within this module) 4 | required_providers { 5 | databricks = { 6 | source = "databricks/databricks" 7 | version = ">= 1.19.0" 8 | } 9 | } 10 | 11 | required_version = ">= 1.5.0" 12 | } 13 | 14 | # https://developer.hashicorp.com/terraform/language/providers/configuration#default-provider-configurations 15 | # A provider block without an alias argument is the default configuration for that provider. 16 | # Resources that don't set the provider meta-argument 17 | # will use the default provider configuration that matches the first word of the resource type name. 18 | # E.g. 
databricks_repo, databricks_pipeline below 19 | provider "databricks" {} 20 | 21 | resource "databricks_repo" "learn_databricks" { 22 | url = "https://github.com/jaceklaskowski/learn-databricks" 23 | } 24 | 25 | resource "databricks_pipeline" "this" { 26 | name = "My Terraform-deployed DLT Pipeline" 27 | library { 28 | notebook { 29 | path = "${databricks_repo.learn_databricks.path}/Delta Live Tables/my_streaming_table" 30 | } 31 | } 32 | } 33 | -------------------------------------------------------------------------------- /Databricks Asset Bundles/delta_live_tables_demo/scratch/exploration.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "metadata": { 7 | "application/vnd.databricks.v1+cell": { 8 | "cellMetadata": { 9 | "byteLimit": 2048000, 10 | "rowLimit": 10000 11 | }, 12 | "inputWidgets": {}, 13 | "nuid": "6bca260b-13d1-448f-8082-30b60a85c9ae", 14 | "showTitle": false, 15 | "title": "" 16 | } 17 | }, 18 | "outputs": [], 19 | "source": [ 20 | "import sys\n", 21 | "sys.path.append('../src')\n", 22 | "from delta_live_tables_demo import main\n", 23 | "\n", 24 | "main.get_taxis().show(10)" 25 | ] 26 | } 27 | ], 28 | "metadata": { 29 | "application/vnd.databricks.v1+notebook": { 30 | "dashboards": [], 31 | "language": "python", 32 | "notebookMetadata": { 33 | "pythonIndentUnit": 2 34 | }, 35 | "notebookName": "ipynb-notebook", 36 | "widgets": {} 37 | }, 38 | "kernelspec": { 39 | "display_name": "Python 3", 40 | "language": "python", 41 | "name": "python3" 42 | }, 43 | "language_info": { 44 | "name": "python", 45 | "version": "3.11.4" 46 | } 47 | }, 48 | "nbformat": 4, 49 | "nbformat_minor": 0 50 | } 51 | -------------------------------------------------------------------------------- /Generative AI/Retrieval Augmented Generation.sql: -------------------------------------------------------------------------------- 1 | -- Databricks notebook source 2 | -- MAGIC %md # Retrieval Augmented Generation (RAG) 3 | 4 | -- COMMAND ---------- 5 | 6 | -- MAGIC %md 7 | -- MAGIC 8 | -- MAGIC [Retrieval Augmented Generation](https://www.databricks.com/glossary/retrieval-augmented-generation-rag): 9 | -- MAGIC 10 | -- MAGIC > **Retrieval augmented generation** or **RAG** is an architectural approach that can improve the efficacy of large language model (LLM) applications by leveraging custom data. 11 | -- MAGIC 12 | -- MAGIC > This is done by retrieving relevant data/documents relevant to a question or task and providing them as context for the LLM. RAG has shown success in support chatbots and Q&A systems that need to maintain up-to-date information or access domain-specific knowledge. 13 | 14 | -- COMMAND ---------- 15 | 16 | -- MAGIC %md ## Creating High Quality RAG Applications with Databricks 17 | -- MAGIC 18 | -- MAGIC [Creating High Quality RAG Applications with Databricks](https://www.databricks.com/blog/building-high-quality-rag-applications-databricks) 19 | 20 | -- COMMAND ---------- 21 | 22 | -- MAGIC %md 23 | -- MAGIC 24 | -- MAGIC > a powerful way to incorporate proprietary, real-time data into Large Language Model (LLM) applications. 
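The retrieve-then-generate flow described above can be sketched in a few lines of Python. The snippet below is a toy illustration only — the in-memory `documents` list and the `retrieve` and `build_prompt` helpers are made up for this example (they are not Databricks, Vector Search, or Foundation Model APIs); in a real RAG application the retrieval step would query a vector index and the resulting prompt would be sent to a model serving endpoint.

```python
# Toy retrieve-then-generate sketch: rank documents by keyword overlap,
# then inject the best matches as context into the prompt for an LLM.
documents = [
    "Delta Live Tables pipelines are configured with notebooks or workspace files.",
    "Databricks Asset Bundles describe jobs and pipelines as YAML resources.",
]

def retrieve(question: str, docs: list[str], k: int = 1) -> list[str]:
    # Hypothetical scorer: naive keyword overlap between question and document.
    def score(doc: str) -> int:
        return len(set(question.lower().split()) & set(doc.lower().split()))
    return sorted(docs, key=score, reverse=True)[:k]

def build_prompt(question: str, context: list[str]) -> str:
    # The retrieved documents become the context the LLM is asked to rely on.
    joined = "\n".join(f"- {c}" for c in context)
    return f"Answer using only this context:\n{joined}\n\nQuestion: {question}"

question = "How are Delta Live Tables pipelines configured?"
prompt = build_prompt(question, retrieve(question, documents))
print(prompt)  # In a real application, this prompt is sent to an LLM serving endpoint.
```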
25 | -------------------------------------------------------------------------------- /review_me/My Very First Notebook.py: -------------------------------------------------------------------------------- 1 | # Databricks notebook source 2 | dbutils.widgets.text(name='name', defaultValue='Jacek', label='Name') 3 | 4 | # COMMAND ---------- 5 | 6 | table_name = dbutils.widgets.get('name') 7 | 8 | # COMMAND ---------- 9 | 10 | # MAGIC %md 11 | # MAGIC 12 | # MAGIC # SQL 13 | 14 | # COMMAND ---------- 15 | 16 | # MAGIC %sql 17 | # MAGIC 18 | # MAGIC SHOW SCHEMAS 19 | 20 | # COMMAND ---------- 21 | 22 | # MAGIC %sql 23 | # MAGIC 24 | # MAGIC -- catalog'.'schema'.'table 25 | # MAGIC -- database.table 26 | # MAGIC SELECT '${name}' 27 | 28 | # COMMAND ---------- 29 | 30 | # MAGIC %sql 31 | # MAGIC 32 | # MAGIC SHOW TABLES LIKE '${name}' 33 | 34 | # COMMAND ---------- 35 | 36 | # MAGIC %sql 37 | # MAGIC 38 | # MAGIC SHOW TABLES LIKE '${name}' 39 | 40 | # COMMAND ---------- 41 | 42 | # MAGIC %md # Some Python 43 | 44 | # COMMAND ---------- 45 | 46 | print('Bonjour a tous !') 47 | 48 | # COMMAND ---------- 49 | 50 | # MAGIC %md 51 | # MAGIC 52 | # MAGIC # Some Scala 53 | 54 | # COMMAND ---------- 55 | 56 | # MAGIC %scala 57 | # MAGIC println("How are you today?") 58 | 59 | # COMMAND ---------- 60 | 61 | # MAGIC %md ## Questions 62 | 63 | # COMMAND ---------- 64 | 65 | # MAGIC %md 66 | # MAGIC 67 | # MAGIC 1. Where to find notebooks? 68 | -------------------------------------------------------------------------------- /Databricks Asset Bundles/delta_live_tables_demo/setup.py: -------------------------------------------------------------------------------- 1 | """ 2 | setup.py configuration script describing how to build and package this project. 3 | 4 | This file is primarily used by the setuptools library and typically should not 5 | be executed directly. See README.md for how to deploy, test, and run 6 | the delta_live_tables_demo project. 7 | """ 8 | from setuptools import setup, find_packages 9 | 10 | import sys 11 | sys.path.append('./src') 12 | 13 | import delta_live_tables_demo 14 | 15 | setup( 16 | name="delta_live_tables_demo", 17 | version=delta_live_tables_demo.__version__, 18 | url="https://databricks.com", 19 | author="jacek@japila.pl", 20 | description="wheel file based on delta_live_tables_demo/src", 21 | packages=find_packages(where='./src'), 22 | package_dir={'': 'src'}, 23 | entry_points={ 24 | "packages": [ 25 | "main=delta_live_tables_demo.main:main" 26 | ] 27 | }, 28 | install_requires=[ 29 | # Dependencies in case the output wheel file is used as a library dependency. 
30 | # For defining dependencies, when this package is used in Databricks, see: 31 | # https://docs.databricks.com/dev-tools/bundles/library-dependencies.html 32 | "setuptools" 33 | ], 34 | ) 35 | -------------------------------------------------------------------------------- /Delta Live Tables/Deep Dive into DLTs.sql: -------------------------------------------------------------------------------- 1 | -- Databricks notebook source 2 | -- MAGIC %md # Deep Dive into DLTs 3 | 4 | -- COMMAND ---------- 5 | 6 | -- MAGIC %py 7 | -- MAGIC 8 | -- MAGIC dbutils.widgets.text( 9 | -- MAGIC name='storage_location', 10 | -- MAGIC defaultValue='Please specify Storage location', 11 | -- MAGIC label='Storage location') 12 | 13 | -- COMMAND ---------- 14 | 15 | -- MAGIC %py 16 | -- MAGIC 17 | -- MAGIC storage_location = dbutils.widgets.get('storage_location') 18 | 19 | -- COMMAND ---------- 20 | 21 | -- MAGIC %python 22 | -- MAGIC 23 | -- MAGIC # %fs ls dbfs:/pipelines/75fb9324-5321-4be6-b9ca-a3a8f9b47a9b 24 | -- MAGIC display(dbutils.fs.ls(storage_location)) 25 | 26 | -- COMMAND ---------- 27 | 28 | -- MAGIC %python 29 | -- MAGIC 30 | -- MAGIC # %fs ls dbfs:/pipelines/75fb9324-5321-4be6-b9ca-a3a8f9b47a9b/tables/ 31 | -- MAGIC display(dbutils.fs.ls(f'{storage_location}/tables')) 32 | 33 | -- COMMAND ---------- 34 | 35 | SELECT '${storage_location}' 36 | 37 | -- COMMAND ---------- 38 | 39 | select * from delta.`${storage_location}/system/events` 40 | 41 | -- COMMAND ---------- 42 | 43 | DESCRIBE HISTORY delta.`${storage_location}/system/events` 44 | 45 | -- COMMAND ---------- 46 | 47 | SELECT count(*) FROM delta.`${storage_location}/system/events`@v524 48 | -------------------------------------------------------------------------------- /Delta Lake/Merge.sql: -------------------------------------------------------------------------------- 1 | -- Databricks notebook source 2 | -- MAGIC %md # Merge 3 | 4 | -- COMMAND ---------- 5 | 6 | -- MAGIC %md ## Examples 7 | 8 | -- COMMAND ---------- 9 | 10 | -- MAGIC %md ### Conditional Update with Delete 11 | 12 | -- COMMAND ---------- 13 | 14 | DROP TABLE IF EXISTS source; 15 | DROP TABLE IF EXISTS target; 16 | 17 | -- COMMAND ---------- 18 | 19 | CREATE TABLE source 20 | USING delta 21 | AS VALUES 22 | (0, 0), 23 | (1, 10), 24 | (2, 20) AS data(key, value); 25 | 26 | -- COMMAND ---------- 27 | 28 | select * from source; 29 | 30 | -- COMMAND ---------- 31 | 32 | CREATE TABLE target 33 | USING delta 34 | AS VALUES 35 | (1, 1), 36 | (2, 2), 37 | (3, 3) AS data(key, value); 38 | 39 | -- COMMAND ---------- 40 | 41 | select * from target; 42 | 43 | -- COMMAND ---------- 44 | 45 | MERGE INTO target t 46 | USING source s 47 | ON s.key = t.key 48 | WHEN MATCHED AND s.key <> 1 THEN UPDATE SET key = s.key, value = s.value 49 | WHEN MATCHED THEN DELETE 50 | 51 | -- COMMAND ---------- 52 | 53 | select * from target; 54 | 55 | -- COMMAND ---------- 56 | 57 | -- MAGIC %md 58 | -- MAGIC 59 | -- MAGIC ## Learning Resources 60 | 61 | -- COMMAND ---------- 62 | 63 | -- MAGIC %md 64 | -- MAGIC 65 | -- MAGIC * [The Internals of Delta Lake](https://books.japila.pl/delta-lake-internals/commands/merge/) 66 | -------------------------------------------------------------------------------- /review_me/Python example.py: -------------------------------------------------------------------------------- 1 | # Databricks notebook source 2 | # MAGIC %md 3 | # MAGIC 4 | # MAGIC # **Matplotlib** 5 | # MAGIC You can display Matplotlib objects in Python notebooks. 
6 | 7 | # COMMAND ---------- 8 | 9 | # in DBR 6.4 and below, uncomment the line below 10 | # %matplotlib inline 11 | 12 | # COMMAND ---------- 13 | 14 | import numpy as np 15 | import matplotlib.pyplot as plt 16 | 17 | x = np.linspace(0, 2*np.pi, 50) 18 | y = np.sin(x) 19 | y2 = y + 0.1 * np.random.normal(size=x.shape) 20 | 21 | fig, ax = plt.subplots() 22 | ax.plot(x, y, 'k--') 23 | ax.plot(x, y2, 'ro') 24 | 25 | # set ticks and tick labels 26 | ax.set_xlim((0, 2*np.pi)) 27 | ax.set_xticks([0, np.pi, 2*np.pi]) 28 | ax.set_xticklabels(['0', '$\pi$','2$\pi$']) 29 | ax.set_ylim((-1.5, 1.5)) 30 | ax.set_yticks([-1, 0, 1]) 31 | 32 | # Only draw spine between the y-ticks 33 | ax.spines['left'].set_bounds(-1, 1) 34 | # Hide the right and top spines 35 | ax.spines['right'].set_visible(False) 36 | ax.spines['top'].set_visible(False) 37 | # Only show ticks on the left and bottom spines 38 | ax.yaxis.set_ticks_position('left') 39 | ax.xaxis.set_ticks_position('bottom') 40 | 41 | # COMMAND ---------- 42 | 43 | # MAGIC %md In Databricks Runtime 6.2 and below, run the `display` command to view the plot. 44 | 45 | # COMMAND ---------- 46 | 47 | display(fig) 48 | -------------------------------------------------------------------------------- /Databricks Workflows/Step 3. Build Aggregates.sql: -------------------------------------------------------------------------------- 1 | -- Databricks notebook source 2 | -- MAGIC %md # Build Aggregates 3 | -- MAGIC 4 | -- MAGIC ...for presentation layer 5 | 6 | -- COMMAND ---------- 7 | 8 | -- MAGIC %python 9 | -- MAGIC 10 | -- MAGIC # Creates a text input widget with a given name and default value. 11 | -- MAGIC # Notebook Widgets are only for Run all (when executed outside a job) 12 | -- MAGIC dbutils.widgets.removeAll() 13 | -- MAGIC dbutils.widgets.text(name = "database_name", defaultValue = "jaceklaskowski", label = "Database Name") 14 | -- MAGIC dbutils.widgets.text(name = "raw_table_name", defaultValue = "workflows_raw_data", label = "Raw Table Name") 15 | -- MAGIC dbutils.widgets.text(name = "silver_table_name", defaultValue = "workflows_transform", label = "Silver Table Name") 16 | -- MAGIC dbutils.widgets.text(name = "gold_table_name", defaultValue = "workflows_aggregates", label = "Gold Table Name") 17 | 18 | -- COMMAND ---------- 19 | 20 | USE ${database_name} 21 | 22 | -- COMMAND ---------- 23 | 24 | CREATE OR REPLACE VIEW ${gold_table_name} 25 | COMMENT 'Golden layer' 26 | AS 27 | SELECT length(name) % 2 gid, count(name) count, collect_set(name) names 28 | FROM ${silver_table_name} 29 | GROUP BY 1 30 | 31 | -- COMMAND ---------- 32 | 33 | SHOW VIEWS 34 | 35 | -- COMMAND ---------- 36 | 37 | SELECT * FROM ${gold_table_name} 38 | -------------------------------------------------------------------------------- /Databricks Asset Bundles/Job and Task Parameters.py: -------------------------------------------------------------------------------- 1 | # Databricks notebook source 2 | # MAGIC %md 3 | # MAGIC 4 | # MAGIC # Job and Task Parameters 5 | 6 | # COMMAND ---------- 7 | 8 | # DBTITLE 0,job.yml 9 | # MAGIC %md 10 | # MAGIC 11 | # MAGIC ``` 12 | # MAGIC resources: 13 | # MAGIC jobs: 14 | # MAGIC demo_job: 15 | # MAGIC name: demo_job 16 | # MAGIC description: My custom description that should describe the purpose of this job 17 | # MAGIC # https://docs.databricks.com/api/workspace/jobs/create#parameters 18 | # MAGIC # Job-level parameters 19 | # MAGIC parameters: 20 | # MAGIC - name: jacek_custom_variable 21 | # MAGIC default: FIXME_parameters 22 | # 
MAGIC tasks: 23 | # MAGIC - task_key: notebook_task 24 | # MAGIC existing_cluster_id: ${var.my-human-readable-name} 25 | # MAGIC notebook_task: 26 | # MAGIC notebook_path: ../src/notebook.ipynb 27 | # MAGIC # https://docs.databricks.com/api/workspace/jobs/create#tasks-notebook_task-base_parameters 28 | # MAGIC # Base parameters used for each run of this job 29 | # MAGIC # Parameters at job level take precedence 30 | # MAGIC # Use dbutils.widgets.get to access the value 31 | # MAGIC base_parameters: 32 | # MAGIC jacek_custom_variable: FIXME_base_parameters 33 | # MAGIC ``` 34 | -------------------------------------------------------------------------------- /Delta Live Tables/DLT Lab.py: -------------------------------------------------------------------------------- 1 | # Databricks notebook source 2 | # MAGIC %md # DLT Lab 3 | 4 | # COMMAND ---------- 5 | 6 | # MAGIC %md 7 | # MAGIC 8 | # MAGIC 1. Create a 3-table DLT pipeline 9 | # MAGIC 1. 3 tables for each layer (bronze, silver, gold) 10 | # MAGIC 1. A DLT pipeline based on 1 (at least) or (better / highly recommended) many notebooks 11 | # MAGIC 1. `CREATE TABLE` regular table (non-live) that you can use to `INSERT` records into so your pipeline can digest it and do all the transformations 12 | # MAGIC 1. Think of JSON-encoded medical records 13 | # MAGIC 1. A raw table = JSON intact 14 | # MAGIC 1. A silver table = JSON flatten out (`explode` standard function + `:` JSON access pattern) 15 | # MAGIC 1. A(nother) silver table = some unification (e.g. LonDON, london, LONDON) 16 | # MAGIC 1. A Gold table = some aggs (`count`s = how many people live in different cities or hobbies) 17 | 18 | # COMMAND ---------- 19 | 20 | # MAGIC %md 21 | # MAGIC 22 | # MAGIC ## Hint: Create pipeline with blank notebook 23 | # MAGIC 24 | # MAGIC **Source code** are the paths to notebooks or files that contain pipeline source code. 25 | # MAGIC 26 | # MAGIC Paths can be modified after the pipeline is created. 27 | # MAGIC 28 | # MAGIC With no source code specified, Databricks will create an empty notebook for the pipeline. 29 | # MAGIC 30 | # MAGIC You can edit this notebook later. 31 | -------------------------------------------------------------------------------- /demo/bestsellers/pipeline_clone.py: -------------------------------------------------------------------------------- 1 | # Databricks notebook source 2 | # MAGIC %md # Exercise: Finding 1st and 2nd Bestsellers Per Genre 3 | 4 | # COMMAND ---------- 5 | 6 | # MAGIC %md 7 | # MAGIC 8 | # MAGIC This is a DLT pipeline for [Exercise: Finding 1st and 2nd Bestsellers Per Genre](https://jaceklaskowski.github.io/spark-workshop/exercises/sql/Finding-1st-and-2nd-Bestsellers-Per-Genre.html). 9 | 10 | # COMMAND ---------- 11 | 12 | # MAGIC %md 13 | # MAGIC 14 | # MAGIC 1. 
Create a live table (with raw data) = `books` table 15 | 16 | # COMMAND ---------- 17 | 18 | import dlt 19 | from pyspark.sql import DataFrame 20 | 21 | @dlt.table 22 | def books() -> DataFrame: 23 | return spark.range(4) 24 | 25 | # COMMAND ---------- 26 | 27 | # MAGIC %md 28 | # MAGIC 29 | # MAGIC -- CREATE TABLE jacek_laskowski.books 30 | # MAGIC -- OPTIONS (header=true) 31 | # MAGIC -- AS SELECT * FROM csv.`/FileStore/books.csv` 32 | 33 | # COMMAND ---------- 34 | 35 | # MAGIC %md 36 | # MAGIC 37 | # MAGIC -- SELECT * FROM jacek_laskowski.books 38 | 39 | # COMMAND ---------- 40 | 41 | # MAGIC %sql 42 | # MAGIC -- CREATE LIVE TABLE books 43 | # MAGIC -- OPTIONS (header=true) 44 | # MAGIC -- AS SELECT * FROM csv.`/FileStore/books.csv` 45 | 46 | # COMMAND ---------- 47 | 48 | # MAGIC %md 49 | # MAGIC 50 | # MAGIC %scala 51 | # MAGIC 52 | # MAGIC val books = spark 53 | # MAGIC .read 54 | # MAGIC .option("header", true) 55 | # MAGIC .option("inferSchema", true) 56 | # MAGIC .csv("/FileStore/books.csv") 57 | # MAGIC books.schema 58 | -------------------------------------------------------------------------------- /Databricks Asset Bundles/delta_live_tables_demo/my_project/README.md: -------------------------------------------------------------------------------- 1 | # my_project 2 | 3 | The 'my_project' project was generated by using the default-python template. 4 | 5 | ## Getting started 6 | 7 | 1. Install the Databricks CLI from https://docs.databricks.com/dev-tools/cli/databricks-cli.html 8 | 9 | 2. Authenticate to your Databricks workspace: 10 | ``` 11 | $ databricks configure 12 | ``` 13 | 14 | 3. To deploy a development copy of this project, type: 15 | ``` 16 | $ databricks bundle deploy --target dev 17 | ``` 18 | (Note that "dev" is the default target, so the `--target` parameter 19 | is optional here.) 20 | 21 | This deploys everything that's defined for this project. 22 | For example, the default template would deploy a job called 23 | `[dev yourname] my_project_job` to your workspace. 24 | You can find that job by opening your workspace and clicking on **Workflows**. 25 | 26 | 4. Similarly, to deploy a production copy, type: 27 | ``` 28 | $ databricks bundle deploy --target prod 29 | ``` 30 | 31 | 5. To run a job or pipeline, use the "run" command: 32 | ``` 33 | $ databricks bundle run 34 | ``` 35 | 36 | 6. Optionally, install developer tools such as the Databricks extension for Visual Studio Code from 37 | https://docs.databricks.com/dev-tools/vscode-ext.html. 38 | 39 | 7. For documentation on the Databricks asset bundles format used 40 | for this project, and for CI/CD configuration, see 41 | https://docs.databricks.com/dev-tools/bundles/index.html. -------------------------------------------------------------------------------- /Generative AI/llm-rag-chatbot/_resources/NOTICE.py: -------------------------------------------------------------------------------- 1 | # Databricks notebook source 2 | # MAGIC %md 3 | # MAGIC ## Licence 4 | # MAGIC See LICENSE file. 5 | # MAGIC 6 | # MAGIC ## Data collection 7 | # MAGIC To improve user experience and dbdemos asset quality, dbdemos sends usage reports and captures views in the installed notebook (usually in the first cell) and other assets like dashboards. This information is captured for product improvement only and not for marketing purposes, and doesn't contain PII information. By using `dbdemos` and the assets it provides, you consent to this data collection. 
If you wish to disable it, you can set `Tracker.enable_tracker` to False in the `tracker.py` file. 8 | # MAGIC 9 | # MAGIC ## Resource creation 10 | # MAGIC To simplify your experience, `dbdemos` will create and start for you resources. As example, a demo could start (not exhaustive): 11 | # MAGIC - A cluster to run your demo 12 | # MAGIC - A Delta Live Table Pipeline to ingest data 13 | # MAGIC - A DBSQL endpoint to run DBSQL dashboard 14 | # MAGIC - An ML model 15 | # MAGIC 16 | # MAGIC While `dbdemos` does its best to limit the consumption and enforce resource auto-termination, you remain responsible for the resources created and the potential consumption associated. 17 | # MAGIC 18 | # MAGIC ## Support 19 | # MAGIC Databricks does not offer official support for `dbdemos` and the associated assets. 20 | # MAGIC For any issue with `dbdemos` or the demos installed, please open an issue and the demo team will have a look on a best effort basis. 21 | # MAGIC 22 | # MAGIC 23 | -------------------------------------------------------------------------------- /Databricks Workflows/for_each_task_demo/src/Load_googlesheets_csv.py: -------------------------------------------------------------------------------- 1 | # Databricks notebook source 2 | # MAGIC %md 3 | # MAGIC 4 | # MAGIC # Load googlesheets csv Notebook 5 | 6 | # COMMAND ---------- 7 | 8 | # MAGIC %md 9 | # MAGIC 10 | # MAGIC This notebook is part of the For each task Demo. 11 | 12 | # COMMAND ---------- 13 | 14 | # MAGIC %md ## Step 1. Load CSV file 15 | # MAGIC 16 | # MAGIC It could load a CSV file with `google_spreadsheet`s (to load in parallel in the For each task). 17 | 18 | # COMMAND ---------- 19 | 20 | google_spreadsheet_df = spark.createDataFrame( 21 | [ 22 | ("Non-logistics - Energy", "[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16]", "[1]"), 23 | ("Use Phase Factors", "[0,1,2,3,4,5,6]", "[1]"), 24 | ], 25 | "google_spreadsheet string, use_cols string, skip_rows string", 26 | ) 27 | 28 | # COMMAND ---------- 29 | 30 | display(google_spreadsheet_df) 31 | 32 | # COMMAND ---------- 33 | 34 | # MAGIC %md 35 | # MAGIC 36 | # MAGIC ## Step 2. Collect Data 37 | 38 | # COMMAND ---------- 39 | 40 | gsheets = google_spreadsheet_df.toJSON().collect() 41 | 42 | # COMMAND ---------- 43 | 44 | print(gsheets) 45 | 46 | # COMMAND ---------- 47 | 48 | print(type(gsheets)) 49 | 50 | # COMMAND ---------- 51 | 52 | print(type(gsheets[0])) 53 | 54 | # COMMAND ---------- 55 | 56 | # MAGIC %md 57 | # MAGIC 58 | # MAGIC ## Step 3. Define Task Value 59 | 60 | # COMMAND ---------- 61 | 62 | help(dbutils.jobs.taskValues.set) 63 | 64 | # COMMAND ---------- 65 | 66 | dbutils.jobs.taskValues.set(key='gsheets', value=gsheets) 67 | -------------------------------------------------------------------------------- /demo/uv_workflow/resources/uv_workflow.job.yml: -------------------------------------------------------------------------------- 1 | # The main job for uv_workflow. 
2 | resources: 3 | jobs: 4 | uv_workflow_job: 5 | name: uv_workflow_job 6 | 7 | trigger: 8 | # Run this job every day, exactly one day from the last run; see https://docs.databricks.com/api/workspace/jobs/create#trigger 9 | periodic: 10 | interval: 1 11 | unit: DAYS 12 | 13 | email_notifications: 14 | on_failure: 15 | - jacek@japila.pl 16 | 17 | tasks: 18 | - task_key: notebook_task 19 | job_cluster_key: job_cluster 20 | notebook_task: 21 | notebook_path: ../src/notebook.ipynb 22 | 23 | - task_key: main_task 24 | depends_on: 25 | - task_key: notebook_task 26 | 27 | job_cluster_key: job_cluster 28 | python_wheel_task: 29 | package_name: uv_workflow 30 | entry_point: main 31 | libraries: 32 | # By default we just include the .whl file generated for the uv_workflow package. 33 | # See https://docs.databricks.com/dev-tools/bundles/library-dependencies.html 34 | # for more information on how to add other libraries. 35 | - whl: ../dist/*.whl 36 | 37 | job_clusters: 38 | - job_cluster_key: job_cluster 39 | new_cluster: 40 | spark_version: 15.4.x-scala2.12 41 | node_type_id: i3.xlarge 42 | autoscale: 43 | min_workers: 1 44 | max_workers: 4 45 | -------------------------------------------------------------------------------- /Databricks SQL/Queries.py: -------------------------------------------------------------------------------- 1 | # Databricks notebook source 2 | # MAGIC %md # Databricks SQL » Queries 3 | 4 | # COMMAND ---------- 5 | 6 | # MAGIC %md 7 | # MAGIC 8 | # MAGIC 1. Queries are associated with a catalog and a schema 9 | # MAGIC 1. Queries can be used as tasks in Workflow jobs (see [Workflow Jobs]($../Workflow Jobs/Databricks Jobs)) 10 | # MAGIC 1. The results of executing queries can be added to dashboards (see [Dashboards]($./Dashboards)) 11 | 12 | # COMMAND ---------- 13 | 14 | # MAGIC %md 15 | # MAGIC 16 | # MAGIC ```sql 17 | # MAGIC SELECT 18 | # MAGIC * 19 | # MAGIC FROM 20 | # MAGIC book_ranks 21 | # MAGIC WHERE 22 | # MAGIC book_rank in (1, 2) 23 | # MAGIC ``` 24 | 25 | # COMMAND ---------- 26 | 27 | # MAGIC %md ## Parametrized Queries 28 | # MAGIC 29 | # MAGIC [Query parameters](https://docs.databricks.com/en/sql/user/queries/query-parameters.html) 30 | 31 | # COMMAND ---------- 32 | 33 | # MAGIC %md 34 | # MAGIC 35 | # MAGIC 1. Queries can be parameterized with curly brackets (`{{ table_pattern }}`) 36 | # MAGIC 1. Substitute values into a query at runtime 37 | # MAGIC 1. A widget appears above the results pane 38 | # MAGIC 1. Query parameters are more flexible than query filters, and should only be used in cases where query filters are not sufficient 39 | # MAGIC 1. `Cmd + I` to define a query parameter at the text caret 40 | # MAGIC 1. Click **Apply Changes** to run a query with a parameter value 41 | 42 | # COMMAND ---------- 43 | 44 | # MAGIC %md 45 | # MAGIC 46 | # MAGIC ```sql 47 | # MAGIC show tables like {{ table_pattern }} 48 | # MAGIC ``` 49 | -------------------------------------------------------------------------------- /Delta Lake/DESCRIBE HISTORY.sql: -------------------------------------------------------------------------------- 1 | -- Databricks notebook source 2 | -- MAGIC %md # DESCRIBE HISTORY 3 | -- MAGIC 4 | -- MAGIC `DESCRIBE HISTORY` command can be used in subqueries in Delta Lake (on [Databricks only](https://twitter.com/jaceklaskowski/status/1733466666749526278)). 
5 | 6 | -- COMMAND ---------- 7 | 8 | -- https://docs.databricks.com/en/sql/language-manual/sql-ref-syntax-ddl-create-table-using.html 9 | CREATE TABLE my_students (id INT, name STRING, age INT); 10 | 11 | -- COMMAND ---------- 12 | 13 | -- https://docs.databricks.com/en/sql/language-manual/sql-ref-syntax-dml-insert-into.html 14 | INSERT INTO my_students 15 | VALUES 16 | (0, 'Jacek', 50); 17 | 18 | -- COMMAND ---------- 19 | 20 | SELECT * 21 | FROM ( 22 | DESCRIBE HISTORY my_students 23 | ) 24 | WHERE version = 1; 25 | 26 | -- COMMAND ---------- 27 | 28 | -- MAGIC %scala 29 | -- MAGIC 30 | -- MAGIC val q = sql("""SELECT * 31 | -- MAGIC FROM ( 32 | -- MAGIC DESCRIBE HISTORY my_students 33 | -- MAGIC ) 34 | -- MAGIC WHERE version = 1;""") 35 | -- MAGIC q.explain(extended = true) 36 | 37 | -- COMMAND ---------- 38 | 39 | -- MAGIC %md ## DESCRIBE HISTORY Command 40 | -- MAGIC 41 | -- MAGIC A little about the internals of [DESCRIBE HISTORY Command](https://books.japila.pl/delta-lake-internals/commands/describe-history/) 42 | -- MAGIC 43 | -- MAGIC * a mere wrapper around `DeltaHistoryManager` to access the history of a delta table 44 | -- MAGIC * Possible Cost Optimization on Microsoft Azure using `spark.databricks.delta.history.maxKeysPerList` configuration property 45 | 46 | -- COMMAND ---------- 47 | 48 | SET spark.databricks.delta.history.maxKeysPerList 49 | -------------------------------------------------------------------------------- /meetups/Meetup_2025_01_09.sql: -------------------------------------------------------------------------------- 1 | -- Databricks notebook source 2 | -- MAGIC %md # Meetup 2025-01-09 3 | -- MAGIC 4 | -- MAGIC ➡️ [Deploying Databricks Workflows with uv and Databricks Asset Bundles](https://www.meetup.com/warsaw-data-engineering/events/305473028/) 5 | -- MAGIC 6 | -- MAGIC Agenda: 7 | -- MAGIC 8 | -- MAGIC 1. 5 minut rogrzewki na luźne pomysły na ten i przyszłe meetupy 9 | -- MAGIC * News (new versions, etc.) 10 | -- MAGIC 1. 50 minut Live coding session, a w nim: 11 | -- MAGIC * Stworzenie projektu w Pythonie z uv 12 | -- MAGIC * Stworzenie Databricks job z notebookiem z naszym projektem w Pythonie wyżej (wszystko ręcznie / klikamy w UI / pełny manual) 13 | -- MAGIC * Automatyzacja z Databricks Asset Bundles (DAB) 14 | -- MAGIC 1. Q&A / Zbieranie pomysłów na kolejne edycje (5 minut) 15 | 16 | -- COMMAND ---------- 17 | 18 | -- MAGIC %md # News 19 | 20 | -- COMMAND ---------- 21 | 22 | -- MAGIC %md 23 | -- MAGIC 24 | -- MAGIC ## New Versions 25 | -- MAGIC 26 | -- MAGIC * [uv 0.5.16](https://github.com/astral-sh/uv/releases/tag/0.5.16) 27 | -- MAGIC * [Databricks CLI 0.238.0](https://github.com/databricks/cli/releases/tag/v0.238.0) 28 | -- MAGIC * [Delta Lake 3.3.0](https://github.com/delta-io/delta/releases/tag/v3.3.0) 29 | -- MAGIC * [awscli 2.22.31](https://github.com/aws/aws-cli/releases/tag/2.22.31) 30 | 31 | -- COMMAND ---------- 32 | 33 | -- MAGIC %md 34 | -- MAGIC 35 | -- MAGIC ## Open focus mode 36 | -- MAGIC 37 | -- MAGIC [Databricks notebook interface and controls](https://docs.databricks.com/en/notebooks/notebook-ui.html) 38 | 39 | -- COMMAND ---------- 40 | 41 | -- MAGIC %md 42 | -- MAGIC 43 | -- MAGIC # Live Coding Session 44 | -------------------------------------------------------------------------------- /meetups/Meetup_2025_02_06.sql: -------------------------------------------------------------------------------- 1 | -- Databricks notebook source 2 | -- MAGIC %md # Data Quality in Databricks Workflows with Pydantic cntd. 
3 | -- MAGIC 4 | -- MAGIC ➡️ [Meetup Announcement](https://www.meetup.com/warsaw-data-engineering/events/305995327/) 5 | -- MAGIC 6 | -- MAGIC Zakładamy, że mamy 2 projekty. Pierwszy projekt z pydantic (libka w Pythonie), a drugi to "hello world" Databricks Asset Bundle project z przykładowym job'em. Nic specjalnie wyrafinowanego. Od tego zaczniemy. 7 | -- MAGIC 8 | -- MAGIC Agenda: 9 | -- MAGIC 10 | -- MAGIC 1. 5 minut rozgrzewki na luźne pomysły na ten i przyszłe meetupy 11 | -- MAGIC * News (new versions, new features, etc.) 12 | -- MAGIC 1. 50 minut Live coding session, a w nim: 13 | -- MAGIC * Za pomocą Databricks Asset Bundles (DAB), uruchomisz Databricks job z notebookiem z libką w Pythonie z Pydantic (takie tam "hello world"). Wszystko z pomocą uv do zarządzania projektem. 14 | -- MAGIC * Stworzymy UDFa do walidacji rekordów, którego "uzbroimy" w pydantic'a. To główny cel meetupu, którego osiągnięcie będzie naszym "najosobityczniejszym" sukcesem 🥂 15 | -- MAGIC * Może coś jeszcze, ale nie zdradzę teraz 🤷‍♂️ 16 | -- MAGIC 1. 5 minut Q&A / Zbieranie pomysłów na kolejne edycje 17 | 18 | -- COMMAND ---------- 19 | 20 | -- MAGIC %md # 📢 News 21 | 22 | -- COMMAND ---------- 23 | 24 | -- MAGIC %md 25 | -- MAGIC 26 | -- MAGIC ## New Versions 27 | -- MAGIC 28 | -- MAGIC * [uv 0.5.29](https://github.com/astral-sh/uv/releases/tag/0.5.29) 29 | -- MAGIC * [awscli 2.24.0](https://github.com/aws/aws-cli/releases/tag/2.24.0) 30 | 31 | -- COMMAND ---------- 32 | 33 | -- MAGIC %md 34 | -- MAGIC 35 | -- MAGIC # Live Coding Session 36 | -------------------------------------------------------------------------------- /Databricks Asset Bundles/delta_live_tables_demo/src/notebook.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": { 6 | "application/vnd.databricks.v1+cell": { 7 | "cellMetadata": {}, 8 | "inputWidgets": {}, 9 | "nuid": "ee353e42-ff58-4955-9608-12865bd0950e", 10 | "showTitle": false, 11 | "title": "" 12 | } 13 | }, 14 | "source": [ 15 | "# Default notebook\n", 16 | "\n", 17 | "This default notebook is executed using Databricks Workflows as defined in resources/delta_live_tables_demo_job.yml." 
18 | ] 19 | }, 20 | { 21 | "cell_type": "code", 22 | "execution_count": 0, 23 | "metadata": { 24 | "application/vnd.databricks.v1+cell": { 25 | "cellMetadata": { 26 | "byteLimit": 2048000, 27 | "rowLimit": 10000 28 | }, 29 | "inputWidgets": {}, 30 | "nuid": "6bca260b-13d1-448f-8082-30b60a85c9ae", 31 | "showTitle": false, 32 | "title": "" 33 | } 34 | }, 35 | "outputs": [], 36 | "source": [ 37 | "from delta_live_tables_demo import main\n", 38 | "\n", 39 | "main.get_taxis().show(10)" 40 | ] 41 | } 42 | ], 43 | "metadata": { 44 | "application/vnd.databricks.v1+notebook": { 45 | "dashboards": [], 46 | "language": "python", 47 | "notebookMetadata": { 48 | "pythonIndentUnit": 2 49 | }, 50 | "notebookName": "notebook", 51 | "widgets": {} 52 | }, 53 | "kernelspec": { 54 | "display_name": "Python 3", 55 | "language": "python", 56 | "name": "python3" 57 | }, 58 | "language_info": { 59 | "name": "python", 60 | "version": "3.11.4" 61 | } 62 | }, 63 | "nbformat": 4, 64 | "nbformat_minor": 0 65 | } 66 | -------------------------------------------------------------------------------- /Databricks Asset Bundles/delta_live_tables_demo/README.md: -------------------------------------------------------------------------------- 1 | # delta_live_tables_demo 2 | 3 | The 'delta_live_tables_demo' project was generated by using the default-python template. 4 | 5 | ## Getting started 6 | 7 | 1. Install the Databricks CLI from https://docs.databricks.com/dev-tools/cli/databricks-cli.html 8 | 9 | 2. Authenticate to your Databricks workspace: 10 | ``` 11 | $ databricks configure 12 | ``` 13 | 14 | 3. To deploy a development copy of this project, type: 15 | ``` 16 | $ databricks bundle deploy --target dev 17 | ``` 18 | (Note that "dev" is the default target, so the `--target` parameter 19 | is optional here.) 20 | 21 | This deploys everything that's defined for this project. 22 | For example, the default template would deploy a job called 23 | `[dev yourname] delta_live_tables_demo_job` to your workspace. 24 | You can find that job by opening your workspace and clicking on **Workflows**. 25 | 26 | 4. Similarly, to deploy a production copy, type: 27 | ``` 28 | $ databricks bundle deploy --target prod 29 | ``` 30 | 31 | 5. To run a job or pipeline, use the "run" command: 32 | ``` 33 | $ databricks bundle run 34 | ``` 35 | 36 | 6. Optionally, install developer tools such as the Databricks extension for Visual Studio Code from 37 | https://docs.databricks.com/dev-tools/vscode-ext.html. Or read the "getting started" documentation for 38 | **Databricks Connect** for instructions on running the included Python code from a different IDE. 39 | 40 | 7. For documentation on the Databricks asset bundles format used 41 | for this project, and for CI/CD configuration, see 42 | https://docs.databricks.com/dev-tools/bundles/index.html. 
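A typical development loop with this bundle can be chained together as follows; the `delta_live_tables_demo_job` resource key comes from `resources/delta_live_tables_demo_job.yml`, and the exact flags may differ across Databricks CLI versions:

```console
$ databricks bundle validate
$ databricks bundle deploy --target dev
$ databricks bundle run delta_live_tables_demo_job --target dev
```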
43 | -------------------------------------------------------------------------------- /review_me/CREATE TABLE IF NOT EXISTS jacek_laskowski.my_table.py: -------------------------------------------------------------------------------- 1 | # Databricks notebook source 2 | # MAGIC %md # Day 1 Exercise One 3 | # MAGIC 4 | # MAGIC Create a table 5 | 6 | # COMMAND ---------- 7 | 8 | # MAGIC %md ## Define Variable 9 | 10 | # COMMAND ---------- 11 | 12 | # MAGIC %md ## Execute Code (Python) 13 | 14 | # COMMAND ---------- 15 | 16 | dbutils.widgets.text(name='table_name', defaultValue='jacek_laskowski.my_table', label='Table Name') 17 | table_name_param = dbutils.widgets.get('table_name') 18 | 19 | # COMMAND ---------- 20 | 21 | print(f'Table name {table_name_param}') 22 | 23 | # COMMAND ---------- 24 | 25 | # MAGIC %md ## Execute Code (Scala) 26 | 27 | # COMMAND ---------- 28 | 29 | # MAGIC %scala 30 | # MAGIC 31 | # MAGIC val table_name_param = dbutils.widgets.get("table_name") 32 | 33 | # COMMAND ---------- 34 | 35 | # MAGIC %scala 36 | # MAGIC 37 | # MAGIC println(s"Table name: $table_name_param") 38 | 39 | # COMMAND ---------- 40 | 41 | # MAGIC %sql 42 | # MAGIC 43 | # MAGIC SHOW TABLES 44 | 45 | # COMMAND ---------- 46 | 47 | # MAGIC %md ## Exercise / Question 48 | # MAGIC 49 | # MAGIC Change (find the way how to do it) the default schema to be `main`. 50 | 51 | # COMMAND ---------- 52 | 53 | # MAGIC %sql 54 | # MAGIC 55 | # MAGIC CREATE SCHEMA IF NOT EXISTS jacek_laskowski 56 | 57 | # COMMAND ---------- 58 | 59 | # MAGIC %sql 60 | # MAGIC 61 | # MAGIC CREATE TABLE IF NOT EXISTS ${table_name} ( 62 | # MAGIC id LONG, 63 | # MAGIC name STRING 64 | # MAGIC ) 65 | # MAGIC USING delta 66 | 67 | # COMMAND ---------- 68 | 69 | # MAGIC %sql 70 | # MAGIC 71 | # MAGIC SHOW TABLES IN jacek_laskowski 72 | 73 | # COMMAND ---------- 74 | 75 | # MAGIC %sql 76 | # MAGIC 77 | # MAGIC SELECT * FROM ${table_name} 78 | -------------------------------------------------------------------------------- /Photon.py: -------------------------------------------------------------------------------- 1 | # Databricks notebook source 2 | # MAGIC %md # Photon 3 | # MAGIC 4 | # MAGIC This notebook is a list of _things_ (articles, talks, demos, etc.) to learn Photon from. 5 | 6 | # COMMAND ---------- 7 | 8 | # MAGIC %md 9 | # MAGIC 10 | # MAGIC ## Slogans 11 | # MAGIC 12 | # MAGIC [Slogan](https://dictionary.cambridge.org/dictionary/english/slogan) is a _a short and striking or memorable phrase used in advertising._ so it makes much sense to learn Photon by how the people behind this product want us to remember it. 13 | # MAGIC 14 | # MAGIC * [The next generation engine for the Lakehouse](https://www.databricks.com/product/photon) 15 | # MAGIC * [Photon: A High-Performance Query Engine for the Lakehouse](https://www.cidrdb.org/cidr2022/papers/a100-behm.pdf) 16 | # MAGIC * [Photon: A Fast Query Engine for Lakehouse Systems](https://cs.stanford.edu/~matei/papers/2022/sigmod_photon.pdf) 17 | 18 | # COMMAND ---------- 19 | 20 | # MAGIC %md ## Learn More 21 | # MAGIC 22 | # MAGIC ### Reading 23 | # MAGIC 24 | # MAGIC 1. [Photon: A Fast Query Engine for Lakehouse Systems](https://cs.stanford.edu/~matei/papers/2022/sigmod_photon.pdf) (_Photon whitepaper_) 25 | # MAGIC 1. [Photon: A High-Performance Query Engine for the Lakehouse](https://www.cidrdb.org/cidr2022/papers/a100-behm.pdf) 26 | # MAGIC 27 | # MAGIC ### Watching 28 | # MAGIC 29 | # MAGIC 1. 
[Photon for Dummies: How Does this New Execution Engine Actually Work?](https://www.databricks.com/dataaisummit/session/photon-dummies-how-does-new-execution-engine-actually-work/) 30 | # MAGIC 1. [Advancing Spark - The Photon Whitepaper](https://youtu.be/hxvQxI4FksY) 31 | # MAGIC 1. [Photon Technical Deep Dive: How to Think Vectorized](https://youtu.be/pNn5W4ujP3w) 32 | 33 | # COMMAND ---------- 34 | 35 | 36 | -------------------------------------------------------------------------------- /Databricks Asset Bundles/delta_live_tables_demo/resources/delta_live_tables_demo_job.yml: -------------------------------------------------------------------------------- 1 | # The main job for delta_live_tables_demo 2 | resources: 3 | jobs: 4 | delta_live_tables_demo_job: 5 | name: delta_live_tables_demo_job 6 | 7 | schedule: 8 | quartz_cron_expression: '44 37 8 * * ?' 9 | timezone_id: Europe/Amsterdam 10 | 11 | email_notifications: 12 | on_failure: 13 | - jacek@japila.pl 14 | 15 | tasks: 16 | - task_key: notebook_task 17 | job_cluster_key: job_cluster 18 | notebook_task: 19 | notebook_path: ../src/notebook.ipynb 20 | 21 | - task_key: refresh_pipeline 22 | depends_on: 23 | - task_key: notebook_task 24 | pipeline_task: 25 | pipeline_id: ${resources.pipelines.delta_live_tables_demo_pipeline.id} 26 | 27 | - task_key: main_task 28 | depends_on: 29 | - task_key: refresh_pipeline 30 | job_cluster_key: job_cluster 31 | python_wheel_task: 32 | package_name: delta_live_tables_demo 33 | entry_point: main 34 | libraries: 35 | # By default we just include the .whl file generated for the delta_live_tables_demo package. 36 | # See https://docs.databricks.com/dev-tools/bundles/library-dependencies.html 37 | # for more information on how to add other libraries. 38 | - whl: ../dist/*.whl 39 | 40 | job_clusters: 41 | - job_cluster_key: job_cluster 42 | new_cluster: 43 | spark_version: 13.3.x-scala2.12 44 | node_type_id: i3.xlarge 45 | autoscale: 46 | min_workers: 1 47 | max_workers: 4 48 | -------------------------------------------------------------------------------- /Delta Live Tables/Pipeline settings.py: -------------------------------------------------------------------------------- 1 | # Databricks notebook source 2 | # MAGIC %md # Pipeline settings 3 | # MAGIC 4 | # MAGIC Click the **Settings** button in the DLT UI 5 | # MAGIC 6 | # MAGIC There are two main UI settings views: 7 | # MAGIC 8 | # MAGIC 1. **UI** - a human-friendly view 9 | # MAGIC 1. **JSON** 10 | # MAGIC 11 | # MAGIC There is one extra available under the three-dots menu: 12 | # MAGIC 13 | # MAGIC 1. **Pipeline settings YAML** that can be used with Databricks Asset Bundles to source control and apply CI/CD to pipelines. 14 | 15 | # COMMAND ---------- 16 | 17 | # MAGIC %md 18 | # MAGIC ## General 19 | 20 | # COMMAND ---------- 21 | 22 | # MAGIC %md 23 | # MAGIC ## Source code 24 | # MAGIC 25 | # MAGIC ➡️ [Configure source code libraries](https://docs.databricks.com/en/delta-live-tables/settings.html#select-a-cluster-policy) 26 | # MAGIC 27 | # MAGIC Use the file selector in the Delta Live Tables UI to configure the source code defining your pipeline. 28 | # MAGIC 29 | # MAGIC Pipeline source code is defined in Databricks notebooks or in SQL or Python scripts stored in workspace files. 30 | # MAGIC 31 | # MAGIC You can add one or more notebooks or workspace files or a combination of notebooks and workspace files. 
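In a Databricks Asset Bundle, the equivalent source code configuration can be kept under version control as a pipeline resource. A minimal sketch is shown below; the resource key and the notebook/file paths are placeholders for illustration, not the settings of an actual pipeline in this repo:

```yaml
resources:
  pipelines:
    my_dlt_pipeline:
      name: my_dlt_pipeline
      libraries:
        - notebook:
            path: ../src/dlt_pipeline.ipynb
        - file:
            path: ../src/bronze_table.sql
```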
32 | 33 | # COMMAND ---------- 34 | 35 | # MAGIC %md 36 | # MAGIC 37 | # MAGIC ``` 38 | # MAGIC "libraries": [ 39 | # MAGIC { 40 | # MAGIC "notebook": { 41 | # MAGIC "path": "/Repos/jacek@japila.pl/learn-databricks/Delta Live Tables/delta-live-tables-bundle/five_record_table" 42 | # MAGIC } 43 | # MAGIC }, 44 | # MAGIC { 45 | # MAGIC "file": { 46 | # MAGIC "path": "/Repos/jacek@japila.pl/learn-databricks/Delta Live Tables/delta-live-tables-bundle/bronze_table.sql" 47 | # MAGIC } 48 | # MAGIC } 49 | # MAGIC ], 50 | # MAGIC ``` 51 | -------------------------------------------------------------------------------- /demo/uv_workflow/src/notebook.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": { 6 | "application/vnd.databricks.v1+cell": { 7 | "cellMetadata": {}, 8 | "inputWidgets": {}, 9 | "nuid": "ee353e42-ff58-4955-9608-12865bd0950e", 10 | "showTitle": false, 11 | "title": "" 12 | } 13 | }, 14 | "source": [ 15 | "# Default notebook\n", 16 | "\n", 17 | "This default notebook is executed using Databricks Workflows as defined in resources/uv_workflow.job.yml." 18 | ] 19 | }, 20 | { 21 | "cell_type": "code", 22 | "execution_count": 2, 23 | "metadata": {}, 24 | "outputs": [], 25 | "source": [ 26 | "%load_ext autoreload\n", 27 | "%autoreload 2" 28 | ] 29 | }, 30 | { 31 | "cell_type": "code", 32 | "execution_count": 0, 33 | "metadata": { 34 | "application/vnd.databricks.v1+cell": { 35 | "cellMetadata": { 36 | "byteLimit": 2048000, 37 | "rowLimit": 10000 38 | }, 39 | "inputWidgets": {}, 40 | "nuid": "6bca260b-13d1-448f-8082-30b60a85c9ae", 41 | "showTitle": false, 42 | "title": "" 43 | } 44 | }, 45 | "outputs": [], 46 | "source": [ 47 | "from uv_workflow import main\n", 48 | "\n", 49 | "main.get_taxis(spark).show(10)" 50 | ] 51 | } 52 | ], 53 | "metadata": { 54 | "application/vnd.databricks.v1+notebook": { 55 | "dashboards": [], 56 | "language": "python", 57 | "notebookMetadata": { 58 | "pythonIndentUnit": 2 59 | }, 60 | "notebookName": "notebook", 61 | "widgets": {} 62 | }, 63 | "kernelspec": { 64 | "display_name": "Python 3", 65 | "language": "python", 66 | "name": "python3" 67 | }, 68 | "language_info": { 69 | "name": "python", 70 | "version": "3.11.4" 71 | } 72 | }, 73 | "nbformat": 4, 74 | "nbformat_minor": 0 75 | } 76 | -------------------------------------------------------------------------------- /demo/pydantic_workflow/src/notebook.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": { 6 | "application/vnd.databricks.v1+cell": { 7 | "cellMetadata": {}, 8 | "inputWidgets": {}, 9 | "nuid": "ee353e42-ff58-4955-9608-12865bd0950e", 10 | "showTitle": false, 11 | "title": "" 12 | } 13 | }, 14 | "source": [ 15 | "# Default notebook\n", 16 | "\n", 17 | "This default notebook is executed using Databricks Workflows as defined in resources/pydantic_workflow.job.yml." 
18 | ] 19 | }, 20 | { 21 | "cell_type": "code", 22 | "execution_count": 2, 23 | "metadata": {}, 24 | "outputs": [], 25 | "source": [ 26 | "%load_ext autoreload\n", 27 | "%autoreload 2" 28 | ] 29 | }, 30 | { 31 | "cell_type": "code", 32 | "execution_count": 0, 33 | "metadata": { 34 | "application/vnd.databricks.v1+cell": { 35 | "cellMetadata": { 36 | "byteLimit": 2048000, 37 | "rowLimit": 10000 38 | }, 39 | "inputWidgets": {}, 40 | "nuid": "6bca260b-13d1-448f-8082-30b60a85c9ae", 41 | "showTitle": false, 42 | "title": "" 43 | } 44 | }, 45 | "outputs": [], 46 | "source": [ 47 | "from pydantic_workflow import main\n", 48 | "\n", 49 | "main.get_taxis(spark).show(10)" 50 | ] 51 | } 52 | ], 53 | "metadata": { 54 | "application/vnd.databricks.v1+notebook": { 55 | "dashboards": [], 56 | "language": "python", 57 | "notebookMetadata": { 58 | "pythonIndentUnit": 2 59 | }, 60 | "notebookName": "notebook", 61 | "widgets": {} 62 | }, 63 | "kernelspec": { 64 | "display_name": "Python 3", 65 | "language": "python", 66 | "name": "python3" 67 | }, 68 | "language_info": { 69 | "name": "python", 70 | "version": "3.11.4" 71 | } 72 | }, 73 | "nbformat": 4, 74 | "nbformat_minor": 0 75 | } 76 | -------------------------------------------------------------------------------- /Delta Lake/Delta Lake 3.1.0.sql: -------------------------------------------------------------------------------- 1 | -- Databricks notebook source 2 | -- MAGIC %md # Delta Lake 3.1.0 3 | -- MAGIC 4 | -- MAGIC [DeltaLake 3.1.0 RC3](https://github.com/delta-io/delta/releases/tag/v3.1.0rc3) just hit the shelves! 🚀 5 | -- MAGIC 6 | -- MAGIC Learn more in the [LinkedIn post](https://www.linkedin.com/feed/update/urn:li:activity:7157783263820861441?updateEntityUrn=urn%3Ali%3Afs_updateV2%3A%28urn%3Ali%3Aactivity%3A7157783263820861441%2CFEED_DETAIL%2CEMPTY%2CDEFAULT%2Cfalse%29), too. 7 | 8 | -- COMMAND ---------- 9 | 10 | -- MAGIC %md ## Auto Compaction 11 | -- MAGIC 12 | -- MAGIC **Auto compaction** to address the small files problem during table writes. Auto compaction which runs at the end of the write query combines small files within partitions to large files to reduce the metadata size and improve query performance. 13 | -- MAGIC 14 | -- MAGIC ### Learn More 15 | -- MAGIC 16 | -- MAGIC 1. [The official documentation of Delta Lake](https://docs.delta.io/3.1.0/optimizations-oss.html#auto-compaction) 17 | -- MAGIC 1. [The Internals of Delta Lake](https://books.japila.pl/delta-lake-internals/auto-compaction/) 18 | 19 | -- COMMAND ---------- 20 | 21 | -- MAGIC %md ## Liquid Clustering 22 | -- MAGIC 23 | -- MAGIC That's really really huge! 🔥 It is still marked as Experimental, but at least let people have a peek under the hood at how it really works. 24 | -- MAGIC 25 | -- MAGIC From the [announcement](https://github.com/delta-io/delta/releases/tag/v3.1.0rc3): 26 | -- MAGIC 27 | -- MAGIC > (Experimental) Liquid clustering for better table layout Now Delta allows clustering the data in a Delta table for better data skipping. Currently this is an experimental feature. See [documentation](https://docs.delta.io/3.1.0/delta-clustering.html) and [example](https://github.com/delta-io/delta/blob/branch-3.1/examples/scala/src/main/scala/example/Clustering.scala) for how to try out this feature. 
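A minimal way to try liquid clustering out in SQL could look like the following sketch (the table and column names are made up; in Delta Lake 3.1 the data is physically clustered when `OPTIMIZE` runs on the clustered table):

```sql
CREATE TABLE events (
  event_id BIGINT,
  event_time TIMESTAMP,
  city STRING)
USING delta
CLUSTER BY (city);

INSERT INTO events VALUES (0, current_timestamp(), 'Warsaw');

OPTIMIZE events;
```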
28 | -------------------------------------------------------------------------------- /demo/delta-live-tables/README.md: -------------------------------------------------------------------------------- 1 | # Delta Live Tables Pipeline Demo 2 | 3 | ```console 4 | $ tfi 5 | ... 6 | Terraform has been successfully initialized! 7 | ``` 8 | 9 | ```console 10 | tfa -auto-approve 11 | ``` 12 | 13 | Check out the pipeline. This step is completely optional. 14 | 15 | ```console 16 | $ databricks pipelines list | jq '.[] | { name, pipeline_id }' 17 | { 18 | "name": "EXPECT Clause Demo", 19 | "pipeline_id": "a02952e6-7197-44a4-a072-5ea5124d7bce" 20 | } 21 | ``` 22 | 23 | **IMPORTANT** Every push to the repo is not reflected (`git pull`) by the repo after `tfa` so you have to `tfd`. 24 | 25 | Run the pipeline. 26 | 27 | ```console 28 | databricks pipelines start --pipeline-id $(tfo -raw pipeline_id) 29 | ``` 30 | 31 | Wait until the pipeline finishes (until `IDLE` comes up from the following command). 32 | 33 | ```console 34 | while (true) 35 | do 36 | state=$(databricks pipelines get --pipeline-id $(tfo -raw pipeline_id) | jq --raw-output '.state') 37 | if [[ $state =~ "IDLE" ]]; then 38 | echo "Pipeline stopped (state: $state)" 39 | break; 40 | fi 41 | echo "Waiting for the pipeline to stop (state: $state)" 42 | sleep 5 43 | done 44 | ``` 45 | 46 | Switch to the DLT UI. Select (_click_) the `raw_streaming_table` streaming live table and review the **Data quality** section. 47 | 48 | Upload data again and re-run the pipeline. 49 | 50 | ```console 51 | databricks fs cp input-data/2.csv dbfs:$(tfo -raw input_dir) 52 | ``` 53 | 54 | ```console 55 | databricks pipelines start --pipeline-id $(tfo -raw pipeline_id) 56 | ``` 57 | 58 | Review the events delta table (use **Data Quality Checks** cell in [Storage location](../../Delta%20Live%20Tables/Storage%20location.sql) notebook). 59 | 60 | ## Clean Up 61 | 62 | ```console 63 | tfd -auto-approve 64 | ``` 65 | 66 | ```console 67 | databricks fs rm -r dbfs:/FileStore/jacek_laskowski/delta-live-tables-demo-input 68 | ``` 69 | -------------------------------------------------------------------------------- /Databricks Asset Bundles/delta_live_tables_demo/my_project/databricks.yml: -------------------------------------------------------------------------------- 1 | # This is a Databricks asset bundle definition for my_project. 2 | # See https://docs.databricks.com/dev-tools/bundles/index.html for documentation. 3 | bundle: 4 | name: my_project 5 | 6 | include: 7 | - resources/*.yml 8 | 9 | targets: 10 | # The 'dev' target, used for development purposes. 11 | # Whenever a developer deploys using 'dev', they get their own copy. 12 | dev: 13 | # We use 'mode: development' to make sure everything deployed to this target gets a prefix 14 | # like '[dev my_user_name]'. Setting this mode also disables any schedules and 15 | # automatic triggers for jobs and enables the 'development' mode for Delta Live Tables pipelines. 16 | mode: development 17 | default: true 18 | workspace: 19 | host: https://training-partners.cloud.databricks.com 20 | 21 | # Optionally, there could be a 'staging' target here. 22 | # (See Databricks docs on CI/CD at https://docs.databricks.com/dev-tools/bundles/index.html.) 23 | # 24 | # staging: 25 | # workspace: 26 | # host: https://training-partners.cloud.databricks.com 27 | 28 | # The 'prod' target, used for production deployment. 
29 | prod: 30 | # For production deployments, we only have a single copy, so we override the 31 | # workspace.root_path default of 32 | # /Users/${workspace.current_user.userName}/.bundle/${bundle.target}/${bundle.name} 33 | # to a path that is not specific to the current user. 34 | mode: production 35 | workspace: 36 | host: https://training-partners.cloud.databricks.com 37 | root_path: /Shared/.bundle/prod/${bundle.name} 38 | run_as: 39 | # This runs as jacek@japila.pl in production. Alternatively, 40 | # a service principal could be used here using service_principal_name 41 | # (see Databricks documentation). 42 | user_name: jacek@japila.pl 43 | -------------------------------------------------------------------------------- /Generative AI/Llama.py: -------------------------------------------------------------------------------- 1 | # Databricks notebook source 2 | # MAGIC %md # Meta Llama 3 | 4 | # COMMAND ---------- 5 | 6 | # MAGIC %md 7 | # MAGIC 8 | # MAGIC ## Introduction 9 | # MAGIC 10 | # MAGIC * State-of-the-art open source language model 11 | # MAGIC * Available on Databricks using [Foundation Model APIs]($./Foundation Models) 12 | 13 | # COMMAND ---------- 14 | 15 | # MAGIC %md 16 | # MAGIC 17 | # MAGIC ## Llama 3.1 18 | # MAGIC 19 | # MAGIC [Announcing the availability of Llama 3.1 models on the Databricks Data Intelligence Platform](https://www.databricks.com/blog/new-standard-open-source-ai-meta-llama-31-databricks): 20 | # MAGIC 21 | # MAGIC * Llama 3.1 series of open source language models 22 | # MAGIC * Meta Llama 3.1-8B-Instruct 23 | # MAGIC * Meta Llama 3.1-70B-Instruct 24 | # MAGIC * Meta Llama 3.1-405B-Instruct 25 | # MAGIC * New Llama 3.1 models available on Databricks 26 | # MAGIC * Unity Catalog's [system.ai](https://docs.databricks.com/en/generative-ai/pretrained-models.html) catalog 27 | # MAGIC * Served using [Mosaic AI Model Serving]($./Model Serving) 28 | # MAGIC * Used to build production-scale and high-quality GenAI applications 29 | # MAGIC * Databricks customers can use Mosaic AI to serve and fine-tune the Llama 3.1 models 30 | # MAGIC * Connect the models to [Retrieval Augmented Generation (RAG)]($./Retrieval Augmented Generation) and agentic systems 31 | # MAGIC * Synthetic data generation 32 | # MAGIC * Real-time batch inference 33 | # MAGIC * Leverage the models for scalable evaluation 34 | 35 | # COMMAND ---------- 36 | 37 | # MAGIC %md 38 | # MAGIC 39 | # MAGIC ## Get Started 40 | # MAGIC 41 | # MAGIC * Visit the [Mosaic AI Playground](https://docs.databricks.com/en/large-language-models/ai-playground.html) to quickly try Meta Llama 3.1 and other Foundation Models directly from your workspace 42 | # MAGIC * [Get started querying LLMs on Databricks](https://docs.databricks.com/en/large-language-models/llm-serving-intro.html) 43 | -------------------------------------------------------------------------------- /meetups/README.md: -------------------------------------------------------------------------------- 1 | # Meetups 2 | 3 | This directory contains the "agenda" notebooks of the past **Databricks Talks** series of the [Warsaw Data Engineering](https://www.meetup.com/warsaw-data-engineering/) meetup group. 4 | 5 | The meetup group uses [lu.ma](https://lu.ma/warsaw-data-engineering) for meetup announcements. 6 | 7 | 1. [Meetup next](./Meetup_next.ipynb) 8 | 1. [2025_12_11](./Meetup_2025_12_11.ipynb) Developing AI Agents with DSPy and MLflow in Databricks 9 | 1. [2025_10_30](./Meetup_2025_10_30.ipynb) Developing AI Programs with DSPy 10 | 1. 
[2025_10_23](./Meetup_2025_10_23.ipynb) Building Model Context Protocol (MCP) servers in Python cntd. 11 | 1. [2025_10_16](./Meetup_2025_10_16.ipynb) Building Model Context Protocol (MCP) servers in Python 12 | 1. [2025_08_21](./Meetup_2025_08_21.ipynb) Learn Python through functools module (and OpenAI's Python API) 13 | 1. [2025_06_26](./Meetup_2025_06_26.ipynb) MLflow 3.1 and Classic ML Models on Lakeflow Declarative Pipelines 14 | 1. [2025_06_12](./Meetup_2025_06_12.ipynb) Deploy and Query Models in Databricks 15 | 1. [2025_05_29](./Meetup_2025_05_29.ipynb) Model Lifecycle in Databricks Machine Learning 16 | 1. [2025_05_22](./Meetup_2025_05_22.ipynb) Databricks Machine Learning and MLflow Client API 17 | 1. [2025_05_15](./Meetup_2025_05_15.ipynb) Managed MLflow and Databricks Machine Learning 18 | 1. [2025_05_08](./Meetup_2025_05_08.ipynb) MLflow on Databricks 19 | 1. [2025_04_10](./Meetup_2025_04_10.ipynb) Learn MLflow from `mlflow/models/docker_utils.py` 20 | 1. [2025_03_27](./Meetup_2025_03_27.ipynb) MLflow (Local) Serving 21 | 1. [2025_03_13](./2025_03_13.ipynb) Intro to Delta Live Tables (DLT) 22 | 1. [2025_03_06](./2025_03_06.ipynb) Intro to MLflow (with uv) 23 | 1. [2025_02_20](./Meetup_2025_02_20.ipynb) Data Quality in Databricks Workflows (jobs) with Pydantic 24 | 1. [2025_02_06](./Meetup_2025_02_06.sql) Data Quality in Databricks Workflows with Pydantic cntd. 25 | 1. [2025_01_30](./Meetup_2025_01_30.sql) 26 | 1. [2025_01_09](./Meetup_2025_01_09.sql) 27 | -------------------------------------------------------------------------------- /Delta Live Tables/Full Refresh.sql: -------------------------------------------------------------------------------- 1 | -- Databricks notebook source 2 | -- MAGIC %md # Full Refresh 3 | -- MAGIC 4 | -- MAGIC Let's deep dive into **Full Refresh** feature of DLTs. 5 | -- MAGIC 6 | -- MAGIC You can trigger a full refresh of a DLT pipeline using: 7 | -- MAGIC 8 | -- MAGIC * The Pipelines UI under **Start > Full refresh all** 9 | -- MAGIC * Delta Live Tables CLI `databricks pipelines start --full-refresh` 10 | 11 | -- COMMAND ---------- 12 | 13 | -- MAGIC %md ## DESCRIBE HISTORY 14 | 15 | -- COMMAND ---------- 16 | 17 | USE jaceklaskowski_meetup 18 | 19 | -- COMMAND ---------- 20 | 21 | SHOW TABLES; 22 | 23 | -- COMMAND ---------- 24 | 25 | SELECT version, operation, operationParameters, readVersion, isolationLevel, isBlindAppend, operationMetrics, engineInfo 26 | FROM (DESCRIBE HISTORY my_streaming_table) 27 | 28 | -- COMMAND ---------- 29 | 30 | -- MAGIC %md ## Full Refresh All 31 | -- MAGIC 32 | -- MAGIC ```console 33 | -- MAGIC databricks pipelines start --full-refresh --pipeline-id 3a69ffe2-d42a-47b5-8731-84e7ffb3c844 34 | -- MAGIC ``` 35 | 36 | -- COMMAND ---------- 37 | 38 | SELECT * FROM my_streaming_table 39 | 40 | -- COMMAND ---------- 41 | 42 | -- MAGIC %md ## Demo 43 | -- MAGIC 44 | -- MAGIC This demo shows Full refresh all to fix a header issue with Auto Loader in a DLT pipeline. 45 | -- MAGIC 46 | -- MAGIC 1. Without `header` option, CSVs with headers are processed as if they had one record extra (the header) 47 | -- MAGIC 1. Once fixed and Full refresh all, the Streaming table should have proper records 48 | -- MAGIC 49 | -- MAGIC Use my_streaming_table notebook. 50 | 51 | -- COMMAND ---------- 52 | 53 | SHOW TABLES in jaceklaskowski_meetup 54 | 55 | -- COMMAND ---------- 56 | 57 | -- WRONG: Show all records, incl. 
headers 58 | select * from jaceklaskowski_meetup.my_streaming_table 59 | 60 | -- COMMAND ---------- 61 | 62 | -- CORRECT: Show all records with no headers this time 63 | select * from jaceklaskowski_meetup.my_streaming_table 64 | -------------------------------------------------------------------------------- /Databricks Asset Bundles/delta_live_tables_demo/databricks.yml: -------------------------------------------------------------------------------- 1 | # This is a Databricks asset bundle definition for delta_live_tables_demo. 2 | # See https://docs.databricks.com/dev-tools/bundles/index.html for documentation. 3 | bundle: 4 | name: delta_live_tables_demo 5 | 6 | include: 7 | - resources/*.yml 8 | 9 | targets: 10 | # The 'dev' target, used for development purposes. 11 | # Whenever a developer deploys using 'dev', they get their own copy. 12 | meetup: 13 | default: true 14 | # FIXME: A bug? 15 | # workspace: 16 | # profile: default 17 | dev: 18 | # We use 'mode: development' to make sure everything deployed to this target gets a prefix 19 | # like '[dev my_user_name]'. Setting this mode also disables any schedules and 20 | # automatic triggers for jobs and enables the 'development' mode for Delta Live Tables pipelines. 21 | mode: development 22 | workspace: 23 | host: https://training-partners.cloud.databricks.com 24 | 25 | # Optionally, there could be a 'staging' target here. 26 | # (See Databricks docs on CI/CD at https://docs.databricks.com/dev-tools/bundles/index.html.) 27 | # 28 | # staging: 29 | # workspace: 30 | # host: https://training-partners.cloud.databricks.com 31 | 32 | # The 'prod' target, used for production deployment. 33 | prod: 34 | # For production deployments, we only have a single copy, so we override the 35 | # workspace.root_path default of 36 | # /Users/${workspace.current_user.userName}/.bundle/${bundle.target}/${bundle.name} 37 | # to a path that is not specific to the current user. 38 | mode: production 39 | workspace: 40 | host: https://training-partners.cloud.databricks.com 41 | root_path: /Shared/.bundle/prod/${bundle.name} 42 | run_as: 43 | # This runs as jacek@japila.pl in production. Alternatively, 44 | # a service principal could be used here using service_principal_name 45 | # (see Databricks documentation). 46 | user_name: jacek@japila.pl 47 | -------------------------------------------------------------------------------- /Delta Live Tables/Materialization.sql: -------------------------------------------------------------------------------- 1 | -- Databricks notebook source 2 | -- MAGIC %md # Materialization 3 | 4 | -- COMMAND ---------- 5 | 6 | -- MAGIC %md ## Review Me 7 | -- MAGIC 8 | -- MAGIC 1. https://www.databricks.com/glossary/materialized-views 9 | -- MAGIC 1. https://docs.databricks.com/en/sql/user/materialized-views.html 10 | -- MAGIC 1. 
https://www.google.com/search?q=databricks+materialized+view 11 | 12 | -- COMMAND ---------- 13 | 14 | -- MAGIC %md ## CREATE TABLE 15 | -- MAGIC 16 | -- MAGIC [CREATE TABLE](https://docs.databricks.com/en/sql/language-manual/sql-ref-syntax-ddl-create-table-using.html) 17 | -- MAGIC 18 | -- MAGIC ```sql 19 | -- MAGIC { { [CREATE OR] REPLACE TABLE | CREATE [EXTERNAL] TABLE [ IF NOT EXISTS ] } 20 | -- MAGIC table_name 21 | -- MAGIC [ table_specification ] 22 | -- MAGIC [ USING data_source ] 23 | -- MAGIC [ table_clauses ] 24 | -- MAGIC [ AS query ] } 25 | -- MAGIC ``` 26 | 27 | -- COMMAND ---------- 28 | 29 | -- MAGIC %md ## AS query 30 | -- MAGIC 31 | -- MAGIC > This optional clause populates the table using the data from query. When you specify a query you must not also specify a table_specification. The table schema is derived from the query. 32 | -- MAGIC 33 | -- MAGIC > Note that Databricks overwrites the underlying data source with the data of the input query, to make sure the table gets created contains exactly the same data as the input query. 34 | -- MAGIC 35 | -- MAGIC 36 | 37 | -- COMMAND ---------- 38 | 39 | CREATE TABLE IF NOT EXISTS demo_table 40 | AS SELECT * FROM VALUES 1,2,3,4 41 | 42 | -- COMMAND ---------- 43 | 44 | DESCRIBE EXTENDED demo_table 45 | 46 | -- COMMAND ---------- 47 | 48 | -- MAGIC %md ## Materialized Views 49 | -- MAGIC 50 | -- MAGIC [Materialized Views](https://www.databricks.com/glossary/materialized-views): 51 | -- MAGIC 52 | -- MAGIC > A materialized view is a database object that stores the results of a query as a physical table. Unlike regular database views, which are virtual and derive their data from the underlying tables, materialized views contain precomputed data that is incrementally updated on a schedule or on demand. 53 | 54 | -- COMMAND ---------- 55 | 56 | 57 | -------------------------------------------------------------------------------- /Databricks Workflows/for_each_task_demo/README.md: -------------------------------------------------------------------------------- 1 | # For each Task Demo 2 | 3 | The 'for_each_task_demo' project was generated by using the default-python template of [Databricks Asset Bundles](https://docs.databricks.com/en/dev-tools/bundles/index.html). 4 | 5 | ## Run Demo 6 | 7 | Deploy the demo project first. 8 | 9 | ```bash 10 | databricks bundle deploy 11 | ``` 12 | 13 | Run the job. 14 | 15 | ```bash 16 | databricks bundle run for_each_task_demo_job 17 | ``` 18 | 19 | ## Getting started 20 | 21 | 1. Install the Databricks CLI from https://docs.databricks.com/dev-tools/cli/databricks-cli.html 22 | 23 | 2. Authenticate to your Databricks workspace, if you have not done so already: 24 | 25 | ```bash 26 | $ databricks configure 27 | ``` 28 | 29 | 3. To deploy a development copy of this project, type: 30 | ``` 31 | $ databricks bundle deploy --target dev 32 | ``` 33 | (Note that "dev" is the default target, so the `--target` parameter 34 | is optional here.) 35 | 36 | This deploys everything that's defined for this project. 37 | For example, the default template would deploy a job called 38 | `[dev yourname] for_each_task_demo_job` to your workspace. 39 | You can find that job by opening your workspace and clicking on **Workflows**. 40 | 41 | 4. Similarly, to deploy a production copy, type: 42 | ``` 43 | $ databricks bundle deploy --target prod 44 | ``` 45 | 46 | Note that the default job from the template has a schedule that runs every day 47 | (defined in resources/for_each_task_demo.job.yml).
The schedule 48 | is paused when deploying in development mode (see 49 | https://docs.databricks.com/dev-tools/bundles/deployment-modes.html). 50 | 51 | 5. To run a job or pipeline, use the "run" command: 52 | ``` 53 | $ databricks bundle run 54 | ``` 55 | 56 | 6. Optionally, install developer tools such as the Databricks extension for Visual Studio Code from 57 | https://docs.databricks.com/dev-tools/vscode-ext.html. 58 | 59 | 7. For documentation on the Databricks asset bundles format used 60 | for this project, and for CI/CD configuration, see 61 | https://docs.databricks.com/dev-tools/bundles/index.html. 62 | -------------------------------------------------------------------------------- /Python/Pyenv.py: -------------------------------------------------------------------------------- 1 | # Databricks notebook source 2 | # MAGIC %md # Pyenv 3 | # MAGIC 4 | # MAGIC [Simple Python Version Management: pyenv](https://github.com/pyenv/pyenv) 5 | 6 | # COMMAND ---------- 7 | 8 | # MAGIC %md ## Installation 9 | # MAGIC 10 | # MAGIC On macos: 11 | # MAGIC 12 | # MAGIC ```shell 13 | # MAGIC brew update 14 | # MAGIC brew install pyenv 15 | # MAGIC ``` 16 | # MAGIC 17 | # MAGIC or [Installation](https://github.com/pyenv/pyenv#installation) 18 | 19 | # COMMAND ---------- 20 | 21 | # MAGIC %md 22 | # MAGIC 23 | # MAGIC ``` 24 | # MAGIC $ pyenv --version 25 | # MAGIC pyenv 2.3.30 26 | # MAGIC ``` 27 | 28 | # COMMAND ---------- 29 | 30 | # MAGIC %md 31 | # MAGIC 32 | # MAGIC ```shell 33 | # MAGIC $ pyenv install 3.12 34 | # MAGIC python-build: use openssl@3 from homebrew 35 | # MAGIC python-build: use readline from homebrew 36 | # MAGIC Downloading Python-3.12.0.tar.xz... 37 | # MAGIC -> https://www.python.org/ftp/python/3.12.0/Python-3.12.0.tar.xz 38 | # MAGIC Installing Python-3.12.0... 39 | # MAGIC python-build: use tcl-tk from homebrew 40 | # MAGIC python-build: use readline from homebrew 41 | # MAGIC python-build: use ncurses from homebrew 42 | # MAGIC python-build: use zlib from xcode sdk 43 | # MAGIC Installed Python-3.12.0 to /Users/jacek/.pyenv/versions/3.12.0 44 | # MAGIC ``` 45 | # MAGIC 46 | # MAGIC ```shell 47 | # MAGIC $ pyenv virtualenv 3.12 databricks-cli 48 | # MAGIC ``` 49 | # MAGIC 50 | # MAGIC ```shell 51 | # MAGIC $ pyenv activate databricks-cli 52 | # MAGIC ``` 53 | # MAGIC 54 | # MAGIC ```shell 55 | # MAGIC $ python --version 56 | # MAGIC Python 3.12.0 57 | # MAGIC ``` 58 | # MAGIC 59 | # MAGIC ```shell 60 | # MAGIC $ pyenv deactivate 61 | # MAGIC ``` 62 | # MAGIC 63 | # MAGIC ```shell 64 | # MAGIC $ python --version 65 | # MAGIC Python 3.11.6 66 | # MAGIC ``` 67 | # MAGIC 68 | # MAGIC ```shell 69 | # MAGIC $ pyenv local databricks-cli 70 | # MAGIC ``` 71 | # MAGIC 72 | # MAGIC ```shell 73 | # MAGIC $ pyenv local 74 | # MAGIC ``` 75 | # MAGIC 76 | # MAGIC A special version name `system` means to use whatever Python is found on PATH after the shims PATH entry (in other words, whatever would be run if Pyenv shims weren't on PATH). 
77 | # MAGIC 78 | # MAGIC ```shell 79 | # MAGIC $ pyenv global 80 | # MAGIC system 81 | # MAGIC ``` 82 | -------------------------------------------------------------------------------- /Databricks Asset Bundles/delta_live_tables_demo/src/dlt_pipeline.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": { 6 | "application/vnd.databricks.v1+cell": { 7 | "cellMetadata": {}, 8 | "inputWidgets": {}, 9 | "nuid": "9a626959-61c8-4bba-84d2-2a4ecab1f7ec", 10 | "showTitle": false, 11 | "title": "" 12 | } 13 | }, 14 | "source": [ 15 | "# DLT pipeline\n", 16 | "\n", 17 | "This Delta Live Tables (DLT) definition is executed using a pipeline defined in resources/delta_live_tables_demo_pipeline.yml." 18 | ] 19 | }, 20 | { 21 | "cell_type": "code", 22 | "execution_count": 0, 23 | "metadata": { 24 | "application/vnd.databricks.v1+cell": { 25 | "cellMetadata": {}, 26 | "inputWidgets": {}, 27 | "nuid": "9198e987-5606-403d-9f6d-8f14e6a4017f", 28 | "showTitle": false, 29 | "title": "" 30 | } 31 | }, 32 | "outputs": [], 33 | "source": [ 34 | "# Import DLT and src/delta_live_tables_demo\n", 35 | "import dlt\n", 36 | "import sys\n", 37 | "from pyspark.sql.functions import expr\n", 38 | "from delta_live_tables_demo import main" 39 | ] 40 | }, 41 | { 42 | "cell_type": "code", 43 | "execution_count": 0, 44 | "metadata": { 45 | "application/vnd.databricks.v1+cell": { 46 | "cellMetadata": {}, 47 | "inputWidgets": {}, 48 | "nuid": "3fc19dba-61fd-4a89-8f8c-24fee63bfb14", 49 | "showTitle": false, 50 | "title": "" 51 | } 52 | }, 53 | "outputs": [], 54 | "source": [ 55 | "@dlt.view\n", 56 | "def taxi_raw():\n", 57 | " return main.get_taxis()\n", 58 | "\n", 59 | "@dlt.table\n", 60 | "def filtered_taxis():\n", 61 | " return dlt.read(\"taxi_raw\").filter(expr(\"fare_amount < 30\"))" 62 | ] 63 | } 64 | ], 65 | "metadata": { 66 | "application/vnd.databricks.v1+notebook": { 67 | "dashboards": [], 68 | "language": "python", 69 | "notebookMetadata": { 70 | "pythonIndentUnit": 2 71 | }, 72 | "notebookName": "dlt_pipeline", 73 | "widgets": {} 74 | }, 75 | "kernelspec": { 76 | "display_name": "Python 3", 77 | "language": "python", 78 | "name": "python3" 79 | }, 80 | "language_info": { 81 | "name": "python", 82 | "version": "3.11.4" 83 | } 84 | }, 85 | "nbformat": 4, 86 | "nbformat_minor": 0 87 | } 88 | -------------------------------------------------------------------------------- /review_me/python only please.py: -------------------------------------------------------------------------------- 1 | # Databricks notebook source 2 | # MAGIC %md # Create DLT Table (Live Table) using Python API 3 | 4 | # COMMAND ---------- 5 | 6 | dbutils.widgets.text(name='filename', defaultValue='dbfs:/FileStore/books.csv') 7 | filename = dbutils.widgets.get('filename') 8 | 9 | # COMMAND ---------- 10 | 11 | import dlt 12 | 13 | # COMMAND ---------- 14 | 15 | # MAGIC %py 16 | # MAGIC 17 | # MAGIC # A regular PySpark data loading pattern 18 | # MAGIC # dataframe = spark.read.format('csv').option('header', True).load('dbfs:/FileStore/books.csv') 19 | # MAGIC # display(dataframe) 20 | # MAGIC 21 | # MAGIC # What am I supposed to do with the two below 22 | # MAGIC # to create a DLT live table in Python? 
23 | # MAGIC 24 | # MAGIC # @dlt.table Decorator 25 | # MAGIC # The Python table and view functions must return a DataFrame 26 | # MAGIC 27 | # MAGIC from pyspark.sql import DataFrame 28 | # MAGIC 29 | # MAGIC # decorators beg for methods 30 | # MAGIC 31 | # MAGIC # A DLT data loading pattern 32 | # MAGIC 33 | # MAGIC @dlt.table(name='raw_books') 34 | # MAGIC def raw_load_csv() -> DataFrame: 35 | # MAGIC return spark.read.format('csv').option('header', True).load(filename) 36 | 37 | # COMMAND ---------- 38 | 39 | @dlt.table 40 | def silver_book_titles() -> DataFrame: 41 | # The following won't work as we renamed the table name using @dlt.table(name=...) 42 | # return spark.table('live.raw_load_csv').select('title') 43 | 44 | # This is how to access Spark property 45 | column_name = spark.conf.get('column_name') 46 | return spark.table('live.raw_books').select(column_name) 47 | 48 | # COMMAND ---------- 49 | 50 | from pyspark.sql import functions as F 51 | 52 | 53 | @dlt.table 54 | def golden_upper_titles() -> DataFrame: 55 | return spark.table('live.silver_book_titles').select(F.upper('title').alias('upper_title')) 56 | 57 | # COMMAND ---------- 58 | 59 | # MAGIC %sql 60 | # MAGIC 61 | # MAGIC CREATE LIVE TABLE sql_in_python 62 | # MAGIC AS SELECT * FROM range(0, 5) 63 | 64 | # COMMAND ---------- 65 | 66 | # MAGIC %sql 67 | # MAGIC 68 | # MAGIC show tables 69 | 70 | # COMMAND ---------- 71 | 72 | from pyspark.sql import functions as F 73 | spark.table('my_table').select(F.upper('name')).display() 74 | 75 | # COMMAND ---------- 76 | 77 | # MAGIC %sql insert into my_table values (0, 'ania') 78 | 79 | # COMMAND ---------- 80 | 81 | 82 | -------------------------------------------------------------------------------- /PySpark/PySpark.py: -------------------------------------------------------------------------------- 1 | # Databricks notebook source 2 | # MAGIC %md # PySpark 3 | 4 | # COMMAND ---------- 5 | 6 | # MAGIC %md # Create DataFrame From NumPy Array 7 | 8 | # COMMAND ---------- 9 | 10 | # MAGIC %pip install numpy matplotlib scipy 11 | 12 | # COMMAND ---------- 13 | 14 | dbutils.library.restartPython() 15 | 16 | # COMMAND ---------- 17 | 18 | # https://realpython.com/preview/numpy-random-normal/ 19 | import numpy as np 20 | rng = np.random.default_rng() 21 | numbers = rng.normal(size=10_000) 22 | nums = spark.createDataFrame(numbers) 23 | display(nums) 24 | 25 | # COMMAND ---------- 26 | 27 | # MAGIC %md # Standard Functions 28 | # MAGIC 29 | # MAGIC [pyspark.sql.functions](https://spark.apache.org/docs/latest/api/python/reference/pyspark.sql/functions.html) 30 | 31 | # COMMAND ---------- 32 | 33 | # MAGIC %md # Basic Aggregation with pandas UDFs 34 | 35 | # COMMAND ---------- 36 | 37 | import pandas as pd 38 | from pyspark.sql.functions import pandas_udf 39 | 40 | # COMMAND ---------- 41 | 42 | # MAGIC %md 43 | # MAGIC 44 | # MAGIC Learn more about [pandas.Series](https://pandas.pydata.org/docs/reference/api/pandas.Series.html). 
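# MAGIC
# MAGIC As a quick sketch (plain pandas, no Spark needed), this is the kind of element-wise `Series` arithmetic the pandas UDF below relies on — the sample values are made up:
# MAGIC
# MAGIC ```python
# MAGIC import pandas as pd
# MAGIC
# MAGIC vs = pd.Series([-0.0012, 0.0031, 1.25])
# MAGIC # abs -> scale -> round -> modulo, all element-wise, returning a new Series
# MAGIC print((vs.abs() * 1000).round() % 2)
# MAGIC ```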
45 | 46 | # COMMAND ---------- 47 | 48 | @pandas_udf(returnType = "long") 49 | def group_id(vs: pd.Series) -> pd.Series: 50 | return (vs.abs() * 1000).round() % 2 51 | 52 | # COMMAND ---------- 53 | 54 | with_gid = nums.withColumn("gid", group_id(nums.value)) 55 | display(with_gid) 56 | 57 | # COMMAND ---------- 58 | 59 | display(with_gid.groupby("gid").count()) 60 | 61 | # COMMAND ---------- 62 | 63 | @pandas_udf(returnType = "long") 64 | def my_count(s: pd.Series) -> 'long': 65 | return pd.Series(s.count()) 66 | 67 | # COMMAND ---------- 68 | 69 | grouped_nums = with_gid.groupBy("gid") 70 | count_by_gid_agg = my_count("gid").alias("count") 71 | counts_by_gid = grouped_nums.agg(count_by_gid_agg) 72 | 73 | # COMMAND ---------- 74 | 75 | display(counts_by_gid) 76 | 77 | # COMMAND ---------- 78 | 79 | # MAGIC %md # DataFrame Partitions 80 | # MAGIC 81 | # MAGIC [pyspark.sql.functions.spark_partition_id](https://spark.apache.org/docs/latest/api/python/reference/pyspark.sql/api/pyspark.sql.functions.spark_partition_id.html#pyspark.sql.functions.spark_partition_id) 82 | 83 | # COMMAND ---------- 84 | 85 | from pyspark.sql.functions import spark_partition_id 86 | 87 | display(counts_by_gid.withColumn("spark_partition_id", spark_partition_id())) 88 | -------------------------------------------------------------------------------- /Generative AI/llm-rag-chatbot/config.py: -------------------------------------------------------------------------------- 1 | # Databricks notebook source 2 | # MAGIC %md 3 | # MAGIC ## Configuration file 4 | # MAGIC 5 | # MAGIC Please change your catalog and schema here to run the demo on a different catalog. 6 | # MAGIC 7 | # MAGIC 8 | # MAGIC 9 | 10 | # COMMAND ---------- 11 | 12 | VECTOR_SEARCH_ENDPOINT_NAME="dbdemos_vs_endpoint" 13 | 14 | DATABRICKS_SITEMAP_URL = "https://docs.databricks.com/en/doc-sitemap.xml" 15 | 16 | catalog = "main" 17 | 18 | #email = spark.sql('select current_user() as user').collect()[0]['user'] 19 | #username = email.split('@')[0].replace('.', '_') 20 | #dbName = db = f"rag_chatbot_{username}" 21 | dbName = db = "rag_chatbot" 22 | 23 | # COMMAND ---------- 24 | 25 | # MAGIC %md 26 | # MAGIC ### License 27 | # MAGIC This demo installs the following external libraries on top of DBR(ML): 28 | # MAGIC 29 | # MAGIC 30 | # MAGIC | Library | License | 31 | # MAGIC |---------|---------| 32 | # MAGIC | langchain | [MIT](https://github.com/langchain-ai/langchain/blob/master/LICENSE) | 33 | # MAGIC | lxml | [BSD-3](https://pypi.org/project/lxml/) | 34 | # MAGIC | transformers | [Apache 2.0](https://github.com/huggingface/transformers/blob/main/LICENSE) | 35 | # MAGIC | unstructured | [Apache 2.0](https://github.com/Unstructured-IO/unstructured/blob/main/LICENSE.md) | 36 | # MAGIC | llama-index | [MIT](https://github.com/run-llama/llama_index/blob/main/LICENSE) | 37 | # MAGIC | tesseract | [Apache 2.0](https://github.com/tesseract-ocr/tesseract/blob/main/LICENSE) | 38 | # MAGIC | poppler-utils | [MIT](https://github.com/skmetaly/poppler-utils/blob/master/LICENSE) | 39 | # MAGIC | textstat | [MIT](https://pypi.org/project/textstat/) | 40 | # MAGIC | tiktoken | [MIT](https://github.com/openai/tiktoken/blob/main/LICENSE) | 41 | # MAGIC | evaluate | [Apache2](https://pypi.org/project/evaluate/) | 42 | # MAGIC | torch | [BSD-3](https://github.com/intel/torch/blob/master/LICENSE.md) | 43 | # MAGIC | tiktoken | [MIT](https://github.com/openai/tiktoken/blob/main/LICENSE) | 44 | # MAGIC 45 | # MAGIC 46 | # MAGIC 47 | # MAGIC 48 |
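# MAGIC
# MAGIC A sketch of how a demo notebook might consume these values after running this config notebook with `%run` (how `catalog` and `dbName` are combined below is an assumption for illustration):
# MAGIC
# MAGIC ```python
# MAGIC # After %run ./config, the variables defined above are in scope
# MAGIC spark.sql(f"CREATE SCHEMA IF NOT EXISTS {catalog}.{dbName}")
# MAGIC spark.sql(f"USE {catalog}.{dbName}")
# MAGIC print(f"Using {catalog}.{dbName} with Vector Search endpoint {VECTOR_SEARCH_ENDPOINT_NAME}")
# MAGIC ```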
-------------------------------------------------------------------------------- /Apache Spark/Parameterized Queries.sql: -------------------------------------------------------------------------------- 1 | -- Databricks notebook source 2 | -- MAGIC %md 3 | -- MAGIC # Parameterized Queries 4 | -- MAGIC 5 | -- MAGIC [The Internals of Spark SQL](https://books.japila.pl/spark-sql-internals/parameterized-queries/) 6 | 7 | -- COMMAND ---------- 8 | 9 | -- MAGIC %md 10 | -- MAGIC 11 | -- MAGIC ## Parameter markers 12 | -- MAGIC 13 | -- MAGIC [Parameter markers](https://docs.databricks.com/en/sql/language-manual/sql-ref-parameter-marker.html) 14 | 15 | -- COMMAND ---------- 16 | 17 | -- MAGIC %md 18 | -- MAGIC 19 | -- MAGIC The following parameterized query does not seem to work in Databricks (as I hoped) and fails with the exception: 20 | -- MAGIC 21 | -- MAGIC > org.apache.spark.sql.catalyst.ExtendedAnalysisException: [UNBOUND_SQL_PARAMETER] Found the unbound parameter: limitA. Please, fix `args` and provide a mapping of the parameter to either a SQL literal or collection constructor functions such as `map()`, `array()`, `struct()`. SQLSTATE: 42P02; line 4 pos 6; 22 | -- MAGIC 23 | -- MAGIC ```sql 24 | -- MAGIC WITH a AS (SELECT 1 c) 25 | -- MAGIC SELECT * 26 | -- MAGIC FROM a 27 | -- MAGIC LIMIT : limitA 28 | -- MAGIC ``` 29 | 30 | -- COMMAND ---------- 31 | 32 | -- MAGIC %md 33 | -- MAGIC 34 | -- MAGIC ## DECLARE VARIABLE 35 | -- MAGIC 36 | -- MAGIC [DECLARE VARIABLE](https://docs.databricks.com/en/sql/language-manual/sql-ref-syntax-ddl-declare-variable.html) 37 | 38 | -- COMMAND ---------- 39 | 40 | DECLARE OR REPLACE VARIABLE limitA INT DEFAULT 5; 41 | 42 | -- COMMAND ---------- 43 | 44 | -- MAGIC %md 45 | -- MAGIC 46 | -- MAGIC ## SET VARIABLE 47 | -- MAGIC 48 | -- MAGIC [SET VARIABLE](https://docs.databricks.com/en/sql/language-manual/sql-ref-syntax-aux-set-variable.html) 49 | 50 | -- COMMAND ---------- 51 | 52 | SET VARIABLE limitA=10 53 | 54 | -- COMMAND ---------- 55 | 56 | -- MAGIC %md 57 | -- MAGIC 58 | -- MAGIC ## EXECUTE IMMEDIATE 59 | -- MAGIC 60 | -- MAGIC [EXECUTE IMMEDIATE](https://docs.databricks.com/en/sql/language-manual/sql-ref-syntax-aux-execute-immediate.html) 61 | 62 | -- COMMAND ---------- 63 | 64 | DECLARE OR REPLACE sqlStr = 'WITH a AS (SELECT "It works! 🔥" result) 65 | SELECT * 66 | FROM a 67 | LIMIT :limitA'; 68 | 69 | -- COMMAND ---------- 70 | 71 | DECLARE OR REPLACE limitA = 5; 72 | 73 | -- COMMAND ---------- 74 | 75 | EXECUTE IMMEDIATE sqlStr USING (limitA AS limitA); 76 | 77 | -- COMMAND ---------- 78 | 79 | -- MAGIC %md 80 | -- MAGIC 81 | -- MAGIC ## Learn More 82 | -- MAGIC 83 | -- MAGIC 1. 
[Parameterized queries with PySpark](https://www.databricks.com/blog/parameterized-queries-pyspark) 84 | -------------------------------------------------------------------------------- /Generative AI/Databricks Mosaic AI.sql: -------------------------------------------------------------------------------- 1 | -- Databricks notebook source 2 | -- MAGIC %md # Databricks Mosaic AI 3 | 4 | -- COMMAND ---------- 5 | 6 | -- MAGIC %md 7 | -- MAGIC 8 | -- MAGIC [Databricks Mosaic AI](https://www.databricks.com/product/machine-learning) 9 | -- MAGIC 10 | -- MAGIC > Build and deploy production-quality ML and GenAI applications 11 | 12 | -- COMMAND ---------- 13 | 14 | -- MAGIC %md 15 | -- MAGIC 16 | -- MAGIC Provides unified tooling to build, deploy and monitor AI and ML solutions — from building predictive models to the latest GenAI and large language models (LLMs) 17 | 18 | -- COMMAND ---------- 19 | 20 | -- MAGIC %md ## Databricks + MosaicML 21 | -- MAGIC 22 | -- MAGIC [Databricks + MosaicML](https://www.databricks.com/blog/databricks-mosaicml) 23 | -- MAGIC 24 | -- MAGIC * Databricks acquired MosaicML 25 | -- MAGIC * A leading platform for creating and customizing generative AI models for enterprises 26 | -- MAGIC * They keep using the term so I'm gonna repeat the full sentence: _"Democratize data and AI for every enterprise"_ 27 | -- MAGIC * to provide the best-in-class experience for training, customizing, and deploying generative AI applications 28 | -- MAGIC * The three most important developments required to move generative AI into the mainstream for enterprises 29 | -- MAGIC * models are widely available to every company 30 | -- MAGIC * reduce the price of training and customizing large language models 31 | -- MAGIC * training and serving costs 32 | -- MAGIC * Open LLMs: popular [MPT-7B](https://huggingface.co/mosaicml/mpt-7b) and [MPT-30B](https://huggingface.co/mosaicml/mpt-30b) base LLMs 33 | -- MAGIC * AI applications with reasoning abilities and language-based interfaces 34 | -- MAGIC * incorporate the large volumes of custom data: information about business processes, customers, accounts, orders, or other aspects of their business 35 | -- MAGIC * Data privacy and safety 36 | -- MAGIC * little tolerance for hallucinations or incorrect responses 37 | -- MAGIC * deploy safe, secure, and effective AI applications 38 | -- MAGIC * Unifying the AI and data stack 39 | -- MAGIC * model development life cycle 40 | -- MAGIC * Databricks to continue to put data at the center of the AI journey 41 | -- MAGIC * upstream data preparation like cleaning, featurization, and embedding of data for use in models 42 | -- MAGIC * the data and models must be jointly curated 43 | -- MAGIC * efficiently build large AI models on their own data and business processes 44 | -- MAGIC * MosaicML within the Lakehouse AI Platform 45 | -------------------------------------------------------------------------------- /meetups/Meetup_2025_01_30.sql: -------------------------------------------------------------------------------- 1 | -- Databricks notebook source 2 | -- MAGIC %md # Data Quality in Databricks Workflows with Pydantic 3 | -- MAGIC 4 | -- MAGIC ➡️ [Meetup Announcement](https://www.meetup.com/warsaw-data-engineering/events/305877678/) 5 | -- MAGIC 6 | -- MAGIC Agenda: 7 | -- MAGIC 8 | -- MAGIC 1. 5 minutes of warm-up for loose ideas about this and future meetups 9 | -- MAGIC * News (new versions, new features, etc.) 10 | -- MAGIC 1.
A 50-minute live coding session, in which: 11 | -- MAGIC * You will create a new project for a Python library with Pydantic (hello world, etc.) and the one and only uv for project management 12 | -- MAGIC * You will create a Databricks job with a notebook using our Python project above (everything by hand / clicking through the UI / fully manual) 13 | -- MAGIC * Automation with Databricks Asset Bundles (DAB) 14 | -- MAGIC 1. 5 minutes of Q&A / collecting ideas for future editions 15 | 16 | -- COMMAND ---------- 17 | 18 | -- MAGIC %md 19 | -- MAGIC 20 | -- MAGIC ## 🌟 Praise Quote 🌟 21 | -- MAGIC 22 | -- MAGIC > What caught your interest in the Warsaw Data Engineering Meetup that made you decide to join? 23 | -- MAGIC 24 | -- MAGIC > I love studying everything in detail. 25 | -- MAGIC > I'd like to learn more about Apache Spark. 26 | -- MAGIC > I read a lot of articles by Jacek Laskowski and have started reading books on Spark internals. 27 | 28 | -- COMMAND ---------- 29 | 30 | -- MAGIC %md # 📢 News 31 | 32 | -- COMMAND ---------- 33 | 34 | -- MAGIC %md 35 | -- MAGIC 36 | -- MAGIC ## New Versions 37 | -- MAGIC 38 | -- MAGIC * [uv 0.5.25](https://github.com/astral-sh/uv/releases/tag/0.5.25) 39 | -- MAGIC * [Databricks CLI 0.240.0](https://github.com/databricks/cli/releases/tag/v0.240.0) 40 | -- MAGIC * [awscli 2.23.9](https://github.com/aws/aws-cli/releases/tag/2.23.9) 41 | 42 | -- COMMAND ---------- 43 | 44 | -- MAGIC %md 45 | -- MAGIC 46 | -- MAGIC ## Databricks Notebook UI 47 | -- MAGIC 48 | -- MAGIC [Databricks notebook interface and controls](https://docs.databricks.com/en/notebooks/notebook-ui.html) 49 | -- MAGIC 50 | -- MAGIC **Cmd + Shift + P** for [Command palette](https://docs.databricks.com/en/notebooks/notebook-editor.html) with the following: 51 | -- MAGIC 52 | -- MAGIC 1. [Multicursor support](https://docs.databricks.com/en/notebooks/notebook-editor.html#multicursor-support) 🥳 53 | -- MAGIC 1. [Use web terminal and Databricks CLI](https://docs.databricks.com/en/notebooks/notebook-editor.html#use-web-terminal-and-databricks-cli) 🤔 54 | -- MAGIC 1.
Duplicating lines as in Visual Studio Code ❤️ 55 | 56 | -- COMMAND ---------- 57 | 58 | -- MAGIC %md 59 | -- MAGIC 60 | -- MAGIC # Live Coding Session 61 | -------------------------------------------------------------------------------- /Delta Live Tables/Agenda.sql: -------------------------------------------------------------------------------- 1 | -- Databricks notebook source 2 | -- MAGIC %md # Delta Live Tables » Agenda 3 | 4 | -- COMMAND ---------- 5 | 6 | -- MAGIC %md 7 | -- MAGIC 8 | -- MAGIC | # | Module | 9 | -- MAGIC | --- | --- | 10 | -- MAGIC | 0 | [Introduction]($./Delta Live Tables) | 11 | -- MAGIC | 1 | [Delta Live Tables SQL]($./Building Delta Live Tables pipelines with SQL) | 12 | -- MAGIC | 2 | 👍 [Delta Live Tables Python API]($./Delta Live Tables Python) | 13 | -- MAGIC | x | [Pipeline settings]($./Pipeline settings) | 14 | -- MAGIC | 2L | [DLT Lab]($./DLT Lab) | 15 | -- MAGIC | 3 | [Expectations]($./Expectations) | 16 | -- MAGIC | 4 | [Storage location]($./Storage location) | 17 | -- MAGIC | 5 | [Full Refresh]($./Full Refresh) | 18 | -- MAGIC | 6 | [Deep Dive into DLTs]($./Deep Dive into DLTs) | 19 | -- MAGIC | 7 | [CLI]($./Delta Live Tables CLI) | 20 | -- MAGIC | 8 | [Auto Loader and Streaming DLTs]($./Auto Loader and Streaming DLTs) | 21 | 22 | -- COMMAND ---------- 23 | 24 | -- MAGIC %md ## Open Topics / TODOs 25 | -- MAGIC 26 | -- MAGIC [Open Topics / TODOs]($./TODOs) 27 | 28 | -- COMMAND ---------- 29 | 30 | -- MAGIC %md 31 | -- MAGIC 32 | -- MAGIC ## Topics 33 | -- MAGIC 34 | -- MAGIC * How to work with files in Databricks 35 | -- MAGIC * `/FileStore` 36 | -- MAGIC * `dbfs` magic command 37 | -- MAGIC * Parameters in jobs vs DLT pipelines 38 | -- MAGIC * How to parameterize SQL queries (to define parameters at job level) 39 | -- MAGIC * https://docs.databricks.com/en/sql/user/queries/query-parameters.html 40 | 41 | -- COMMAND ---------- 42 | 43 | -- MAGIC %md ## Heads-Up 44 | -- MAGIC 45 | -- MAGIC 1. You can use Python in a SQL notebook for a DLT pipeline yet it won't be rendered in a dataflow (and vice versa) 46 | 47 | -- COMMAND ---------- 48 | 49 | -- MAGIC %md ## Exercise 50 | -- MAGIC 51 | -- MAGIC Based on https://jaceklaskowski.github.io/spark-workshop/exercises/spark-sql-exercise-Using-upper-Standard-Function.html 52 | -- MAGIC 53 | -- MAGIC Create a DLT pipeline that does the following: 54 | -- MAGIC 55 | -- MAGIC 1. FIXME Accepts a parameter - a CSV filename to load 56 | -- MAGIC 1. FIXME Accepts another parameter that is the column name with string values 57 | -- MAGIC 1. Executes `upper` standard function on this string column 58 | -- MAGIC 59 | -- MAGIC FIXMEs = how to pass parameters to a DLT pipeline 60 | -- MAGIC 61 | -- MAGIC In summary: 62 | -- MAGIC 63 | -- MAGIC The dataflow (pipeline) should be two tables 64 | 65 | -- COMMAND ---------- 66 | 67 | -- MAGIC %md ## Exercise (Databricks SQL) 68 | -- MAGIC 69 | -- MAGIC Create a job with a DLT pipeline (that's already created) and a (SQL) query 70 | -------------------------------------------------------------------------------- /Databricks Workflows/01 Conditional Workflows.py: -------------------------------------------------------------------------------- 1 | # Databricks notebook source 2 | # MAGIC %md 3 | # MAGIC 4 | # MAGIC # Conditional Workflows 5 | 6 | # COMMAND ---------- 7 | 8 | # MAGIC %md 9 | # MAGIC 10 | # MAGIC ## Conditional Tasks 11 | # MAGIC 12 | # MAGIC `if/else condition` task is used to run a part of a job DAG based on the results of a boolean expression.
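# MAGIC
# MAGIC A minimal sketch of what such a task could look like in a Jobs API-style payload (the task names, job parameter, and notebook path are hypothetical; the `condition_task` fields follow the Jobs API reference):
# MAGIC
# MAGIC ```python
# MAGIC # Two tasks: an if/else condition and a task that only runs on its "true" branch
# MAGIC tasks = [
# MAGIC     {
# MAGIC         "task_key": "check_env",
# MAGIC         "condition_task": {
# MAGIC             "op": "EQUAL_TO",
# MAGIC             "left": "{{job.parameters.env}}",
# MAGIC             "right": "prod",
# MAGIC         },
# MAGIC     },
# MAGIC     {
# MAGIC         "task_key": "load_prod_data",
# MAGIC         "depends_on": [{"task_key": "check_env", "outcome": "true"}],
# MAGIC         "notebook_task": {"notebook_path": "/Workspace/Shared/load_prod_data"},
# MAGIC     },
# MAGIC ]
# MAGIC ```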
13 | # MAGIC 14 | # MAGIC Adds branching logic to your job 15 | # MAGIC 16 | # MAGIC [Add branching logic to your job with the If/else condition task](https://docs.databricks.com/en/workflows/jobs/conditional-tasks.html#add-branching-logic-to-your-job-with-the-ifelse-condition-task): 17 | # MAGIC 18 | # MAGIC 1. Runs a part of a job DAG based on a boolean expression 19 | # MAGIC 1. The expression consists of a boolean operator and a pair of operands, where the operands might reference job or task state using [job and task parameter variables](https://docs.databricks.com/en/workflows/jobs/parameter-value-references.html) or use [task values](https://docs.databricks.com/en/workflows/jobs/share-task-context.html). 20 | # MAGIC 21 | 22 | # COMMAND ---------- 23 | 24 | # MAGIC %md 25 | # MAGIC 26 | # MAGIC ## Run if dependencies 27 | # MAGIC 28 | # MAGIC [Add the Run if condition of a task](https://docs.databricks.com/en/workflows/jobs/conditional-tasks.html#add-the-run-if-condition-of-a-task): 29 | # MAGIC 30 | # MAGIC 1. Adds conditions to a task 31 | # MAGIC 1. `Run if dependencies` drop-down menu in the task configuration 32 | # MAGIC 1. Condition is evaluated after completing all the task dependencies 33 | 34 | # COMMAND ---------- 35 | 36 | # MAGIC %md 37 | # MAGIC 38 | # MAGIC ## Conditional Execution 39 | # MAGIC 40 | # MAGIC 1. Tasks configured to handle failures or not meeting if/else condition are marked as Excluded. 41 | # MAGIC 1. Excluded tasks are skipped and are treated as successful. 42 | # MAGIC 1. If all task dependencies are excluded, the task is also excluded, regardless of its Run if condition. 43 | # MAGIC 1. If you cancel a task run, the cancellation propagates through downstream tasks, and tasks with a Run if condition that handles failure are run, for example, to verify a cleanup task runs when a task run is canceled. 44 | 45 | # COMMAND ---------- 46 | 47 | # MAGIC %md 48 | # MAGIC 49 | # MAGIC ## Job Run Status 50 | # MAGIC 51 | # MAGIC [How does Databricks Jobs determine job run status?](https://docs.databricks.com/en/workflows/jobs/conditional-tasks.html#how-does-databricks-jobs-determine-job-run-status) 52 | 53 | # COMMAND ---------- 54 | 55 | # MAGIC %md 56 | # MAGIC 57 | # MAGIC ## Learn More 58 | 59 | # COMMAND ---------- 60 | 61 | # MAGIC %md 62 | # MAGIC 63 | # MAGIC 1. 
[Run tasks conditionally in a Databricks job](https://docs.databricks.com/en/workflows/jobs/conditional-tasks.html) 64 | -------------------------------------------------------------------------------- /Apache Spark/Bucketing.sql: -------------------------------------------------------------------------------- 1 | -- Databricks notebook source 2 | -- MAGIC %md # Bucketing 3 | 4 | -- COMMAND ---------- 5 | 6 | -- MAGIC %md 7 | -- MAGIC 8 | -- MAGIC * Not supported by Delta Lake 9 | 10 | -- COMMAND ---------- 11 | 12 | DROP SCHEMA jaceklaskowski CASCADE; 13 | 14 | -- COMMAND ---------- 15 | 16 | -- SCHEMA == DATABASE 17 | CREATE SCHEMA jaceklaskowski; 18 | USE jaceklaskowski; 19 | 20 | -- COMMAND ---------- 21 | 22 | CREATE TABLE bucketed ( 23 | id BIGINT, 24 | name STRING, 25 | type STRING) 26 | USING parquet 27 | CLUSTERED BY (type) INTO 8 BUCKETS 28 | 29 | -- COMMAND ---------- 30 | 31 | -- MAGIC %scala 32 | -- MAGIC 33 | -- MAGIC import org.apache.spark.sql.SaveMode 34 | -- MAGIC spark.range(10e4.toLong).write.format("parquet").mode(SaveMode.Overwrite).saveAsTable("jaceklaskowski.t10e4") 35 | -- MAGIC spark.range(10e6.toLong).write.format("parquet").mode(SaveMode.Overwrite).saveAsTable("jaceklaskowski.t10e6") 36 | 37 | -- COMMAND ---------- 38 | 39 | SHOW TABLES 40 | 41 | -- COMMAND ---------- 42 | 43 | -- MAGIC %scala 44 | -- MAGIC 45 | -- MAGIC sc.setJobDescription("Setup") 46 | -- MAGIC 47 | -- MAGIC spark.conf.set("spark.sql.autoBroadcastJoinThreshold", -1) 48 | -- MAGIC spark.conf.set("spark.sql.adaptive.enabled", false) 49 | -- MAGIC 50 | -- MAGIC // https://docs.databricks.com/optimizations/disk-cache.html 51 | -- MAGIC spark.conf.set("spark.databricks.io.cache.enabled", false) 52 | 53 | -- COMMAND ---------- 54 | 55 | -- MAGIC %scala 56 | -- MAGIC 57 | -- MAGIC sc.setJobDescription("Non-Bucketed Join") 58 | -- MAGIC 59 | -- MAGIC val t4 = spark.table("t10e4") 60 | -- MAGIC val t6 = spark.table("t10e6") 61 | -- MAGIC 62 | -- MAGIC assert(t4.count == 10e4) 63 | -- MAGIC assert(t6.count == 10e6) 64 | -- MAGIC 65 | -- MAGIC // trigger execution of the join query 66 | -- MAGIC t4.join(t6, "id").foreach(_ => ()) 67 | 68 | -- COMMAND ---------- 69 | 70 | -- MAGIC %scala 71 | -- MAGIC 72 | -- MAGIC sc.setJobDescription("Create Bucketed Tables") 73 | -- MAGIC 74 | -- MAGIC import org.apache.spark.sql.SaveMode 75 | -- MAGIC spark.range(10e4.toLong) 76 | -- MAGIC .write 77 | -- MAGIC .format("parquet") 78 | -- MAGIC .bucketBy(4, "id") 79 | -- MAGIC .sortBy("id") 80 | -- MAGIC .mode(SaveMode.Overwrite) 81 | -- MAGIC .saveAsTable("bucketed_4_10e4") 82 | -- MAGIC 83 | -- MAGIC spark.range(10e6.toLong) 84 | -- MAGIC .write 85 | -- MAGIC .format("parquet") 86 | -- MAGIC .bucketBy(4, "id") 87 | -- MAGIC .sortBy("id") 88 | -- MAGIC .mode(SaveMode.Overwrite) 89 | -- MAGIC .saveAsTable("bucketed_4_10e6") 90 | 91 | -- COMMAND ---------- 92 | 93 | -- MAGIC %scala 94 | -- MAGIC 95 | -- MAGIC sc.setJobDescription("Bucketed Join") 96 | -- MAGIC 97 | -- MAGIC val bucketed_4_10e4 = spark.table("bucketed_4_10e4") 98 | -- MAGIC val bucketed_4_10e6 = spark.table("bucketed_4_10e6") 99 | -- MAGIC bucketed_4_10e4.join(bucketed_4_10e6, "id").foreach(_ => ()) 100 | -------------------------------------------------------------------------------- /Generative AI/llm-rag-chatbot/00-RAG-LLM-RAG-Introduction.py: -------------------------------------------------------------------------------- 1 | # Databricks notebook source 2 | # MAGIC %md 3 | # MAGIC # Deploy Your LLM Chatbots With the Data Intelligence Platform 4 | # 
MAGIC 5 | # MAGIC In this tutorial, you will learn how to build your own Chatbot Assistant to help your customers answer questions about Databricks, using Retrieval Augmented Generation (RAG), llama2-70B Foundation Model and Vector Search. 6 | # MAGIC 7 | # MAGIC 8 | # MAGIC 9 | 10 | # COMMAND ---------- 11 | 12 | # MAGIC %md-sandbox 13 | # MAGIC ## Quickstart: Getting started 14 | # MAGIC 15 | # MAGIC 16 | # MAGIC Start here if this is your first time implementing a GenAI application. 17 | # MAGIC 18 | # MAGIC You will learn: 19 | # MAGIC 20 | # MAGIC - How to prepare your document dataset, creating text chunks from documentation pages 21 | # MAGIC - Create your Vector Search index and send queries to find similar documents 22 | # MAGIC - Build your langchain model leveraging Databricks Foundation Model (Llama 2) 23 | # MAGIC - Deploy the chatbot model as Model Serving Endpoint 24 | 25 | # COMMAND ---------- 26 | 27 | # MAGIC %md 28 | # MAGIC Get started: open the [01-quickstart/00-RAG-chatbot-Introduction notebook]($./01-quickstart/00-RAG-chatbot-Introduction). 29 | 30 | # COMMAND ---------- 31 | 32 | # MAGIC %md 33 | # MAGIC ## Advanced: Going further 34 | # MAGIC 35 | # MAGIC Explore this content to discover how to leverage all the Databricks Data Intelligence Platform capabilities for your GenAI Apps. 36 | # MAGIC 37 | # MAGIC You will learn: 38 | # MAGIC 39 | # MAGIC - How to extract information from unstructured documents (pdfs) and create custom chunks 40 | # MAGIC - Leverage Databricks Embedding Foundation Model to compute the chunks embeddings 41 | # MAGIC - Create a Self Managed Vector Search index and send queries to find similar documents 42 | # MAGIC - Build an advanced langchain model leveraging Databricks Foundation Model (Llama 2) 43 | # MAGIC - Evaluate your chatbot model correctness with MLflow 44 | # MAGIC - Deploy your Model Serving Endpoint with Table Inferences to automatically log your model traffic 45 | # MAGIC - Run online llm evaluation and track your metrics with Databricks Monitoring 46 | 47 | # COMMAND ---------- 48 | 49 | # MAGIC %md 50 | # MAGIC Learn more advanced GenAI concepts: [open the 02-advanced/01-PDF-Advanced-Data-Preparation]($./02-advanced/01-PDF-Advanced-Data-Preparation). 51 | -------------------------------------------------------------------------------- /Apache Spark/Parquet Connector.scala: -------------------------------------------------------------------------------- 1 | // Databricks notebook source 2 | // MAGIC %md # Parquet Connector 3 | 4 | // COMMAND ---------- 5 | 6 | // MAGIC %md 7 | // MAGIC 8 | // MAGIC ## Requirements 9 | // MAGIC 10 | // MAGIC 1. A load-save query (loading a parquet dataset, `Dataset.map` over records and saving it out) 11 | // MAGIC 1. Let's call the `Dataset.map` operation `samanta` 12 | // MAGIC 1. A Scala case class as a record 13 | // MAGIC 1. 1 partition 14 | // MAGIC 1. 1GB per record 15 | 16 | // COMMAND ---------- 17 | 18 | // MAGIC %md 19 | // MAGIC 20 | // MAGIC ## Open questions and observations 21 | // MAGIC 22 | // MAGIC 1. Vectorized parquet decoding seems to make query processing faster 23 | // MAGIC 1.
`Dataset.map` vs `Dataset.mapPartitions` 24 | 25 | // COMMAND ---------- 26 | 27 | // MAGIC %md ## Experiment 28 | 29 | // COMMAND ---------- 30 | 31 | val input = "/Users/jacek@japila.pl/1g.parquet" 32 | val output = "/Users/jacek@japila.pl/1g.parquet_output" 33 | 34 | // COMMAND ---------- 35 | 36 | dbutils.fs.rm(dir = input, recurse = true) 37 | dbutils.fs.rm(dir = output, recurse = true) 38 | 39 | // COMMAND ---------- 40 | 41 | // MAGIC %md ### Prepare 1G parquet dataset 42 | 43 | // COMMAND ---------- 44 | 45 | // 46 | 47 | // Each number takes up 4 bytes 48 | // 1 billion numbers gives 4GB 49 | // We just need 1GB (hence division by 4) 50 | 51 | spark.range(1000*1000*1000 / 4).repartition(1).write.format("parquet").mode("overwrite").save(input) 52 | 53 | // COMMAND ---------- 54 | 55 | // MAGIC %fs ls /Users/jacek@japila.pl/1g.parquet/ 56 | 57 | // COMMAND ---------- 58 | 59 | // MAGIC %md ### Run the query 60 | 61 | // COMMAND ---------- 62 | 63 | sc.setJobDescription(s"mapPartition over parquet ($input)") 64 | 65 | // https://docs.databricks.com/optimizations/disk-cache.html 66 | spark.conf.set("spark.databricks.io.cache.enabled", "false") 67 | 68 | import spark.implicits._ 69 | case class MyRecord(id: Long, name: String) 70 | 71 | // FIXME Each task should get 1GB and ~1k such records 72 | 73 | // FIXME What exactly should samanta convert to? 74 | val samanta = (mr: MyRecord) => mr 75 | 76 | spark 77 | .read 78 | .parquet(input) 79 | .as[MyRecord] 80 | // .map(samanta) 81 | .write 82 | .format("parquet") 83 | .save(output) 84 | 85 | // COMMAND ---------- 86 | 87 | // Skip the rest 88 | dbutils.notebook.exit("skip the rest") 89 | 90 | // COMMAND ---------- 91 | 92 | sc.setJobDescription("mapPartition over parquet (data20K)") 93 | 94 | // https://docs.databricks.com/optimizations/disk-cache.html 95 | spark.conf.set("spark.databricks.io.cache.enabled", "false") 96 | 97 | val samanta_jeden_rekord = r => r 98 | val samanta = rs => Iterator.single(rs.length) // rs.map(samanta_jeden_rekord) 99 | 100 | // FIXME 101 | // 1 task / partition 102 | // 1 executor only, the smallest one 103 | // a record weighs 1GB (case class = row) 104 | // investigate row groups 105 | // mapPartitions vs map 106 | val r = spark.read.schema("rating DOUBLE,review STRING").parquet("/databricks-datasets/amazon/data20K").mapPartitions(samanta) 107 | display(r) 108 | 109 | // COMMAND ---------- 110 | 111 | // MAGIC %md 112 | // MAGIC 113 | // MAGIC ## Spark QA 114 | // MAGIC 115 | // MAGIC 1. Introduction to JVM (Łukasz) 116 | -------------------------------------------------------------------------------- /Delta Lake/TRUNCATE TABLE in Delta Lake.sql: -------------------------------------------------------------------------------- 1 | -- Databricks notebook source 2 | -- MAGIC %md # TRUNCATE TABLE in Delta Lake 3 | -- MAGIC 4 | -- MAGIC The not-so-obvious ways 5 | 6 | -- COMMAND ---------- 7 | 8 | -- MAGIC %scala 9 | -- MAGIC 10 | -- MAGIC // `truncateTable` = `TRUNCATE TABLE` was running perfectly fine with Hive 11 | -- MAGIC // 1k tests that did `TRUNCATE TABLE` and `INSERT INTO` 3-10-20 records, did some testing, over and over again.
12 | -- MAGIC 13 | -- MAGIC // With Hive, it took 30 mins 14 | -- MAGIC 15 | -- MAGIC // Switching from `format("hive")` to `format("delta")` 16 | -- MAGIC 17 | -- MAGIC // With Delta Lake, it took 5 hours 18 | -- MAGIC 19 | -- MAGIC // TRUNCATE TABLE is NOT supported by Delta Lake (open-source version / outside Databricks) 20 | -- MAGIC // because...all the tests were executed OUTSIDE Databricks 21 | -- MAGIC 22 | -- MAGIC // Why do you think the time could even increase?! 23 | -- MAGIC // 1. Metadata! What happens when you `DELETE FROM` / `TRUNCATE` => a new version is created (= a disk op) 24 | -- MAGIC 25 | -- MAGIC protected def truncateTable(databaseName: String, tableName: String): Unit = { 26 | -- MAGIC val fullTableName = s"$databaseName.$tableName" 27 | -- MAGIC 28 | -- MAGIC val beforeNs = System.nanoTime() 29 | -- MAGIC 30 | -- MAGIC // Approach #0 31 | -- MAGIC // val approach = "DELETE FROM" 32 | -- MAGIC // spark.sql(s"DELETE FROM $databaseName.$tableName") 33 | -- MAGIC 34 | -- MAGIC // Approach #1 35 | -- MAGIC // https://stackoverflow.com/a/67519402/1305344 36 | -- MAGIC val approach = "limit(0)" 37 | -- MAGIC spark.table(fullTableName).limit(0).write.mode("overwrite").format("delta").saveAsTable(fullTableName) 38 | -- MAGIC 39 | -- MAGIC // Approach #2 40 | -- MAGIC // 10x slower than DELETE FROM 41 | -- MAGIC // https://docs.delta.io/latest/delta-utility.html#remove-files-no-longer-referenced-by-a-delta-table 42 | -- MAGIC // VACUUM RETAIN 0 HOURS 43 | -- MAGIC // .config("spark.databricks.delta.retentionDurationCheck.enabled", "false") 44 | -- MAGIC // .config("spark.databricks.delta.vacuum.parallelDelete.enabled", "true") 45 | -- MAGIC // val approach = "VACUUM RETAIN 0 HOURS" 46 | -- MAGIC // spark.sql(s"VACUUM $fullTableName RETAIN 0 HOURS") 47 | -- MAGIC 48 | -- MAGIC // Approach #3 49 | -- MAGIC // https://spark.apache.org/docs/latest/sql-ref-syntax-ddl-truncate-table.html 50 | -- MAGIC // Not supported in Delta Lake OSS 51 | -- MAGIC // val approach = "TRUNCATE TABLE" 52 | -- MAGIC // spark.sql(s"TRUNCATE TABLE $databaseName.$tableName") 53 | -- MAGIC 54 | -- MAGIC // Approach #4 55 | -- MAGIC // val approach = "DeltaTable API" 56 | -- MAGIC // import io.delta.tables.DeltaTable 57 | -- MAGIC // DeltaTable.forName(fullTableName).delete() 58 | -- MAGIC 59 | -- MAGIC val tookSecs = (System.nanoTime() - beforeNs) / 1e+9 60 | -- MAGIC println(s">>> truncateTable($fullTableName) took ${tookSecs}s (using $approach)") 61 | -- MAGIC } 62 | 63 | -- COMMAND ---------- 64 | 65 | DROP TABLE IF EXISTS jacek_demo 66 | 67 | -- COMMAND ---------- 68 | 69 | CREATE TABLE jacek_demo 70 | AS SELECT 1 71 | 72 | -- COMMAND ---------- 73 | 74 | DESCRIBE HISTORY jacek_demo 75 | 76 | -- COMMAND ---------- 77 | 78 | SELECT * FROM jacek_demo 79 | 80 | -- COMMAND ---------- 81 | 82 | TRUNCATE TABLE jacek_demo 83 | 84 | -- COMMAND ---------- 85 | 86 | DESCRIBE HISTORY jacek_demo 87 | 88 | -- COMMAND ---------- 89 | 90 | SELECT * FROM jacek_demo 91 | -------------------------------------------------------------------------------- /Generative AI/Model Serving.py: -------------------------------------------------------------------------------- 1 | # Databricks notebook source 2 | # MAGIC %md 3 | # MAGIC # Model Serving 4 | 5 | # COMMAND ---------- 6 | 7 | # MAGIC %md 8 | # MAGIC 9 | # MAGIC ## Mosaic AI Model Serving 10 | # MAGIC 11 | # MAGIC [Model serving with Databricks](https://docs.databricks.com/en/machine-learning/model-serving/index.html): 12 | # MAGIC 13 | # MAGIC * Mosaic AI Model 
Serving provides a unified interface to deploy, govern, and query AI models 14 | # MAGIC * Each model served is available using REST API 15 | # MAGIC * Offers a unified REST API and MLflow Deployment API for CRUD and querying tasks 16 | # MAGIC * Provides a highly available and low-latency service for deploying models 17 | # MAGIC * A single UI to manage all your models and their respective serving endpoints 18 | # MAGIC * Ability to extend pre-trained models (e.g., Llama 3.1) with proprietary data to improve quality 19 | # MAGIC * Specialize them for specific business contexts and skills to build higher quality models 20 | # MAGIC * Automatically scales up or down to meet demand changes 21 | # MAGIC * Uses [serverless compute](https://docs.databricks.com/en/getting-started/overview.html#serverless) 22 | # MAGIC * [Pricing](https://www.databricks.com/product/pricing/model-serving) 23 | 24 | # COMMAND ---------- 25 | 26 | # MAGIC %md 27 | # MAGIC 28 | # MAGIC Model serving supports the following language models: 29 | # MAGIC 30 | # MAGIC 1. [Custom models](https://docs.databricks.com/en/machine-learning/model-serving/custom-models.html) 31 | # MAGIC * Python models packaged in the MLflow format 32 | # MAGIC * Registered either in Unity Catalog or in the workspace model registry 33 | # MAGIC * Examples: scikit-learn, XGBoost, PyTorch, Hugging Face transformer models, [agent serving](https://docs.databricks.com/en/generative-ai/deploy-agent.html) 34 | # MAGIC 1. State-of-the-art open models using [Foundation Model APIs]($./Foundation Models) 35 | # MAGIC * [Llama]($./Llama) 36 | # MAGIC * Curated foundation model architectures that support optimized inference 37 | # MAGIC * Base models (e.g., Llama-2-70B-chat, BGE-Large, and Mistral-7B) are available for immediate use with pay-per-token pricing 38 | # MAGIC * Workloads that require performance guarantees and fine-tuned model variants can be deployed with provisioned throughput 39 | # MAGIC 1. [External models](https://docs.databricks.com/en/generative-ai/external-models/index.html) 40 | # MAGIC * Generative AI models hosted outside of Databricks 41 | # MAGIC * Endpoints that serve external models can be centrally governed and customers can establish rate limits and access control for them 42 | # MAGIC * Examples: OpenAI’s GPT-4, Anthropic’s Claude 43 | 44 | # COMMAND ---------- 45 | 46 | # MAGIC %md 47 | # MAGIC 48 | # MAGIC ## SQL Access 49 | # MAGIC 50 | # MAGIC Models are available from SQL using [AI functions](https://docs.databricks.com/en/large-language-models/ai-functions.html) for easy integration into analytics workflows. 51 | 52 | # COMMAND ---------- 53 | 54 | # MAGIC %md 55 | # MAGIC 56 | # MAGIC ## Tutorials 57 | # MAGIC 58 | # MAGIC 1. [Tutorial: Deploy and query a custom model](https://docs.databricks.com/en/machine-learning/model-serving/model-serving-intro.html) on how to serve custom models on Databricks 59 | # MAGIC 1. 
[Get started querying LLMs on Databricks](https://docs.databricks.com/en/large-language-models/llm-serving-intro.html) on how to query a foundation model on Databricks 60 | -------------------------------------------------------------------------------- /Apache Spark/ANTI and SEMI joins in SQL and DataFrame API.py: -------------------------------------------------------------------------------- 1 | # Databricks notebook source 2 | # MAGIC %md 3 | # MAGIC 4 | # MAGIC # ANTI and SEMI joins in SQL and DataFrame API 5 | 6 | # COMMAND ---------- 7 | 8 | # MAGIC %md 9 | # MAGIC 10 | # MAGIC ## Discovery of the Day 11 | # MAGIC 12 | # MAGIC There's no `DataFrame.createOrReplaceView` in PySpark and Scala APIs 😬 13 | # MAGIC 14 | # MAGIC `CREATE VIEW` only. 15 | 16 | # COMMAND ---------- 17 | 18 | # MAGIC %md 19 | # MAGIC 20 | # MAGIC ## SQL Join types explained with 1 picture 21 | # MAGIC 22 | # MAGIC [SQL Join types explained with 1 picture](https://www.securesolutions.no/sql-join-types-explained-with-1-picture/) 23 | # MAGIC 24 | # MAGIC ![Joins](https://www.securesolutions.no/wp-content/uploads/2014/07/joins-1.jpg) 25 | 26 | # COMMAND ---------- 27 | 28 | # MAGIC %md 29 | # MAGIC 30 | # MAGIC ## SQL Joins 31 | # MAGIC 32 | # MAGIC [SQL Joins](https://www.w3schools.com/sql/sql_join.asp) 33 | 34 | # COMMAND ---------- 35 | 36 | # MAGIC %md 37 | # MAGIC 38 | # MAGIC ## JOINs in Spark SQL 39 | # MAGIC 40 | # MAGIC [JOIN](https://spark.apache.org/docs/latest/sql-ref-syntax-qry-select-join.html) 41 | 42 | # COMMAND ---------- 43 | 44 | left = spark.range(3) 45 | 46 | # COMMAND ---------- 47 | 48 | # MAGIC %md ## Left anti join 49 | # MAGIC 50 | # MAGIC ![Left anti join](https://learn.microsoft.com/en-us/power-query/media/merge-queries-left-anti/left-anti-join-operation.png) 51 | 52 | # COMMAND ---------- 53 | 54 | # MAGIC %md ## Left Semi JOIN 55 | # MAGIC 56 | # MAGIC * [Difference Between Anti-Join and Semi-Join](https://www.geeksforgeeks.org/difference-between-anti-join-and-semi-join/) 57 | # MAGIC * [Difference between INNER JOIN and LEFT SEMI JOIN](https://stackoverflow.com/q/21738784/1305344) 58 | 59 | # COMMAND ---------- 60 | 61 | # MAGIC %md 62 | # MAGIC 63 | # MAGIC ## DataFrame API 64 | # MAGIC 65 | # MAGIC * [DataFrame.except](https://spark.apache.org/docs/latest/api/python/reference/pyspark.sql/api/pyspark.sql.DataFrame.except.html) 66 | # MAGIC * [DataFrame.subtract](https://spark.apache.org/docs/latest/api/python/reference/pyspark.sql/api/pyspark.sql.DataFrame.subtract.html) 67 | # MAGIC * [DataFrame.intersect](https://spark.apache.org/docs/latest/api/python/reference/pyspark.sql/api/pyspark.sql.DataFrame.intersect.html) 68 | 69 | # COMMAND ---------- 70 | 71 | left = spark.range(5) 72 | dups_left = left.union(left) 73 | two_threes = spark.createDataFrame([3,3], 'int').withColumnRenamed('value', 'id') 74 | dups_left_with_threes = dups_left.union(two_threes) 75 | right = spark.range(3, 8, 1) 76 | 77 | # COMMAND ---------- 78 | 79 | two_threes.display() 80 | 81 | # COMMAND ---------- 82 | 83 | left.join(right, 'id', 'L_E_f_t_aN_tI').display() 84 | 85 | # COMMAND ---------- 86 | 87 | left.exceptAll(right).display() 88 | 89 | # COMMAND ---------- 90 | 91 | # MAGIC %md 92 | # MAGIC 93 | # MAGIC programmatic vs maths approach 94 | 95 | # COMMAND ---------- 96 | 97 | two_threes.join(right, 'id', 'left_anti').display() 98 | 99 | # COMMAND ---------- 100 | 101 | two_threes.exceptAll(right).display() 102 | 103 | # COMMAND ---------- 104 | 105 | two_threes.subtract(right).display() 106 | 107 | # 
COMMAND ---------- 108 | 109 | # MAGIC %md 110 | # MAGIC 111 | # MAGIC ## Mind Query Plans 112 | # MAGIC 113 | # MAGIC Explore query plans with larger datasets (ideally Delta tables) before claiming that one approach is better than the others 😜 114 | # MAGIC 115 | # MAGIC Danke schön, Paul, for bringing it to my attention! 🥳 116 | -------------------------------------------------------------------------------- /Databricks Workflows/Step 1. Load Raw Data.sql: -------------------------------------------------------------------------------- 1 | -- Databricks notebook source 2 | -- MAGIC %md # Load Raw Data 3 | -- MAGIC 4 | -- MAGIC This notebook uses input parameters (using [Databricks widgets](https://docs.databricks.com/notebooks/widgets.html)). 5 | -- MAGIC 6 | -- MAGIC Name | Default Value | Label 7 | -- MAGIC -----|---------------|------- 8 | -- MAGIC `table_name` | workflows_raw_data | Table Name (Raw Data) 9 | -- MAGIC `database_name` | jaceklaskowski | Database Name 10 | 11 | -- COMMAND ---------- 12 | 13 | -- MAGIC %md 14 | -- MAGIC 15 | -- MAGIC **NOTE**: 16 | -- MAGIC 17 | -- MAGIC > Do not use the `Run all` button to run all the cells. 18 | 19 | -- COMMAND ---------- 20 | 21 | -- MAGIC %md 22 | -- MAGIC 23 | -- MAGIC [Databricks widgets](https://docs.databricks.com/notebooks/widgets.html) describes how to access widget values in Spark SQL. 24 | -- MAGIC 25 | -- MAGIC 26 | -- MAGIC Unfortunately, [notebooks in jobs cannot use widgets](https://docs.databricks.com/notebooks/widgets.html#using-widget-values-in-spark-sql): 27 | -- MAGIC 28 | -- MAGIC > In general, you cannot use widgets (...) if you use Run All or run the notebook as a job. 29 | -- MAGIC 30 | -- MAGIC There are a couple of issues to keep in mind, esp. while doing a demo: 31 | -- MAGIC 32 | -- MAGIC 1. In general, you cannot use widgets to pass arguments between different languages within a notebook 33 | -- MAGIC 1. You can create a widget `arg1` in a Python cell and use it in a SQL or Scala cell only if you run one cell at a time. 34 | -- MAGIC 1. Using widget values between different languages does not work if you use **Run All** or run the notebook as a job 35 | 36 | -- COMMAND ---------- 37 | 38 | -- MAGIC %python 39 | -- MAGIC 40 | -- MAGIC dbutils.jobs.taskValues.help() 41 | 42 | -- COMMAND ---------- 43 | 44 | -- MAGIC %python 45 | -- MAGIC 46 | -- MAGIC dbutils.jobs.taskValues.help("get") 47 | 48 | -- COMMAND ---------- 49 | 50 | -- MAGIC %python 51 | -- MAGIC 52 | -- MAGIC # Creates a text input widget with a given name and default value.
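-- MAGIC # The values set below are read back later in this notebook:
-- MAGIC # SQL cells reference them as ${database_name} and ${raw_table_name}, and Python cells use dbutils.widgets.getArgument(...).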
53 | -- MAGIC # Notebook Widgets are only for Run all (when executed outside a job) 54 | -- MAGIC dbutils.widgets.removeAll() 55 | -- MAGIC dbutils.widgets.text(name = "database_name", defaultValue = "jaceklaskowski", label = "Database Name") 56 | -- MAGIC dbutils.widgets.text(name = "raw_table_name", defaultValue = "workflows_raw_data", label = "Raw Table Name") 57 | -- MAGIC dbutils.widgets.text(name = "silver_table_name", defaultValue = "workflows_transform", label = "Silver Table Name") 58 | -- MAGIC dbutils.widgets.text(name = "gold_table_name", defaultValue = "workflows_aggregates", label = "Gold Table Name") 59 | 60 | -- COMMAND ---------- 61 | 62 | -- The following does not seem to work 63 | -- REMOVE WIDGET table_name; 64 | -- CREATE WIDGET TEXT table_name DEFAULT "workflows_raw_data"; 65 | 66 | -- COMMAND ---------- 67 | 68 | CREATE DATABASE IF NOT EXISTS ${database_name}; 69 | USE ${database_name} 70 | 71 | -- COMMAND ---------- 72 | 73 | SHOW TABLES 74 | 75 | -- COMMAND ---------- 76 | 77 | -- MAGIC %md ## Create Raw Table 78 | -- MAGIC 79 | -- MAGIC Learn more in [CREATE VIEW](https://docs.databricks.com/sql/language-manual/sql-ref-syntax-ddl-create-view.html) 80 | 81 | -- COMMAND ---------- 82 | 83 | -- MAGIC %python 84 | -- MAGIC 85 | -- MAGIC dbutils.widgets.getArgument("raw_table_name") 86 | 87 | -- COMMAND ---------- 88 | 89 | CREATE OR REPLACE VIEW ${raw_table_name} 90 | (id COMMENT 'Unique identification number', name) 91 | COMMENT 'Bronze layer' 92 | AS 93 | SELECT id, name 94 | FROM VALUES (0, "zero"), (1, "one") t(id, name) 95 | 96 | -- COMMAND ---------- 97 | 98 | SELECT * FROM ${raw_table_name} 99 | -------------------------------------------------------------------------------- /Databricks Workflows/02 Modular Orchestration with Run Job Task.py: -------------------------------------------------------------------------------- 1 | # Databricks notebook source 2 | # MAGIC %md # Modular Orchestration with Run Job Task 3 | 4 | # COMMAND ---------- 5 | 6 | # MAGIC %md 7 | # MAGIC 8 | # MAGIC 1. Breaking down large complex workflows (DAGs) into logical chunks or smaller "child" jobs that are defined and managed separately 9 | # MAGIC 1. parent and child jobs 10 | # MAGIC 1. split a DAG up by organizational boundaries 11 | # MAGIC * allowing different teams in an organization to work together on different parts of a workflow 12 | # MAGIC * ownership of parts of the workflow can be better managed, with different teams potentially using different code repositories for the jobs they own 13 | # MAGIC * testing and updates covered by child job ownership 14 | # MAGIC 1. reusability 15 | # MAGIC * define common shared steps in a job once and then reuse that as a child job in different parent workflows 16 | # MAGIC * With parameters, reused tasks can be made more flexible to fit the needs of different parent workflows 17 | # MAGIC 1. Creates a modular workflow 18 | 19 | # COMMAND ---------- 20 | 21 | # MAGIC %md ## Run Job 22 | # MAGIC 23 | # MAGIC 1. A new task type **Run Job** 24 | # MAGIC * Requires a job to trigger 25 | # MAGIC 1. Calls a job to be run by the task 26 | # MAGIC 1. Jobs triggered by a Run Job task use their own cluster configuration 27 | # MAGIC * [Trigger a new job run](https://docs.databricks.com/api/workspace/jobs/runnow) 28 | 29 | # COMMAND ---------- 30 | 31 | # MAGIC %md ## Parameters 32 | # MAGIC 33 | # MAGIC 1. Enter the key and value of each job parameter to pass to a Run Job 34 | # MAGIC 1. 
[Pass context about job runs into job tasks](https://docs.databricks.com/en/workflows/jobs/parameter-value-references.html) 35 | # MAGIC * Click **Browse dynamic values** for a list of available dynamic value references 36 | # MAGIC 1. If job parameters are configured on the job a task belongs to, those parameters are displayed when you add task parameters. 37 | # MAGIC * If job and task parameters share a key, the job parameter takes precedence. 38 | # MAGIC * A warning is shown in the UI if you attempt to add a task parameter with the same key as a job parameter. 39 | # MAGIC * [Add parameters for all job tasks](https://docs.databricks.com/en/workflows/jobs/settings.html#add-parameters-for-all-job-tasks) 40 | 41 | # COMMAND ---------- 42 | 43 | # MAGIC %md ## Task Queueing 44 | # MAGIC 45 | # MAGIC 1. A workspace is limited to 1000 concurrent task runs. A 429 Too Many Requests response is returned when you request a run that cannot start immediately. 46 | # MAGIC 1. The number of jobs a workspace can create in an hour is limited to 10000 (includes “runs submit”). This limit also affects jobs created by the REST API and notebook workflows. 47 | 48 | # COMMAND ---------- 49 | 50 | # MAGIC %md ## Gotchas 51 | # MAGIC 52 | # MAGIC 1. You should not create jobs with circular dependencies or jobs that nest more than three Run Job tasks. 53 | # MAGIC 1. Circular dependencies are Run Job tasks that directly or indirectly trigger each other. 54 | # MAGIC 1. A run is queued when the maximum concurrent Run Job task runs in the workspace is reached (see [What if my job cannot run because of concurrency limits?](https://docs.databricks.com/en/workflows/jobs/create-run-jobs.html#what-if-my-job-cannot-run-because-of-concurrency-limits)) 55 | 56 | # COMMAND ---------- 57 | 58 | # MAGIC %md ## Learn More 59 | # MAGIC 60 | # MAGIC 1. [Modular Orchestration with Databricks Workflows](https://www.databricks.com/blog/modular-orchestration-databricks-workflows) 61 | # MAGIC 1. [Task type options](https://docs.databricks.com/en/workflows/jobs/create-run-jobs.html#task-type-options) 62 | -------------------------------------------------------------------------------- /Generative AI/00 Generative AI.sql: -------------------------------------------------------------------------------- 1 | -- Databricks notebook source 2 | -- MAGIC %md # Generative AI and Large Language Models 3 | -- MAGIC 4 | -- MAGIC Gain more ground in **Generative AI** (#GenAI) and **Large Language Models** (#LLM), topics I know next to nothing about, to become (pick as many as you please; I found these while reading some articles about the subject 😎): 5 | -- MAGIC 6 | -- MAGIC * A passionate GenAI and LLM enthusiast 7 | -- MAGIC * A seasoned GenAI and LLM professional 8 | 9 | -- COMMAND ---------- 10 | 11 | -- MAGIC %md 12 | -- MAGIC 13 | -- MAGIC There are four [Generative AI Architecture Patterns](https://www.databricks.com/product/machine-learning/build-generative-ai) to consider when building a large language model–based solution: 14 | -- MAGIC 15 | -- MAGIC 1. [Prompt Engineering]($./Prompt Engineering) 16 | -- MAGIC 1. [Retrieval Augmented Generation (RAG)]($./Retrieval Augmented Generation) 17 | -- MAGIC 1. Fine-tuning 18 | -- MAGIC 1.
Pretraining 19 | 20 | -- COMMAND ---------- 21 | 22 | -- MAGIC %md 23 | -- MAGIC 24 | -- MAGIC ## Mosaic AI Agent Framework and Agent Evaluation 25 | -- MAGIC 26 | -- MAGIC RAG applications and Agents are the most popular GenAI applications on Databricks (built using [Mosaic AI Agent Framework and Agent Evaluation](https://www.databricks.com/blog/announcing-mosaic-ai-agent-framework-and-agent-evaluation)) 27 | 28 | -- COMMAND ---------- 29 | 30 | -- MAGIC %md ## Use Cases 31 | -- MAGIC 32 | -- MAGIC * Doc Q&A 33 | -- MAGIC * Chatbots 34 | 35 | -- COMMAND ---------- 36 | 37 | -- MAGIC %md ## Other Branches of Machine Learning 38 | -- MAGIC 39 | -- MAGIC * Predictive and Prescriptive Analytics 40 | -- MAGIC * Computer Vision 41 | -- MAGIC * Natural Language Processing 42 | -- MAGIC * Deployment / ML Ops / Cloud 43 | -- MAGIC * Reinforcement Learning 44 | -- MAGIC 45 | 46 | -- COMMAND ---------- 47 | 48 | -- MAGIC %md ## Large Language Models (LLMs) 49 | -- MAGIC 50 | -- MAGIC * ChatGPT 51 | -- MAGIC * [BloombergGPT](https://www.bloomberg.com/company/press/bloomberggpt-50-billion-parameter-llm-tuned-finance/) - Bloomberg’s 50-billion parameter large language model, purpose-built from scratch for finance 52 | -- MAGIC * [M0deler](https://m0deler.com/) 53 | -- MAGIC 54 | 55 | -- COMMAND ---------- 56 | 57 | -- MAGIC %md 58 | -- MAGIC 59 | -- MAGIC Generative AI and Large Language Models (LLMs): 60 | -- MAGIC 61 | -- MAGIC * [OpenAI](https://openai.com/) 62 | -- MAGIC * Local and on-premise models 63 | -- MAGIC * the Rise of Generative AI due to ChatGPT 64 | -- MAGIC * general-purpose chat bots 65 | -- MAGIC * Building products that use LLMs and GenAI 66 | -- MAGIC * Developing apps with GenAI and LLMs 67 | 68 | -- COMMAND ---------- 69 | 70 | -- MAGIC %md ## Prompt Engineering 71 | 72 | -- COMMAND ---------- 73 | 74 | -- MAGIC %md 75 | -- MAGIC 76 | -- MAGIC * Writing ChatGPT prompts 77 | -- MAGIC * the no-code method for writing ChatGPT prompts 78 | 79 | -- COMMAND ---------- 80 | 81 | -- MAGIC %md 82 | -- MAGIC 83 | -- MAGIC ## Databricks Generative AI Fundamentals Learning Plan 84 | -- MAGIC 85 | -- MAGIC [Generative AI Fundamentals](https://www.databricks.com/resources/learn/training/generative-ai-fundamentals) 86 | 87 | -- COMMAND ---------- 88 | 89 | -- MAGIC %md ## Recommended Reading by ChatGPT 3.5 90 | -- MAGIC 91 | -- MAGIC There are the academic papers recommended by [ChatGPT 3.5](https://chat.openai.com/share/3dfac550-eeb6-4740-b68e-52140632edc0) that were instrumental in advancing large language models: 92 | -- MAGIC 93 | -- MAGIC ### Attention Is All You Need 94 | -- MAGIC 95 | -- MAGIC Started my journey into academic paper reading and LLMs from [Attention Is All You Need](https://arxiv.org/abs/1706.03762) by Vaswani et al. (2017) as _"This paper introduced the Transformer architecture, which revolutionized the field of NLP and laid the foundation for large language models like GPT and BERT."_ 96 | -------------------------------------------------------------------------------- /Delta Live Tables/Expectations.sql: -------------------------------------------------------------------------------- 1 | -- Databricks notebook source 2 | -- MAGIC %md # Delta Live Tables Expectations 3 | 4 | -- COMMAND ---------- 5 | 6 | -- MAGIC %md 7 | -- MAGIC 8 | -- MAGIC ## Introduction 9 | -- MAGIC 10 | -- MAGIC [Manage data quality with Delta Live Tables](https://docs.databricks.com/delta-live-tables/expectations.html): 11 | -- MAGIC 12 | -- MAGIC 1. 
**Expectations** define data quality constraints (_requirements_, _assertions_) 13 | -- MAGIC 1. Optional 14 | -- MAGIC 1. Data quality checks on each record passing through a query (before they land in a delta table) 15 | -- MAGIC ``` 16 | -- MAGIC expectation: record => Boolean 17 | -- MAGIC ``` 18 | -- MAGIC 1. Provide insights into data quality for each pipeline update 19 | -- MAGIC 1. Applied to queries using Python decorators or SQL `CONSTRAINT` clauses 20 | 21 | -- COMMAND ---------- 22 | 23 | -- MAGIC %md ## CREATE OR REFRESH Statement 24 | -- MAGIC 25 | -- MAGIC [Delta Live Tables SQL language reference](https://docs.databricks.com/delta-live-tables/sql-ref.html) 26 | -- MAGIC 27 | -- MAGIC ```sql 28 | -- MAGIC CREATE OR REFRESH [TEMPORARY] { STREAMING TABLE | LIVE TABLE } table_name 29 | -- MAGIC [( 30 | -- MAGIC [ 31 | -- MAGIC col_name1 col_type1 [ GENERATED ALWAYS AS generation_expression1 ] [ COMMENT col_comment1 ], 32 | -- MAGIC col_name2 col_type2 [ GENERATED ALWAYS AS generation_expression2 ] [ COMMENT col_comment2 ], 33 | -- MAGIC ... 34 | -- MAGIC ] 35 | -- MAGIC [ 36 | -- MAGIC CONSTRAINT expectation_name_1 EXPECT (expectation_expr1) [ON VIOLATION { FAIL UPDATE | DROP ROW }], 37 | -- MAGIC CONSTRAINT expectation_name_2 EXPECT (expectation_expr2) [ON VIOLATION { FAIL UPDATE | DROP ROW }], 38 | -- MAGIC ... 39 | -- MAGIC ] 40 | -- MAGIC )] 41 | -- MAGIC [USING DELTA] 42 | -- MAGIC [PARTITIONED BY (col_name1, col_name2, ... )] 43 | -- MAGIC [LOCATION path] 44 | -- MAGIC [COMMENT table_comment] 45 | -- MAGIC [TBLPROPERTIES (key1 [ = ] val1, key2 [ = ] val2, ... )] 46 | -- MAGIC AS select_statement 47 | -- MAGIC ``` 48 | 49 | -- COMMAND ---------- 50 | 51 | -- MAGIC %md ## CONSTRAINT Clause 52 | -- MAGIC 53 | -- MAGIC ```sql 54 | -- MAGIC CONSTRAINT expectation_name EXPECT (expectation_expr) [ON VIOLATION { FAIL UPDATE | DROP ROW }] 55 | -- MAGIC ``` 56 | -- MAGIC 57 | -- MAGIC An expectation (`CONSTRAINT`) consists of three properties: 58 | -- MAGIC 59 | -- MAGIC Property | SQL |Meaning 60 | -- MAGIC ---------|-----|------- 61 | -- MAGIC Identifier | `expectation_name` | a unique identifier and allows you to track metrics for the constraint 62 | -- MAGIC Condition | `expectation_expr` | A boolean expression 63 | -- MAGIC Action | `ON VIOLATION` | (optional) What to do when a record fails the expectation (the condition is `false`) 64 | 65 | -- COMMAND ---------- 66 | 67 | -- MAGIC %md 68 | -- MAGIC 69 | -- MAGIC ```sql 70 | -- MAGIC CONSTRAINT expectation_name -- Name / Identifier 71 | -- MAGIC EXPECT (expectation_expr) -- Data Quality Assertion 72 | -- MAGIC [ON VIOLATION { DROP ROW | FAIL UPDATE }] -- Action 73 | -- MAGIC ``` 74 | -- MAGIC 75 | -- MAGIC [SQL properties](https://docs.databricks.com/delta-live-tables/sql-ref.html#sql-properties-1) 76 | -- MAGIC 77 | -- MAGIC Action | Result 78 | -- MAGIC -------|------- 79 | -- MAGIC No `ON VIOLATION` (_warn_) | **(default)** Invalid records are written to the target table; failure is reported as a metric for the dataset. 
(_accept violation_) 80 | -- MAGIC `ON VIOLATION DROP ROW` | Invalid records are dropped (not written to the target table) and a pipeline continues processing; failure is reported as a metrics for the dataset 81 | -- MAGIC `ON VIOLATION FAIL UPDATE` | An invalid record immediately stops pipeline execution; Manual intervention is required before re-processing 82 | 83 | -- COMMAND ---------- 84 | 85 | -- MAGIC %md 86 | -- MAGIC 87 | -- MAGIC [What are Delta Live Tables expectations?](https://docs.databricks.com/delta-live-tables/expectations.html#what-are-delta-live-tables-expectations) 88 | -------------------------------------------------------------------------------- /Generative AI/llm-rag-chatbot/_resources/LICENSE.py: -------------------------------------------------------------------------------- 1 | # Databricks notebook source 2 | # MAGIC %md 3 | # MAGIC ## Licence 4 | 5 | # COMMAND ---------- 6 | 7 | # MAGIC %md 8 | # MAGIC 9 | # MAGIC Copyright (2022) Databricks, Inc. 10 | # MAGIC 11 | # MAGIC This library (the "Software") may not be used except in connection with the Licensee's use of the Databricks Platform Services pursuant 12 | # MAGIC to an Agreement (defined below) between Licensee (defined below) and Databricks, Inc. ("Databricks"). The Object Code version of the 13 | # MAGIC Software shall be deemed part of the Downloadable Services under the Agreement, or if the Agreement does not define Downloadable Services, 14 | # MAGIC Subscription Services, or if neither are defined then the term in such Agreement that refers to the applicable Databricks Platform 15 | # MAGIC Services (as defined below) shall be substituted herein for “Downloadable Services.” Licensee's use of the Software must comply at 16 | # MAGIC all times with any restrictions applicable to the Downlodable Services and Subscription Services, generally, and must be used in 17 | # MAGIC accordance with any applicable documentation. For the avoidance of doubt, the Software constitutes Databricks Confidential Information 18 | # MAGIC under the Agreement. 19 | # MAGIC 20 | # MAGIC Additionally, and notwithstanding anything in the Agreement to the contrary: 21 | # MAGIC * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES 22 | # MAGIC OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE 23 | # MAGIC LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR 24 | # MAGIC IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 25 | # MAGIC * you may view, make limited copies of, and may compile the Source Code version of the Software into an Object Code version of the 26 | # MAGIC Software. For the avoidance of doubt, you may not make derivative works of Software (or make any any changes to the Source Code 27 | # MAGIC version of the unless you have agreed to separate terms with Databricks permitting such modifications (e.g., a contribution license 28 | # MAGIC agreement)). 29 | # MAGIC 30 | # MAGIC If you have not agreed to an Agreement or otherwise do not agree to these terms, you may not use the Software or view, copy or compile 31 | # MAGIC the Source Code of the Software. 32 | # MAGIC 33 | # MAGIC This license terminates automatically upon the termination of the Agreement or Licensee's breach of these terms. 
Additionally, 34 | # MAGIC Databricks may terminate this license at any time on notice. Upon termination, you must permanently delete the Software and all 35 | # MAGIC copies thereof (including the Source Code). 36 | # MAGIC 37 | # MAGIC Agreement: the agreement between Databricks and Licensee governing the use of the Databricks Platform Services, which shall be, with 38 | # MAGIC respect to Databricks, the Databricks Terms of Service located at www.databricks.com/termsofservice, and with respect to Databricks 39 | # MAGIC Community Edition, the Community Edition Terms of Service located at www.databricks.com/ce-termsofuse, in each case unless Licensee 40 | # MAGIC has entered into a separate written agreement with Databricks governing the use of the applicable Databricks Platform Services. 41 | # MAGIC 42 | # MAGIC Databricks Platform Services: the Databricks services or the Databricks Community Edition services, according to where the Software is used. 43 | # MAGIC 44 | # MAGIC Licensee: the user of the Software, or, if the Software is being used on behalf of a company, the company. 45 | # MAGIC 46 | # MAGIC Object Code: is version of the Software produced when an interpreter or a compiler translates the Source Code into recognizable and 47 | # MAGIC executable machine code. 48 | # MAGIC 49 | # MAGIC Source Code: the human readable portion of the Software. 50 | -------------------------------------------------------------------------------- /workshops/Course Agenda 2 Days.py: -------------------------------------------------------------------------------- 1 | # Databricks notebook source 2 | # MAGIC %md # Databricks Workshop » Agenda 3 | # MAGIC 4 | # MAGIC This notebook is a sample agenda of a Databricks workshop that can be used as a guidance for further customization. 5 | # MAGIC 6 | # MAGIC This agenda has been used in a 2-day workshop format for a group of data engineers, architects, data analysts and testers (with some automation skills). 7 | # MAGIC 8 | # MAGIC Duration: 2 days (8 hours / day) 9 | # MAGIC 10 | # MAGIC Recommended number of participants: 8-12 people 11 | 12 | # COMMAND ---------- 13 | 14 | # MAGIC %md ## Agenda 15 | 16 | # COMMAND ---------- 17 | 18 | # MAGIC %md 19 | # MAGIC 20 | # MAGIC The idea is to go over all the different types of artifacts that can be built using "New" menu: 21 | # MAGIC 22 | # MAGIC 1. Notebook 23 | # MAGIC 1. Repo 24 | # MAGIC 1. Data 25 | # MAGIC * File upload 26 | # MAGIC * Add data 27 | # MAGIC 1. Compute 28 | # MAGIC * Cluster 29 | # MAGIC * SQL Warehouse 30 | # MAGIC 1. SQL 31 | # MAGIC * Query 32 | # MAGIC * Dashoboard 33 | # MAGIC * Lakeview Dashboard 34 | # MAGIC * Alert 35 | # MAGIC 1. Data Engineering 36 | # MAGIC * Job 37 | # MAGIC * DLT Pipeline 38 | # MAGIC 1. Machine Learning (not covered) 39 | 40 | # COMMAND ---------- 41 | 42 | # MAGIC %md 43 | # MAGIC 44 | # MAGIC 1. Overview of Databricks Workspace for Data Engineers 45 | # MAGIC 1. Data Engineering with Spark SQL 46 | # MAGIC 1. Managing Data in Delta Lake Tables 47 | # MAGIC * DLT Pipelines 48 | # MAGIC * `dbfs` magic command 49 | # MAGIC 1. Databricks Workflows and Jobs 50 | # MAGIC 1. Developing Data Pipelines using Delta Live Tables 51 | # MAGIC * Converting Spark exercises to Databricks tools' mindset 52 | # MAGIC 1. Databricks SQL for Data Analytics 53 | # MAGIC 1. Unity Catalog 54 | # MAGIC 1. 
Setting up Development Environment 55 | # MAGIC * IntelliJ IDEA / PyCharm 56 | # MAGIC * Visual Studio Code 57 | # MAGIC * Databricks JDBC Driver 58 | # MAGIC * Databricks CLI 59 | # MAGIC * Databricks SQL Connector for Python 60 | # MAGIC 1. (optional) Databricks CI/CD 61 | # MAGIC * REST APIs 62 | # MAGIC * Terraform / Databricks Terraform Provider 63 | 64 | # COMMAND ---------- 65 | 66 | # MAGIC %md ## Preprequisites 67 | 68 | # COMMAND ---------- 69 | 70 | # MAGIC %md 71 | # MAGIC 72 | # MAGIC It is assumed that the course participants have got the following skills: 73 | # MAGIC 74 | # MAGIC 1. Familiarity with Apache Spark and/or PySpark 75 | # MAGIC 1. Familiarity with one of the following programming languages: 76 | # MAGIC * Python 77 | # MAGIC * Scala 78 | # MAGIC * SQL 79 | 80 | # COMMAND ---------- 81 | 82 | # MAGIC %md ## Schedule 83 | 84 | # COMMAND ---------- 85 | 86 | # MAGIC %md 87 | # MAGIC 88 | # MAGIC A class is split into 1-hour blocks with a 12-minute break each 89 | # MAGIC 90 | # MAGIC A day starts at 9am and ends at 4pm to let students have an extra 1 hour at the end of a day to work alone on exercises and have enough room for some cognitive work at its own pace and perhaps even ask questions 91 | # MAGIC 92 | # MAGIC Lunch breaks at 1pm for 1 hour 93 | 94 | # COMMAND ---------- 95 | 96 | # MAGIC %md 97 | # MAGIC 98 | # MAGIC * 9:00 – 9:48 (12’ break) 99 | # MAGIC * 10:00 – 10:48 (12’ break) 100 | # MAGIC * 11:00 – 11:48 (12’ break) 101 | # MAGIC * Lunch break (1h) 102 | # MAGIC * 13:00 – 13:48 (12’ break) 103 | # MAGIC * 14:00 – 14:48 (12’ break) 104 | # MAGIC * 15:00 – 15:48 (12’ break) 105 | # MAGIC * 16:00 – 17:00 a quiet working hour 106 | 107 | # COMMAND ---------- 108 | 109 | # MAGIC %md 110 | # MAGIC 111 | # MAGIC ## Databricks Workspace 112 | 113 | # COMMAND ---------- 114 | 115 | # MAGIC %md 116 | # MAGIC 117 | # MAGIC * A Databricks Workspace with Unity Catalog enabled 118 | # MAGIC * Myself as a workspace admin 119 | # MAGIC * An extra (fake) non-admin user account for testing and demo 120 | # MAGIC * DLT pipelines and workflows (jobs) 121 | -------------------------------------------------------------------------------- /demo/delta-live-tables/my_streaming_table.sql: -------------------------------------------------------------------------------- 1 | -- Databricks notebook source 2 | -- MAGIC %md # my_streaming_table DLT Table 3 | 4 | -- COMMAND ---------- 5 | 6 | -- MAGIC %md 7 | -- MAGIC 8 | -- MAGIC ## Why SQL Considered Superior (over Python) 9 | -- MAGIC 10 | -- MAGIC Unlike in Python, [Delta Live Tables SQL](https://docs.databricks.com/workflows/delta-live-tables/delta-live-tables-sql-ref.html) allow for: 11 | -- MAGIC 12 | -- MAGIC 1. Executing DLT notebooks for syntax analysis (using `Run all`) 13 | -- MAGIC 1. Markdown 😍 14 | -- MAGIC 15 | -- MAGIC You can still use SQL notebooks with Python notebooks in a single DLT pipeline. 16 | 17 | -- COMMAND ---------- 18 | 19 | -- MAGIC %md 20 | -- MAGIC 21 | -- MAGIC When executing the notebook in a DLT pipeline you will get this WARN message: 22 | -- MAGIC 23 | -- MAGIC ``` 24 | -- MAGIC Magic commands (e.g. %py, %sql and %run) are not supported with the exception of 25 | -- MAGIC %pip within a Python notebook. Cells containing magic commands are ignored. 
26 | -- MAGIC Unsupported magic commands were found in the following notebooks 27 | -- MAGIC 28 | -- MAGIC /Repos/jacek@japila.pl/learn-databricks/Delta Live Tables/auto_loader: %fs 29 | -- MAGIC ``` 30 | 31 | -- COMMAND ---------- 32 | 33 | -- MAGIC %md 34 | -- MAGIC 35 | -- MAGIC Let's start with an example non-DLT query with `cloud_files` TVF (Auto Loader). 36 | -- MAGIC 37 | -- MAGIC It won't work. 38 | -- MAGIC 39 | -- MAGIC `cloud_files` creates a streaming table while CTAS does not define one. 40 | 41 | -- COMMAND ---------- 42 | 43 | -- MAGIC %md 44 | -- MAGIC 45 | -- MAGIC The following won't work as is in a DLT notebook. 46 | -- MAGIC 47 | -- MAGIC ``` 48 | -- MAGIC Unable to process top-level query. DLT currently only accepts 'CREATE TEMPORARY LIVE VIEW', 'CREATE OR REFRESH LIVE TABLE', 'APPLY CHANGES INTO', and 'SET' statements. 49 | -- MAGIC ``` 50 | -- MAGIC 51 | -- MAGIC Don't forget to comment it out before executing the notebook in a DLT pipeline. 52 | 53 | -- COMMAND ---------- 54 | 55 | -- SELECT * FROM cloud_files("/databricks-datasets/retail-org/customers/", "csv") 56 | 57 | -- COMMAND ---------- 58 | 59 | -- MAGIC %md ## Schema Inference 60 | 61 | -- COMMAND ---------- 62 | 63 | -- MAGIC %md 64 | -- MAGIC 65 | -- MAGIC [Auto Loader](https://docs.databricks.com/workflows/delta-live-tables/delta-live-tables-data-sources.html#auto-loader): 66 | -- MAGIC 67 | -- MAGIC * You can use supported format options with Auto Loader. 68 | -- MAGIC * Use `map()` function to pass options to `cloud_files()` 69 | 70 | -- COMMAND ---------- 71 | 72 | -- MAGIC %md 73 | -- MAGIC 74 | -- MAGIC ```sql 75 | -- MAGIC CREATE OR REFRESH STREAMING LIVE TABLE 76 | -- MAGIC AS SELECT * 77 | -- MAGIC FROM cloud_files( 78 | -- MAGIC "", 79 | -- MAGIC "", 80 | -- MAGIC map( 81 | -- MAGIC "", "", " 98 | -- MAGIC AS SELECT * 99 | -- MAGIC FROM cloud_files( 100 | -- MAGIC "", 101 | -- MAGIC "", 102 | -- MAGIC map("schema", "title STRING, id INT, revisionId INT, revisionTimestamp TIMESTAMP, revisionUsername STRING, revisionUsernameId INT, text STRING") 103 | -- MAGIC ) 104 | -- MAGIC ``` 105 | 106 | -- COMMAND ---------- 107 | 108 | -- no "header", "true" by default 109 | CREATE OR REFRESH STREAMING LIVE TABLE raw_streaming_table( 110 | CONSTRAINT names_at_least_5_char_long EXPECT (name IS NOT NULL AND len(name) >= 5), 111 | CONSTRAINT ids_only_even EXPECT (id % 2 = 0) 112 | ) 113 | AS SELECT * FROM 114 | cloud_files( 115 | "${cloud_files_input_path}", 116 | "csv", 117 | map( 118 | "schema", "id INT, name STRING" 119 | ) 120 | ) 121 | -------------------------------------------------------------------------------- /Delta Live Tables/Storage location.sql: -------------------------------------------------------------------------------- 1 | -- Databricks notebook source 2 | -- MAGIC %md # Storage location 3 | 4 | -- COMMAND ---------- 5 | 6 | -- MAGIC %md 7 | -- MAGIC 8 | -- MAGIC **Storage location** can be specified explicitly by a user while creating a Delta Live Table pipeline or assigned automatically by the runtime. 9 | -- MAGIC 10 | -- MAGIC It can only be specified once and for the whole lifecycle of a DLT pipeline. It cannot be changed ever. 11 | -- MAGIC 12 | -- MAGIC If auto-assigned by the runtime, the storage location is under `dbfs:/pipelines` directory (in a directory with the same name as the pipeline ID). 13 | -- MAGIC 14 | -- MAGIC You can find out about the **Storage location** of a DLT pipeline in the **Pipeline settings > Destination** section in the UI. 
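-- COMMAND ----------

-- MAGIC %md
-- MAGIC A minimal sketch (assuming the auto-assigned `dbfs:/pipelines` root is in use) of finding the storage locations programmatically, in the spirit of the Data Quality Checks section further down:

-- COMMAND ----------

-- MAGIC %python
-- MAGIC
-- MAGIC # Each subdirectory under dbfs:/pipelines is named after a pipeline ID
-- MAGIC # and is that pipeline's auto-assigned storage location
-- MAGIC for entry in dbutils.fs.ls("dbfs:/pipelines/"):
-- MAGIC     print(entry.path)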
15 | 16 | -- COMMAND ---------- 17 | 18 | -- MAGIC %fs ls dbfs:/pipelines/960da65b-c9df-4cb9-9456-1005ffe103a9/ 19 | 20 | -- COMMAND ---------- 21 | 22 | -- MAGIC %md 23 | -- MAGIC 24 | -- MAGIC You can also find out about the Storage location of a DLT pipeline using [Delta Live Table API](https://docs.databricks.com/workflows/delta-live-tables/delta-live-tables-api-guide.html) directly or higher-level [Delta Live Tables CLI](https://docs.databricks.com/dev-tools/cli/dlt-cli.html) (`databricks pipelines`). 25 | -- MAGIC 26 | -- MAGIC ```console 27 | -- MAGIC $ databricks pipelines get --pipeline-id 960da65b-c9df-4cb9-9456-1005ffe103a9 | jq '.spec.storage' 28 | -- MAGIC "dbfs:/pipelines/960da65b-c9df-4cb9-9456-1005ffe103a9" 29 | -- MAGIC ``` 30 | 31 | -- COMMAND ---------- 32 | 33 | -- MAGIC %md 34 | -- MAGIC 35 | -- MAGIC [Databricks recommends always storing checkpoint and schema evolution information in storage locations managed by Unity Catalog](https://docs.databricks.com/ingestion/auto-loader/unity-catalog.html#specifying-locations-for-auto-loader-resources-for-unity-catalog) 36 | 37 | -- COMMAND ---------- 38 | 39 | -- MAGIC %md ## Autoloader Directory 40 | -- MAGIC 41 | -- MAGIC Contains schema evolution information 42 | 43 | -- COMMAND ---------- 44 | 45 | -- MAGIC %fs ls dbfs:/pipelines/960da65b-c9df-4cb9-9456-1005ffe103a9/autoloader 46 | 47 | -- COMMAND ---------- 48 | 49 | -- MAGIC %fs ls dbfs:/pipelines/960da65b-c9df-4cb9-9456-1005ffe103a9/autoloader/schema_1493166085_/_schemas 50 | 51 | -- COMMAND ---------- 52 | 53 | -- MAGIC %fs head dbfs:/pipelines/960da65b-c9df-4cb9-9456-1005ffe103a9/autoloader/schema_1493166085_/_schemas/0 54 | 55 | -- COMMAND ---------- 56 | 57 | -- MAGIC %md ## System Directory 58 | 59 | -- COMMAND ---------- 60 | 61 | -- MAGIC %fs ls dbfs:/pipelines/960da65b-c9df-4cb9-9456-1005ffe103a9/system/ 62 | 63 | -- COMMAND ---------- 64 | 65 | -- MAGIC %fs ls dbfs:/pipelines/960da65b-c9df-4cb9-9456-1005ffe103a9/system/events 66 | 67 | -- COMMAND ---------- 68 | 69 | -- MAGIC %md ## Events Delta Table 70 | 71 | -- COMMAND ---------- 72 | 73 | SELECT * FROM delta.`dbfs:/pipelines/a02952e6-7197-44a4-a072-5ea5124d7bce/system/events` 74 | 75 | -- COMMAND ---------- 76 | 77 | -- MAGIC %md ## Data Quality Checks 78 | 79 | -- COMMAND ---------- 80 | 81 | -- MAGIC %python 82 | -- MAGIC 83 | -- MAGIC pipelines = spark.createDataFrame(data = dbutils.fs.ls("dbfs:/pipelines/")) 84 | -- MAGIC path = pipelines.orderBy(pipelines["modificationTime"].desc()).select("path").head().path 85 | -- MAGIC spark.conf.set("pipeline.path", path) 86 | -- MAGIC print(spark.conf.get('pipeline.path')) 87 | 88 | -- COMMAND ---------- 89 | 90 | SELECT '${pipeline.path}' path 91 | 92 | -- COMMAND ---------- 93 | 94 | DESCRIBE delta.`${pipeline.path}/system/events` 95 | 96 | -- COMMAND ---------- 97 | 98 | -- MAGIC %md 99 | -- MAGIC 100 | -- MAGIC Inspired by [this article](https://www.linkedin.com/pulse/delta-live-tables-how-build-pipeline-run-data-quality-mathias-weber/) and feeling a bit adventurous to use some advanced "tools": 101 | -- MAGIC 102 | -- MAGIC * [Common Table Expression (CTE)](https://spark.apache.org/docs/latest/sql-ref-syntax-qry-select-cte.html) 103 | -- MAGIC * [JSON path expression](https://docs.databricks.com/sql/language-manual/sql-ref-json-path-expression.html) 104 | 105 | -- COMMAND ---------- 106 | 107 | WITH data_quality AS ( 108 | WITH details AS ( 109 | SELECT 110 | id update_id, 111 | details:flow_progress:data_quality:expectations 112 | FROM 
delta.`dbfs:/pipelines/05740fff-c03e-4366-8061-2680f9e9ce48/system/events` 113 | WHERE event_type = 'flow_progress' 114 | ) 115 | SELECT 116 | update_id, 117 | explode(from_json(expectations, "array>")) expectations 118 | FROM details 119 | WHERE expectations IS NOT NULL 120 | ) 121 | SELECT update_id, expectations.* FROM data_quality 122 | -------------------------------------------------------------------------------- /workshops/Databricks Workshop Day 3.sql: -------------------------------------------------------------------------------- 1 | -- Databricks notebook source 2 | -- MAGIC %md # Databricks Workshop Day 3 3 | -- MAGIC 4 | -- MAGIC Duration: 4.5 hours (9:30-14:00) 5 | 6 | -- COMMAND ---------- 7 | 8 | -- MAGIC %md ## Schedule 9 | 10 | -- COMMAND ---------- 11 | 12 | -- MAGIC %md 13 | -- MAGIC 14 | -- MAGIC * The class starts at 9:30 15 | -- MAGIC * A class is split into 1-hour blocks with a 12-minute break each 16 | -- MAGIC * Breaks at the end of an hour 17 | -- MAGIC * However, the first 20' break is at 10:30 (till 10:50) 18 | 19 | -- COMMAND ---------- 20 | 21 | -- MAGIC %md ## Agenda 22 | 23 | -- COMMAND ---------- 24 | 25 | -- MAGIC %md 26 | -- MAGIC 27 | -- MAGIC 1. (spark) [The Internals of Structured Query Execution](https://jaceklaskowski.github.io/spark-workshop/slides/spark-sql-internals-of-structured-query-execution.html) 28 | -- MAGIC * how to read query plans and what to pay attention to 29 | -- MAGIC * a general overview 30 | -- MAGIC * analysis using the load into silver and MOLka as an example 31 | -- MAGIC * [Exercise: How to add days (as values of a column) to date?](https://jaceklaskowski.github.io/spark-workshop/exercises/sql/How-to-add-days-as-values-of-a-column-to-date.html) 32 | -- MAGIC * [Exercise: split function with variable delimiter per row](https://jaceklaskowski.github.io/spark-workshop/exercises/sql/split-function-with-variable-delimiter-per-row.html) 33 | -- MAGIC 1. (spark) Narrow and Wide Transformations 34 | -- MAGIC * [Basic Aggregation](https://jaceklaskowski.github.io/spark-workshop/slides/spark-sql-basic-aggregation.html#/home) 35 | -- MAGIC * [Exercise: Finding Ids of Rows with Word in Array Column](https://jaceklaskowski.github.io/spark-workshop/exercises/sql/Finding-Ids-of-Rows-with-Word-in-Array-Column.html) 36 | -- MAGIC * [Windowed Aggregation](https://jaceklaskowski.github.io/spark-workshop/slides/spark-sql-windowed-aggregation.html#/home) 37 | -- MAGIC * [Exercise: Finding 1st and 2nd Bestsellers Per Genre](https://jaceklaskowski.github.io/spark-workshop/exercises/sql/Finding-1st-and-2nd-Bestsellers-Per-Genre.html) 38 | -- MAGIC * [Exercise: Calculating Gap Between Current And Highest Salaries Per Department](https://jaceklaskowski.github.io/spark-workshop/exercises/sql/Calculating-Gap-Between-Current-And-Highest-Salaries-Per-Department.html) 39 | -- MAGIC * [Exercise: Calculating Difference Between Consecutive Rows Per Window](https://jaceklaskowski.github.io/spark-workshop/exercises/sql/Calculating-Difference-Between-Consecutive-Rows-Per-Window.html) 40 | -- MAGIC * [Demo: Dynamic Partition Pruning](https://books.japila.pl/spark-sql-internals/demo/dynamic-partition-pruning/) 41 | -- MAGIC 1.
(spark) Data Shuffle 42 | -- MAGIC * how to check whether shuffles show up in a query plan 43 | -- MAGIC * in which situations they occur and how we can counteract them 44 | -- MAGIC * we can take a look at the execution plan of our load 45 | -- MAGIC * when it is worth using [Bucketing](https://books.japila.pl/spark-sql-internals/bucketing/) 46 | -- MAGIC * [Demo: ObjectHashAggregateExec and Sort-Based Fallback Tasks](https://books.japila.pl/spark-sql-internals/demo/objecthashaggregateexec-sort-based-fallback-tasks/) 47 | -- MAGIC * [Demo: Spilling](https://books.japila.pl/spark-sql-internals/demo/spilling/) 48 | -- MAGIC 1. (spark) [Joins](https://jaceklaskowski.github.io/spark-workshop/slides/spark-sql-joins.html) 49 | -- MAGIC * joins, hints, etc. (broadcasts), and how to deal with joins of large tables 50 | -- MAGIC * range joins -> whether to use them, how to use them, all the partitioning options, etc. 51 | -- MAGIC * [Bloom Filter Join](https://books.japila.pl/spark-sql-internals/bloom-filter-join/) 52 | -- MAGIC * [Runtime Filtering](https://books.japila.pl/spark-sql-internals/runtime-filtering/) 53 | -- MAGIC * [Exercise: Finding Most Populated Cities Per Country](https://jaceklaskowski.github.io/spark-workshop/exercises/sql/Finding-Most-Populated-Cities-Per-Country.html) 54 | -- MAGIC * [Exercise: Selecting the most important rows per assigned priority](https://jaceklaskowski.github.io/spark-workshop/exercises/sql/selecting-the-most-important-rows-per-assigned-priority.html) 55 | -- MAGIC 1. [Adaptive Query Execution (AQE)](https://books.japila.pl/spark-sql-internals/adaptive-query-execution/) 56 | -- MAGIC 1. (delta lake/databricks) [Change Data Feed / Change Data Capture](https://books.japila.pl/delta-lake-internals/change-data-feed/) 57 | -- MAGIC * "Pure" Delta Lake (not Delta Live Tables) 58 | -- MAGIC * Medallion Architecture 59 | -- MAGIC * The Gold layer based on CDF of (a JOIN query of) tables from the Silver layer 60 | -- MAGIC * [Use Delta Lake change data feed on Databricks](https://docs.databricks.com/en/delta/delta-change-data-feed.html) 61 | -- MAGIC * [Demo: Change Data Feed](https://books.japila.pl/delta-lake-internals/demo/change-data-feed/) 62 | -- MAGIC * [Notebook example: Propagate changes with Delta change data feed](https://docs.databricks.com/en/delta/delta-change-data-feed.html#notebook-example-propagate-changes-with-delta-change-data-feed) 63 | -- MAGIC 64 | -------------------------------------------------------------------------------- /Delta Live Tables/Building Delta Live Tables pipelines with SQL.sql: -------------------------------------------------------------------------------- 1 | -- Databricks notebook source 2 | -- MAGIC %md 3 | -- MAGIC 4 | -- MAGIC # Building Data Pipelines with Delta Live Tables using SQL 5 | -- MAGIC 6 | -- MAGIC [Meetup](https://www.meetup.com/warsaw-data-engineering/events/291905799/) 7 | -- MAGIC 8 | -- MAGIC Delta Live Tables extends functionality of Apache Spark's Structured Streaming and allows you to write just a few lines of declarative Python or SQL to deploy a production-quality data pipeline (from [Tutorial: ingesting data with Databricks Auto Loader](https://docs.databricks.com/ingestion/auto-loader/index.html#tutorial-ingesting-data-with-databricks-auto-loader)).
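-- MAGIC
-- MAGIC As a taste of how declarative this gets, here is a minimal sketch (the table name is made up) in the spirit of the `raw_streaming_table` example elsewhere in this repo:
-- MAGIC
-- MAGIC ```sql
-- MAGIC CREATE OR REFRESH STREAMING LIVE TABLE raw_customers
-- MAGIC COMMENT 'Bronze layer ingested incrementally with Auto Loader'
-- MAGIC AS SELECT *
-- MAGIC FROM cloud_files("/databricks-datasets/retail-org/customers/", "csv")
-- MAGIC ```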
9 | -- MAGIC 10 | -- MAGIC Learn more in [Delta Live Tables SQL language reference](https://docs.databricks.com/workflows/delta-live-tables/delta-live-tables-sql-ref.html) 11 | 12 | -- COMMAND ---------- 13 | 14 | -- MAGIC %md ## Introduction 15 | -- MAGIC 16 | -- MAGIC * Delta Live Tables supports only SQL and Python (You cannot use JVM libraries in a DLT pipeline) 17 | 18 | -- COMMAND ---------- 19 | 20 | -- MAGIC %md 21 | -- MAGIC 22 | -- MAGIC ## CREATE LIVE TABLE 23 | -- MAGIC 24 | -- MAGIC * `CREATE OR REFRESH [TEMPORARY] { STREAMING LIVE TABLE | LIVE TABLE } table_name` 25 | -- MAGIC * This Delta Live Tables query is syntactically valid, but you must create a pipeline in order to define and populate your table. 26 | 27 | -- COMMAND ---------- 28 | 29 | -- MAGIC %md ## CREATE TEMPORARY LIVE VIEW 30 | -- MAGIC 31 | -- MAGIC * `CREATE TEMPORARY [STREAMING] LIVE VIEW view_name` 32 | 33 | -- COMMAND ---------- 34 | 35 | -- MAGIC %md ## TEMPORARY 36 | -- MAGIC 37 | -- MAGIC [SQL properties](https://docs.databricks.com/workflows/delta-live-tables/delta-live-tables-sql-ref.html#sql-properties-1): 38 | -- MAGIC 39 | -- MAGIC * `TEMPORARY` creates a temporary table or view. No metadata is persisted for this table. 40 | -- MAGIC * Use TEMPORARY marker to prevent publishing of intermediate tables that are not intended for external consumption (discussed later in this notebook) 41 | 42 | -- COMMAND ---------- 43 | 44 | -- MAGIC %md ## STREAMING 45 | -- MAGIC 46 | -- MAGIC * Creates a table or view that reads an input dataset as a stream 47 | -- MAGIC * Input dataset must be a streaming data source, e.g. [Auto Loader](https://docs.databricks.com/ingestion/auto-loader/index.html) (discussed lated in this notebook) or a `STREAMING LIVE` table or view. 48 | 49 | -- COMMAND ---------- 50 | 51 | -- MAGIC %md 52 | -- MAGIC 53 | -- MAGIC ## Identity Columns 54 | -- MAGIC 55 | -- MAGIC `CREATE LIVE TABLE` supports `GENERATED ALWAYS AS` clause (see [CREATE TABLE SQL reference](https://docs.databricks.com/sql/language-manual/sql-ref-syntax-ddl-create-table-using.html)). 56 | -- MAGIC 57 | -- MAGIC `GENERATED ALWAYS AS IDENTITY` clause can only be used for columns with BIGINT data type. 58 | 59 | -- COMMAND ---------- 60 | 61 | -- MAGIC %md ## Table properties 62 | -- MAGIC 63 | -- MAGIC [Table properties](https://docs.databricks.com/workflows/delta-live-tables/delta-live-tables-sql-ref.html#tbl-properties) 64 | 65 | -- COMMAND ---------- 66 | 67 | -- MAGIC %md 68 | -- MAGIC 69 | -- MAGIC ## Publish data from Delta Live Tables pipelines 70 | -- MAGIC 71 | -- MAGIC [Publish data from Delta Live Tables pipelines](https://docs.databricks.com/workflows/delta-live-tables/delta-live-tables-publish.html): 72 | -- MAGIC 73 | -- MAGIC 1. make the output data of your pipeline discoverable and available to query by publishing datasets to the Databricks metastore. 74 | -- MAGIC 1. enter a database name in the Target field when you create a pipeline 75 | -- MAGIC 1. No support for publishing tables to Unity Catalog. Delta Live Tables supports publishing tables only to the workspace-level Hive metastore. 76 | -- MAGIC 1. only tables and associated metadata are published. Views are not published to the metastore (because they are temporary by definition). 77 | -- MAGIC 1. Use `TEMPORARY` marker to prevent publishing of intermediate tables that are not intended for external consumption 78 | -- MAGIC ```sql 79 | -- MAGIC CREATE TEMPORARY LIVE TABLE temp_table 80 | -- MAGIC AS SELECT ... 
; 81 | -- MAGIC ``` 82 | 83 | -- COMMAND ---------- 84 | 85 | --- Pipeline settings > Destination > Target schema 86 | SHOW TABLES IN jaceklaskowski_dlts; 87 | 88 | -- COMMAND ---------- 89 | 90 | select * from jaceklaskowski_dlts.dlt_one; 91 | 92 | -- COMMAND ---------- 93 | 94 | DESCRIBE EXTENDED jaceklaskowski_dlts.dlt_one; 95 | 96 | -- COMMAND ---------- 97 | 98 | describe history jaceklaskowski_dlts.dlt_one; 99 | 100 | -- COMMAND ---------- 101 | 102 | -- MAGIC %md 103 | -- MAGIC 104 | -- MAGIC ## The End 105 | 106 | -- COMMAND ---------- 107 | 108 | -- MAGIC %md 109 | -- MAGIC 110 | -- MAGIC ## Pipeline updates 111 | -- MAGIC 112 | -- MAGIC [Pipeline updates](https://docs.databricks.com/workflows/delta-live-tables/delta-live-tables-concepts.html#dlt-concepts-updates) 113 | -- MAGIC 114 | -- MAGIC An **update** does the following: 115 | -- MAGIC 116 | -- MAGIC 1. Starts a cluster with the correct configuration. 117 | -- MAGIC 1. Discovers all the tables and views defined, and checks for any analysis errors such as invalid column names, missing dependencies, and syntax errors. 118 | -- MAGIC 1. Creates or updates tables and views with the most recent data available. 119 | 120 | -- COMMAND ---------- 121 | 122 | -- MAGIC %md 123 | -- MAGIC 124 | -- MAGIC ## Delta Live Tables FAQ 125 | -- MAGIC 126 | -- MAGIC [Delta Live Tables frequently asked questions](https://docs.databricks.com/workflows/delta-live-tables/delta-live-tables-faqs-issues.html) 127 | -------------------------------------------------------------------------------- /meetups/MLflow on Databricks.py: -------------------------------------------------------------------------------- 1 | # Databricks notebook source 2 | # MAGIC %md 3 | # MAGIC 4 | # MAGIC https://learn.microsoft.com/en-us/azure/databricks/machine-learning/manage-model-lifecycle/ 5 | 6 | # COMMAND ---------- 7 | 8 | # MAGIC %pip install --upgrade "mlflow-skinny[databricks]" 9 | 10 | # COMMAND ---------- 11 | 12 | # MAGIC %restart_python 13 | 14 | # COMMAND ---------- 15 | 16 | import mlflow 17 | mlflow.get_registry_uri() 18 | 19 | # COMMAND ---------- 20 | 21 | from sklearn import datasets 22 | from sklearn.ensemble import RandomForestClassifier 23 | 24 | # Train a sklearn model on the iris dataset 25 | X, y = datasets.load_iris(return_X_y=True, as_frame=True) 26 | clf = RandomForestClassifier(max_depth=7) 27 | clf.fit(X, y) 28 | 29 | # Note that the UC model name follows the pattern 30 | # .., corresponding to 31 | # the catalog, schema, and registered model name 32 | # in Unity Catalog under which to create the version 33 | # The registered model will be created if it doesn't already exist 34 | autolog_run = mlflow.last_active_run() 35 | print(autolog_run) 36 | 37 | # COMMAND ---------- 38 | 39 | model_run = mlflow.active_run() 40 | 41 | # COMMAND ---------- 42 | 43 | print(model_run.info) 44 | 45 | # COMMAND ---------- 46 | 47 | model_uri = "runs:/{}/model".format(model_run.info.run_id) 48 | mlflow.register_model(model_uri, "iris_model") 49 | 50 | # COMMAND ---------- 51 | 52 | # MAGIC %md 53 | # MAGIC 54 | # MAGIC # Notes 55 | # MAGIC 56 | # MAGIC Recommended Training: [Use MLflow in Azure Databricks](https://learn.microsoft.com/en-us/training/modules/mlflow-azure-databricks/) 57 | # MAGIC 58 | # MAGIC > Prerequisites 59 | # MAGIC > 60 | # MAGIC > Before starting this module, you should be familiar with Azure Databricks and the **machine learning model training process**. 
61 | # MAGIC 62 | # MAGIC This "machine learning model training process" is very important. 63 | 64 | # COMMAND ---------- 65 | 66 | # MAGIC %md 67 | # MAGIC # Run experiments with MLflow 68 | # MAGIC 69 | # MAGIC [Run experiments with MLflow](https://learn.microsoft.com/en-us/training/modules/mlflow-azure-databricks/3-run-experiments) 70 | 71 | # COMMAND ---------- 72 | 73 | # MAGIC %md 74 | # MAGIC 75 | # MAGIC **MLflow experiments** allow data scientists to track training runs in a collection called an **experiment**. 76 | # MAGIC 77 | # MAGIC **Experiment runs** are useful for the following: 78 | # MAGIC 79 | # MAGIC 1. Compare changes over time. 80 | # MAGIC 1. Compare the relative performance of models with different hyperparameter values. 81 | 82 | # COMMAND ---------- 83 | 84 | # MAGIC %md 85 | # MAGIC 86 | # MAGIC Follow up in [Running an experiment](https://learn.microsoft.com/en-us/training/modules/mlflow-azure-databricks/3-run-experiments) 87 | 88 | # COMMAND ---------- 89 | 90 | # MAGIC %md 91 | # MAGIC # MLflow 92 | 93 | # COMMAND ---------- 94 | 95 | # MAGIC %md 96 | # MAGIC 97 | # MAGIC From "Introduction" in [Use MLflow in Azure Databricks](https://learn.microsoft.com/en-us/training/modules/mlflow-azure-databricks/1-introduction) training: 98 | # MAGIC 99 | # MAGIC 1. MLflow is an open source platform for end-to-end machine learning operations. 100 | # MAGIC 1. Using MLflow, data scientists can track model training experiments; logging parameters, metrics, and other assets. 101 | # MAGIC 1. Machine learning engineers can use MLflow to deploy and manage models, enabling applications to consume the models and use them to inference predictions for new data. 102 | 103 | # COMMAND ---------- 104 | 105 | # MAGIC %md 106 | # MAGIC ## Capabilities of MLflow 107 | # MAGIC 108 | # MAGIC There are four components to MLflow: 109 | # MAGIC 110 | # MAGIC 1. MLflow Tracking 111 | # MAGIC 1. MLflow Projects 112 | # MAGIC 1. MLflow Models 113 | # MAGIC 1. MLflow Model Registry 114 | 115 | # COMMAND ---------- 116 | 117 | # MAGIC %md 118 | # MAGIC ## MLflow Tracking 119 | # MAGIC 120 | # MAGIC * **MLflow Tracking** allows data scientists to work with experiments in which they process and analyze data or train machine learning models. 121 | # MAGIC * For each run in an experiment, a data scientist can log parameter values, versions of libraries used, model evaluation metrics, and generated output files; including images of data visualizations and model files. 122 | # MAGIC * This ability to log important details about experiment runs makes it possible to audit and compare the results of prior model training executions. 123 | 124 | # COMMAND ---------- 125 | 126 | # MAGIC %md 127 | # MAGIC 128 | # MAGIC ## MLflow Projects 129 | # MAGIC 130 | # MAGIC 1. An MLflow Project is a way of packaging up code for consistent deployment and reproducibility of results. 131 | # MAGIC 1. MLflow supports several environments for projects, including the use of Conda and Docker to define consistent Python code execution environments. 132 | 133 | # COMMAND ---------- 134 | 135 | # MAGIC %md 136 | # MAGIC 137 | # MAGIC ## MLflow Models 138 | # MAGIC 139 | # MAGIC 1. MLflow offers a standardized format for packaging models for distribution. 140 | # MAGIC 1. This standardized model format allows MLflow to work with models generated from several popular libraries, including Scikit-Learn, PyTorch, MLlib, and others. 
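# MAGIC
# MAGIC A minimal sketch (reusing the `clf` model and `X` features from the cells above) of what this packaging looks like in code:
# MAGIC
# MAGIC ```python
# MAGIC import mlflow
# MAGIC import mlflow.sklearn
# MAGIC
# MAGIC # Log the sklearn model in the standardized MLflow Model format
# MAGIC with mlflow.start_run() as run:
# MAGIC     mlflow.sklearn.log_model(clf, "model")
# MAGIC
# MAGIC # Load it back through the generic pyfunc flavor and score new data
# MAGIC loaded = mlflow.pyfunc.load_model(f"runs:/{run.info.run_id}/model")
# MAGIC predictions = loaded.predict(X)
# MAGIC ```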
141 | # MAGIC 142 | # MAGIC Learn more in [MLflow Models](https://mlflow.org/docs/latest/model) 143 | 144 | # COMMAND ---------- 145 | 146 | # MAGIC %md 147 | # MAGIC ## MLflow Model Registry 148 | # MAGIC 149 | # MAGIC 1. The MLflow Model Registry allows data scientists to register trained models. 150 | # MAGIC 1. MLflow Models and MLflow Projects use the MLflow Model Registry to enable machine learning engineers to deploy and serve models for client applications to consume. 151 | -------------------------------------------------------------------------------- /Delta Lake/Generated Columns.sql: -------------------------------------------------------------------------------- 1 | -- Databricks notebook source 2 | -- MAGIC %md # Generated Columns 3 | -- MAGIC 4 | -- MAGIC [CREATE TABLE USING](https://docs.databricks.com/sql/language-manual/sql-ref-syntax-ddl-create-table-using.html) 5 | 6 | -- COMMAND ---------- 7 | 8 | CREATE SCHEMA IF NOT EXISTS jacek_laskowski; 9 | USE jacek_laskowski; 10 | 11 | -- COMMAND ---------- 12 | 13 | CREATE OR REPLACE TABLE generated_columns ( 14 | id BIGINT GENERATED ALWAYS AS IDENTITY, 15 | name STRING, 16 | five_by_default INT GENERATED ALWAYS AS (5)) 17 | USING delta 18 | 19 | -- COMMAND ---------- 20 | 21 | -- MAGIC %md # SHOW CREATE TABLE 22 | -- MAGIC 23 | -- MAGIC `SHOW CREATE TABLE` seems the only way to find out generated columns. 24 | 25 | -- COMMAND ---------- 26 | 27 | SHOW CREATE TABLE generated_columns 28 | 29 | -- COMMAND ---------- 30 | 31 | DESC TABLE EXTENDED generated_columns id 32 | 33 | -- COMMAND ---------- 34 | 35 | -- MAGIC %md # Column Metadata 36 | 37 | -- COMMAND ---------- 38 | 39 | -- MAGIC %scala 40 | -- MAGIC 41 | -- MAGIC import org.apache.spark.sql.connector.catalog.TableCatalog 42 | -- MAGIC import org.apache.spark.sql.connector.catalog.Identifier 43 | -- MAGIC val table = spark.sessionState.catalogManager.currentCatalog.asInstanceOf[TableCatalog].loadTable(Identifier.of(Array("default"), "generated_columns")) 44 | -- MAGIC 45 | -- MAGIC import com.databricks.sql.transaction.tahoe.catalog.DeltaTableV2 46 | -- MAGIC table.asInstanceOf[DeltaTableV2].snapshot.tableDataSchema.map(_.metadata).foreach(println) 47 | 48 | -- COMMAND ---------- 49 | 50 | -- MAGIC %md # Using High-Level API 51 | -- MAGIC 52 | -- MAGIC Using the Developer API seems fruitless as column metadata (where generated column expressions are stored) is cleared up :( 53 | 54 | -- COMMAND ---------- 55 | 56 | -- MAGIC %scala 57 | -- MAGIC 58 | -- MAGIC import io.delta.tables.DeltaTable 59 | -- MAGIC val dt = DeltaTable.forName("generated_columns") 60 | -- MAGIC display(dt.toDF.schema.map(c => (c.name, c.dataType.sql, c.metadata.json)).toDF("name", "dataType", "metadata")) 61 | 62 | -- COMMAND ---------- 63 | 64 | -- MAGIC %scala 65 | -- MAGIC 66 | -- MAGIC val table = spark.sharedState.externalCatalog.getTable("default", "generated_columns") 67 | -- MAGIC import org.apache.spark.sql.functions.from_json 68 | -- MAGIC import org.apache.spark.sql.types._ 69 | -- MAGIC val metadata_schema = StructType.fromDDL("metadata map") 70 | -- MAGIC val schemaDf = table.schema 71 | -- MAGIC .map(c => (c.name, c.dataType.sql, c.metadata.json)) 72 | -- MAGIC .toDF("name", "dataType", "metadata") 73 | -- MAGIC // FIXME How to display metadata as JSON using metadata_schema 74 | -- MAGIC //.withColumn("metadata", from_json($"metadata", metadata_schema)) 75 | -- MAGIC display(schemaDf) 76 | 77 | -- COMMAND ---------- 78 | 79 | -- MAGIC %md # GENERATED AS IDENTITY Clause 80 | -- MAGIC 81 
| -- MAGIC ```antlr 82 | -- MAGIC GENERATED { ALWAYS | BY DEFAULT } AS IDENTITY [ ( [ START WITH start ] [ INCREMENT BY step ] ) ] 83 | -- MAGIC ``` 84 | -- MAGIC 85 | -- MAGIC When you write to the table, and do not provide values for the identity column, it will be automatically assigned a unique and statistically increasing (or decreasing if step is negative) value. This clause is only supported for Delta Lake tables. This clause can only be used for columns with BIGINT data type. 86 | -- MAGIC 87 | -- MAGIC Assigned values are unique but are not guaranteed to be contiguous. Both parameters are optional, and the default value is 1. step cannot be 0. 88 | -- MAGIC 89 | -- MAGIC If the automatically assigned values are beyond the range of the identity column type, the query will fail. 90 | 91 | -- COMMAND ---------- 92 | 93 | -- MAGIC %md ## BY DEFAULT 94 | 95 | -- COMMAND ---------- 96 | 97 | CREATE TABLE ident ( 98 | name string, 99 | id BIGINT GENERATED BY DEFAULT AS IDENTITY (START WITH 0 INCREMENT BY 1) 100 | ); 101 | INSERT INTO ident (name) VALUES ('Juli'), ('Mark'); 102 | INSERT INTO ident (name, id) VALUES ('Dave', 5); 103 | SELECT * FROM ident; 104 | 105 | -- COMMAND ---------- 106 | 107 | INSERT INTO ident (name) VALUES ('Jacek'); 108 | 109 | -- COMMAND ---------- 110 | 111 | select * from ident; 112 | 113 | -- COMMAND ---------- 114 | 115 | INSERT INTO ident (name) VALUES ('Agata'); 116 | 117 | -- COMMAND ---------- 118 | 119 | select * from ident ORDER BY id; 120 | 121 | -- COMMAND ---------- 122 | 123 | -- MAGIC %md What's going to be `ID` value now? 124 | 125 | -- COMMAND ---------- 126 | 127 | INSERT INTO ident (name) VALUES ("Guess what's the ID?"); 128 | SELECT * FROM ident; 129 | 130 | -- COMMAND ---------- 131 | 132 | -- MAGIC %md ## ALWAYS 133 | 134 | -- COMMAND ---------- 135 | 136 | -- MAGIC %md 137 | -- MAGIC 138 | -- MAGIC * When ALWAYS is used, you cannot provide your own values for the identity column. 139 | 140 | -- COMMAND ---------- 141 | 142 | -- MAGIC %md # GENERATED vs DEFAULT 143 | 144 | -- COMMAND ---------- 145 | 146 | -- MAGIC %md 147 | -- MAGIC 148 | -- MAGIC `GENERATED ALWAYS AS` 149 | -- MAGIC 150 | -- MAGIC `GENERATED AS IDENTITY` 151 | -- MAGIC 152 | -- MAGIC * This clause is only supported for Delta Lake tables 153 | 154 | -- COMMAND ---------- 155 | 156 | -- MAGIC %md 157 | -- MAGIC 158 | -- MAGIC `DEFAULT default_expression` 159 | -- MAGIC 160 | -- MAGIC * Defines a `DEFAULT` value for the column which is used on `INSERT`, `UPDATE`, and `MERGE ... INSERT` when the column is not specified. 161 | -- MAGIC * Supported for CSV, JSON, PARQUET, and ORC sources 162 | -- MAGIC * If no default is specified `DEFAULT NULL` is applied for nullable columns. 
163 | -- MAGIC * `default_expression` may be composed of literals, and built-in SQL functions or operators 164 | -------------------------------------------------------------------------------- /Data Visualization/Data Visualization on Databricks.py: -------------------------------------------------------------------------------- 1 | # Databricks notebook source 2 | # MAGIC %md # Data Visualization on Databricks 3 | 4 | # COMMAND ---------- 5 | 6 | # MAGIC %md 7 | # MAGIC 8 | # MAGIC Inspired by the course [Introduction to Python for Data Science and Data Engineering](https://www.databricks.com/training/catalog/introduction-to-python-for-data-science-and-data-engineering-969) 9 | 10 | # COMMAND ---------- 11 | 12 | # MAGIC %md 13 | # MAGIC 14 | # MAGIC ## pandas 15 | 16 | # COMMAND ---------- 17 | 18 | # MAGIC %md 19 | # MAGIC 20 | # MAGIC ### pandas.core.frame.DataFrame 21 | 22 | # COMMAND ---------- 23 | 24 | import pandas as pd 25 | 26 | # COMMAND ---------- 27 | 28 | pd.__version__ 29 | 30 | # COMMAND ---------- 31 | 32 | # MAGIC %pip install --upgrade pandas 33 | 34 | # COMMAND ---------- 35 | 36 | dbutils.library.restartPython() 37 | 38 | # COMMAND ---------- 39 | 40 | import pandas as pd 41 | pd.__version__ 42 | 43 | # COMMAND ---------- 44 | 45 | # data = a list of lists 46 | data = [ 47 | [0, 'she', 'She Senior Dev'], 48 | [1, 'he', 'He Junior Dev'], 49 | [2, 'them', 'Them Python Dev'], 50 | ] 51 | columns = ['id', 'name', 'role'] 52 | 53 | # COMMAND ---------- 54 | 55 | pandas_dataframe = pd.DataFrame(data=data, columns=columns) 56 | 57 | # COMMAND ---------- 58 | 59 | type(pandas_dataframe) 60 | 61 | # COMMAND ---------- 62 | 63 | display(pandas_dataframe) 64 | 65 | # COMMAND ---------- 66 | 67 | # MAGIC %md 68 | # MAGIC 69 | # MAGIC ### pandas.core.series.Series 70 | 71 | # COMMAND ---------- 72 | 73 | type(pandas_dataframe['id']) 74 | 75 | # COMMAND ---------- 76 | 77 | display(pandas_dataframe['id']) 78 | 79 | # COMMAND ---------- 80 | 81 | # Good ol' map in Functional Programming (FP) 82 | pandas_dataframe['id'] + 1 * 2 83 | 84 | # COMMAND ---------- 85 | 86 | # MAGIC %md 87 | # MAGIC 88 | # MAGIC ## Spark SQL 89 | 90 | # COMMAND ---------- 91 | 92 | # MAGIC %md 93 | # MAGIC 94 | # MAGIC ### DataFrame 95 | 96 | # COMMAND ---------- 97 | 98 | # Guess what happens in PySpark with no data provided for a cell 99 | schema='id long, name string, role string' 100 | pyspark_dataframe = spark.createDataFrame(data=data, schema=schema) 101 | 102 | # COMMAND ---------- 103 | 104 | type(pyspark_dataframe) 105 | 106 | # COMMAND ---------- 107 | 108 | # MAGIC %md 109 | # MAGIC 110 | # MAGIC ## Databricks Built-in Visualizations 111 | 112 | # COMMAND ---------- 113 | 114 | display(pyspark_dataframe) 115 | 116 | # COMMAND ---------- 117 | 118 | display(pyspark_dataframe) 119 | 120 | # COMMAND ---------- 121 | 122 | # MAGIC %md 123 | # MAGIC 124 | # MAGIC ## Python Visualization Libraries 125 | 126 | # COMMAND ---------- 127 | 128 | # MAGIC %md 129 | # MAGIC 130 | # MAGIC On Day 2 of the [Introduction to Python for Data Science and Data Engineering](https://www.databricks.com/training/catalog/introduction-to-python-for-data-science-and-data-engineering-969) course, Databricks introduces [pandas.DataFrame.hist](https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.hist.html). 131 | # MAGIC 132 | # MAGIC > A histogram is a representation of the distribution of data. 
This function calls `matplotlib.pyplot.hist()`, on each series in the DataFrame, resulting in one histogram per column. 133 | 134 | # COMMAND ---------- 135 | 136 | # MAGIC %md 137 | # MAGIC 138 | # MAGIC In the highly-applauded book by the author of pandas [Python for Data Analysis: Data Wrangling with pandas, NumPy, and Jupyter](https://www.amazon.com/Python-Data-Analysis-Wrangling-Jupyter-dp-109810403X/dp/109810403X), one of the take-aways is: 139 | # MAGIC 140 | # MAGIC > Create informative visualizations with matplotlib 141 | 142 | # COMMAND ---------- 143 | 144 | # MAGIC %md 145 | # MAGIC 146 | # MAGIC ## Matplotlib 147 | # MAGIC 148 | # MAGIC [Matplotlib: Visualization with Python](https://matplotlib.org/): 149 | # MAGIC 150 | # MAGIC > **Matplotlib** is a comprehensive library for creating static, animated, and interactive visualizations in Python. Matplotlib makes easy things easy and hard things possible. 151 | 152 | # COMMAND ---------- 153 | 154 | # MAGIC %md 155 | # MAGIC 156 | # MAGIC ## pandas.DataFrame.hist 157 | 158 | # COMMAND ---------- 159 | 160 | pandas_dataframe['id'].hist() 161 | 162 | # COMMAND ---------- 163 | 164 | # MAGIC %md 165 | # MAGIC 166 | # MAGIC ## Seaborn 167 | # MAGIC 168 | # MAGIC [seaborn: statistical data visualization](https://seaborn.pydata.org/): 169 | # MAGIC 170 | # MAGIC > **Seaborn** is a Python data visualization library based on [matplotlib](https://matplotlib.org/). It provides a high-level interface for drawing attractive and informative statistical graphics. 171 | 172 | # COMMAND ---------- 173 | 174 | # MAGIC %md 175 | # MAGIC 176 | # MAGIC ## Visualizations in Databricks notebooks 177 | # MAGIC 178 | # MAGIC Based on Databricks' [Visualizations in Databricks notebooks](https://docs.databricks.com/en/visualizations/index.html): 179 | # MAGIC 180 | # MAGIC * Databricks has built-in support for charts and visualizations in both Databricks SQL and in notebooks 181 | # MAGIC * To create a visualization, click `+` above the result and select Visualization 182 | # MAGIC * If you hover over the top right of a chart in the visualization editor, a Plotly toolbar appears with operations such as select, zoom, and pan 183 | # MAGIC * Click the downward pointing arrow at the right of the tab name for the following operations on a visualization: 184 | # MAGIC * Download 185 | # MAGIC * Remove 186 | # MAGIC * Duplicate 187 | # MAGIC * Rename 188 | # MAGIC * Add to dashboard 189 | # MAGIC * You can change the name of a visualization by clicking directly and editing the name in place 190 | # MAGIC * You can edit a visualization 191 | 192 | # COMMAND ---------- 193 | 194 | bikes = spark.read.csv("/databricks-datasets/bikeSharing/data-001/day.csv", header="true", inferSchema="true") 195 | display(bikes) 196 | --------------------------------------------------------------------------------