├── .gitignore ├── .vscode └── settings.json ├── Chapter01 ├── 1.1 read-csv-data.ipynb ├── 1.2 read-json-data.ipynb ├── 1.3 read-parquet-data.ipynb ├── 1.4 read-xml-data.ipynb ├── 1.5 work-with-nested-data.ipynb ├── 1.6 process-text-data.ipynb └── 1.7 write-data.ipynb ├── Chapter02 ├── 2.1 basic-transformations.ipynb ├── 2.2 filter-data.ipynb ├── 2.3 perform-joins.ipynb ├── 2.4 perform-aggregations.ipynb ├── 2.5 apply-window-functions.ipynb ├── 2.6 work-with-UDFs.ipynb └── 2.7 handle-nulls.ipynb ├── Chapter03 ├── 3.1 create-delta-tables.ipynb ├── 3.2 read-delta-tables.ipynb ├── 3.3 upsert-delta-tables.ipynb ├── 3.4 merge-delta-tables.ipynb ├── 3.5 change_data_capture_delta-tables.ipynb ├── 3.6 optimizing-delta-tables.ipynb ├── 3.7 time-travel-delta-tables.ipynb └── 3.8 manage-delta-tables.ipynb ├── Chapter04 ├── 4.0 events-gen-kafka.ipynb ├── 4.0 user-gen-kafka.ipynb ├── 4.1 config-streaming.ipynb ├── 4.2 connect-kafka-streaming.ipynb ├── 4.3 transform-filter-streaming.ipynb ├── 4.4 config-checkpoints.ipynb ├── 4.5 config-triggers.ipynb ├── 4.6 apply-window-aggregations.ipynb └── 4.7 handle-late-and-out-of-order-data.ipynb ├── Chapter05 ├── 5.0 events-gen-kafka.ipynb ├── 5.0 orders-gen-kafka.ipynb ├── 5.0 user-gen-kafka.ipynb ├── 5.1 delta-write-streaming.ipynb ├── 5.2 idempotent-stream-write-delta.ipynb ├── 5.3 merge-cdc-streaming.ipynb ├── 5.4 joining-stream-static-data.ipynb ├── 5.5 joining-stream-stream-data.ipynb └── 5.6 monitor-streams.ipynb ├── Chapter06 ├── 6.1 monitor-spark-ui.ipynb ├── 6.2 broadcast-variables.ipynb ├── 6.3 optimize-data-shuffles.ipynb ├── 6.4 avoid-data-skew.ipynb ├── 6.5 cache-and-persist.ipynb ├── 6.6 partitioning-and-repartitioning.ipynb └── 6.7 optimize-join-strategies.ipynb ├── Chapter07 ├── 7.1 optimize-table-partitions-delta.ipynb ├── 7.2 z-order-delta-tables.ipynb ├── 7.3 data-skipping-delta-tables.ipynb └── 7.4 compression-delta-tables.ipynb ├── Chapter08 ├── 8.1 building-databricks-workflow.yml ├── 8.4 conditional-branching.yml ├── Clean Up.sql ├── Data Preparation DLT.sql ├── Download Inventory Data.sql └── Setup.sql ├── Chapter09 ├── 9.1 create-medallion-arch-DLT.sql ├── 9.3 data-quality-and-validation.sql ├── 9.4 quarantine-bad-data-dlt.sql ├── 9.5 monitor-delta-live-table-pipelines.sql ├── 9.6 dlt-dabs-cicd │ ├── 9.6 create-medallion-arch-DLT.sql │ ├── databricks.yml │ └── dlt_dabs_cicd_pipeline.yml └── 9.7 apply-changes_into-dlt.sql ├── Chapter10 ├── 10.2 uc_object_hierarchy.sql ├── 10.4 tags_comments_metadata.sql ├── 10.5 filter_sensitive_data.sql ├── 10.6 lineage_view.sql └── 10.7 system_tables.sql ├── Chapter11 ├── 11.1 connect_to_git_repo.py ├── 11.3 using_databricks_sdk.py ├── 11.4 databricks_vscode_extension.py └── 11.5_databricks_asset_bundles │ └── dabs_cicd_example │ ├── .github │ └── workflows │ │ ├── deploy_to_prod_CD.yml │ │ └── deploy_to_qa_CI.yml │ ├── README.md │ ├── databricks.yml │ ├── fixtures │ └── .gitkeep │ ├── requirements-dev.txt │ ├── resources │ ├── dabs_cicd_example_job.yml │ └── dabs_cicd_example_pipeline.yml │ ├── scratch │ ├── README.md │ └── exploration.ipynb │ └── src │ ├── dlt_pipeline.ipynb │ └── notebook.ipynb ├── LICENSE ├── README.md ├── build.sh ├── data ├── Credit Card │ ├── CardBase.csv │ ├── CustomerBase.csv │ ├── FraudBase.csv │ └── TransactionBase.csv ├── Online_Retail.csv ├── Reviews.csv ├── Stanford Question Answering Dataset.json ├── netflix_titles.csv ├── netflix_titles_batch_2.csv ├── nobel_prizes.json ├── nobel_prizes.xml ├── partitioned_recipes │ ├── ._SUCCESS.crc │ ├── DatePublished=2019-01-01 │ │ 
├── .part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet.crc │ │ └── part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet │ ├── DatePublished=2019-01-02 │ │ ├── .part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet.crc │ │ └── part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet │ ├── DatePublished=2019-01-03 │ │ ├── .part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet.crc │ │ └── part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet │ ├── DatePublished=2019-01-04 │ │ ├── .part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet.crc │ │ └── part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet │ ├── DatePublished=2019-01-05 │ │ ├── .part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet.crc │ │ └── part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet │ ├── DatePublished=2019-01-06 │ │ ├── .part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet.crc │ │ └── part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet │ ├── DatePublished=2019-01-07 │ │ ├── .part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet.crc │ │ └── part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet │ ├── DatePublished=2019-01-08 │ │ ├── .part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet.crc │ │ └── part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet │ ├── DatePublished=2019-01-09 │ │ ├── .part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet.crc │ │ └── part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet │ ├── DatePublished=2019-01-10 │ │ ├── .part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet.crc │ │ └── part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet │ ├── DatePublished=2019-01-11 │ │ ├── .part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet.crc │ │ └── part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet │ ├── DatePublished=2019-01-12 │ │ ├── .part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet.crc │ │ └── part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet │ ├── DatePublished=2019-01-13 │ │ ├── .part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet.crc │ │ └── part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet │ ├── DatePublished=2019-01-14 │ │ ├── .part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet.crc │ │ └── part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet │ ├── DatePublished=2019-01-15 │ │ ├── .part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet.crc │ │ └── part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet │ ├── DatePublished=2019-01-16 │ │ ├── .part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet.crc │ │ └── part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet │ ├── DatePublished=2019-01-17 │ │ ├── .part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet.crc │ │ └── part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet │ ├── DatePublished=2019-01-18 │ │ ├── .part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet.crc │ │ └── part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet │ ├── DatePublished=2019-01-19 │ │ ├── .part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet.crc │ │ └── part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet │ ├── 
DatePublished=2019-01-20 │ │ ├── .part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet.crc │ │ └── part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet │ ├── DatePublished=2019-01-21 │ │ ├── .part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet.crc │ │ └── part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet │ ├── DatePublished=2019-01-22 │ │ ├── .part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet.crc │ │ └── part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet │ ├── DatePublished=2019-01-23 │ │ ├── .part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet.crc │ │ └── part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet │ ├── DatePublished=2019-01-24 │ │ ├── .part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet.crc │ │ └── part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet │ ├── DatePublished=2019-01-25 │ │ ├── .part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet.crc │ │ └── part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet │ ├── DatePublished=2019-01-26 │ │ ├── .part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet.crc │ │ └── part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet │ ├── DatePublished=2019-01-27 │ │ ├── .part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet.crc │ │ └── part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet │ ├── DatePublished=2019-01-28 │ │ ├── .part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet.crc │ │ └── part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet │ ├── DatePublished=2019-01-29 │ │ ├── .part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet.crc │ │ └── part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet │ ├── DatePublished=2019-01-30 │ │ ├── .part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet.crc │ │ └── part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet │ ├── DatePublished=2019-01-31 │ │ ├── .part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet.crc │ │ └── part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet │ ├── DatePublished=2020-01-01 │ │ ├── .part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet.crc │ │ └── part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet │ ├── DatePublished=2020-01-02 │ │ ├── .part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet.crc │ │ └── part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet │ ├── DatePublished=2020-01-03 │ │ ├── .part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet.crc │ │ └── part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet │ ├── DatePublished=2020-01-07 │ │ ├── .part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet.crc │ │ └── part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet │ ├── DatePublished=2020-01-09 │ │ ├── .part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet.crc │ │ └── part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet │ ├── DatePublished=2020-01-10 │ │ ├── .part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet.crc │ │ └── part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet │ ├── DatePublished=2020-01-13 │ │ ├── .part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet.crc │ │ └── 
part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet │ ├── DatePublished=2020-01-14 │ │ ├── .part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet.crc │ │ └── part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet │ ├── DatePublished=2020-01-16 │ │ ├── .part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet.crc │ │ └── part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet │ ├── DatePublished=2020-01-21 │ │ ├── .part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet.crc │ │ └── part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet │ ├── DatePublished=2020-01-23 │ │ ├── .part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet.crc │ │ └── part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet │ ├── DatePublished=2020-01-28 │ │ ├── .part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet.crc │ │ └── part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet │ ├── DatePublished=2020-01-29 │ │ ├── .part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet.crc │ │ └── part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet │ ├── DatePublished=2020-01-30 │ │ ├── .part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet.crc │ │ └── part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet │ ├── DatePublished=2020-01-31 │ │ ├── .part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet.crc │ │ └── part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet │ ├── DatePublished=2020-02-03 │ │ ├── .part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet.crc │ │ └── part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet │ ├── DatePublished=2020-02-04 │ │ ├── .part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet.crc │ │ └── part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet │ ├── DatePublished=2020-02-05 │ │ ├── .part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet.crc │ │ └── part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet │ ├── DatePublished=2020-02-06 │ │ ├── .part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet.crc │ │ └── part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet │ ├── DatePublished=2020-02-07 │ │ ├── .part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet.crc │ │ └── part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet │ ├── DatePublished=2020-02-10 │ │ ├── .part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet.crc │ │ └── part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet │ ├── DatePublished=2020-02-11 │ │ ├── .part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet.crc │ │ └── part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet │ ├── DatePublished=2020-02-13 │ │ ├── .part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet.crc │ │ └── part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet │ ├── DatePublished=2020-02-14 │ │ ├── .part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet.crc │ │ └── part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet │ ├── DatePublished=2020-02-16 │ │ ├── .part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet.crc │ │ └── part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet │ ├── DatePublished=2020-02-18 │ │ ├── 
.part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet.crc │ │ └── part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet │ ├── DatePublished=2020-02-20 │ │ ├── .part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet.crc │ │ └── part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet │ ├── DatePublished=2020-02-23 │ │ ├── .part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet.crc │ │ └── part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet │ ├── DatePublished=2020-02-24 │ │ ├── .part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet.crc │ │ └── part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet │ ├── DatePublished=2020-02-25 │ │ ├── .part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet.crc │ │ └── part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet │ ├── DatePublished=2020-02-27 │ │ ├── .part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet.crc │ │ └── part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet │ ├── DatePublished=2020-02-28 │ │ ├── .part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet.crc │ │ └── part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet │ └── _SUCCESS ├── recipes.parquet └── titles.csv ├── diagrams └── Chapter10 │ ├── unity_catalog_hierarchy.md │ └── unity_catalog_hierarchy.png ├── docker-compose.yml └── docker ├── base └── Dockerfile ├── jupyterlab ├── 00-first.py └── Dockerfile ├── spark-base └── Dockerfile ├── spark-master └── Dockerfile └── spark-worker └── Dockerfile /.gitignore: -------------------------------------------------------------------------------- 1 | 2 | # Created by https://www.toptal.com/developers/gitignore/api/spark,jupyternotebooks,pycharm,pycharm+iml,pycharm+all 3 | # Edit at https://www.toptal.com/developers/gitignore?templates=spark,jupyternotebooks,pycharm,pycharm+iml,pycharm+all 4 | 5 | ### JupyterNotebooks ### 6 | # gitignore template for Jupyter Notebooks 7 | # website: http://jupyter.org/ 8 | 9 | .vscode/* 10 | !.vscode/settings.json 11 | !.vscode/tasks.json 12 | !.vscode/launch.json 13 | !.vscode/extensions.json 14 | !.vscode/*.code-snippets 15 | 16 | # Local History for Visual Studio Code 17 | .history/ 18 | 19 | # Built Visual Studio Code Extensions 20 | *.vsix 21 | 22 | .databricks/ 23 | build/ 24 | dist/ 25 | __pycache__/ 26 | *.egg-info 27 | .venv/ 28 | scratch/** 29 | !scratch/README.md 30 | 31 | .ipynb_checkpoints 32 | */.ipynb_checkpoints/* 33 | 34 | .Trash* 35 | images/* 36 | data/data_lake/* 37 | data/delta_lake/* 38 | data/tmp/* 39 | # IPython 40 | profile_default/ 41 | ipython_config.py 42 | 43 | # Remove previous ipynb_checkpoints 44 | # git rm -r .ipynb_checkpoints/ 45 | 46 | ### PyCharm ### 47 | # Covers JetBrains IDEs: IntelliJ, RubyMine, PhpStorm, AppCode, PyCharm, CLion, Android Studio, WebStorm and Rider 48 | # Reference: https://intellij-support.jetbrains.com/hc/en-us/articles/206544839 49 | 50 | # User-specific stuff 51 | .idea/**/workspace.xml 52 | .idea/**/tasks.xml 53 | .idea/**/usage.statistics.xml 54 | .idea/**/dictionaries 55 | .idea/**/shelf 56 | 57 | # Generated files 58 | .idea/**/contentModel.xml 59 | 60 | # Sensitive or high-churn files 61 | .idea/**/dataSources/ 62 | .idea/**/dataSources.ids 63 | .idea/**/dataSources.local.xml 64 | .idea/**/sqlDataSources.xml 65 | .idea/**/dynamic.xml 66 | .idea/**/uiDesigner.xml 67 | .idea/**/dbnavigator.xml 68 | 69 | # Gradle 70 | .idea/**/gradle.xml 71 | 
.idea/**/libraries 72 | 73 | # Gradle and Maven with auto-import 74 | # When using Gradle or Maven with auto-import, you should exclude module files, 75 | # since they will be recreated, and may cause churn. Uncomment if using 76 | # auto-import. 77 | # .idea/artifacts 78 | # .idea/compiler.xml 79 | # .idea/jarRepositories.xml 80 | # .idea/modules.xml 81 | # .idea/*.iml 82 | # .idea/modules 83 | # *.iml 84 | # *.ipr 85 | 86 | # CMake 87 | cmake-build-*/ 88 | 89 | # Mongo Explorer plugin 90 | .idea/**/mongoSettings.xml 91 | 92 | # File-based project format 93 | *.iws 94 | 95 | # IntelliJ 96 | out/ 97 | 98 | # mpeltonen/sbt-idea plugin 99 | .idea_modules/ 100 | 101 | # JIRA plugin 102 | atlassian-ide-plugin.xml 103 | 104 | # Cursive Clojure plugin 105 | .idea/replstate.xml 106 | 107 | # Crashlytics plugin (for Android Studio and IntelliJ) 108 | com_crashlytics_export_strings.xml 109 | crashlytics.properties 110 | crashlytics-build.properties 111 | fabric.properties 112 | 113 | # Editor-based Rest Client 114 | .idea/httpRequests 115 | 116 | # Android studio 3.1+ serialized cache file 117 | .idea/caches/build_file_checksums.ser 118 | 119 | ### PyCharm Patch ### 120 | # Comment Reason: https://github.com/joeblau/gitignore.io/issues/186#issuecomment-215987721 121 | 122 | # *.iml 123 | # modules.xml 124 | # .idea/misc.xml 125 | # *.ipr 126 | 127 | # Sonarlint plugin 128 | .idea/**/sonarlint/ 129 | 130 | # SonarQube Plugin 131 | .idea/**/sonarIssues.xml 132 | 133 | # Markdown Navigator plugin 134 | .idea/**/markdown-navigator.xml 135 | .idea/**/markdown-navigator-enh.xml 136 | .idea/**/markdown-navigator/ 137 | 138 | # Cache file creation bug 139 | # See https://youtrack.jetbrains.com/issue/JBR-2257 140 | .idea/$CACHE_FILE$ 141 | 142 | ### PyCharm+all ### 143 | # Covers JetBrains IDEs: IntelliJ, RubyMine, PhpStorm, AppCode, PyCharm, CLion, Android Studio, WebStorm and Rider 144 | # Reference: https://intellij-support.jetbrains.com/hc/en-us/articles/206544839 145 | 146 | # User-specific stuff 147 | 148 | # Generated files 149 | 150 | # Sensitive or high-churn files 151 | 152 | # Gradle 153 | 154 | # Gradle and Maven with auto-import 155 | # When using Gradle or Maven with auto-import, you should exclude module files, 156 | # since they will be recreated, and may cause churn. Uncomment if using 157 | # auto-import. 
158 | # .idea/artifacts 159 | # .idea/compiler.xml 160 | # .idea/jarRepositories.xml 161 | # .idea/modules.xml 162 | # .idea/*.iml 163 | # .idea/modules 164 | # *.iml 165 | # *.ipr 166 | 167 | # CMake 168 | 169 | # Mongo Explorer plugin 170 | 171 | # File-based project format 172 | 173 | # IntelliJ 174 | 175 | # mpeltonen/sbt-idea plugin 176 | 177 | # JIRA plugin 178 | 179 | # Cursive Clojure plugin 180 | 181 | # Crashlytics plugin (for Android Studio and IntelliJ) 182 | 183 | # Editor-based Rest Client 184 | 185 | # Android studio 3.1+ serialized cache file 186 | 187 | ### PyCharm+all Patch ### 188 | # Ignores the whole .idea folder and all .iml files 189 | # See https://github.com/joeblau/gitignore.io/issues/186 and https://github.com/joeblau/gitignore.io/issues/360 190 | 191 | .idea/ 192 | 193 | # Reason: https://github.com/joeblau/gitignore.io/issues/186#issuecomment-249601023 194 | 195 | *.iml 196 | modules.xml 197 | .idea/misc.xml 198 | *.ipr 199 | 200 | # Sonarlint plugin 201 | .idea/sonarlint 202 | 203 | ### PyCharm+iml ### 204 | # Covers JetBrains IDEs: IntelliJ, RubyMine, PhpStorm, AppCode, PyCharm, CLion, Android Studio, WebStorm and Rider 205 | # Reference: https://intellij-support.jetbrains.com/hc/en-us/articles/206544839 206 | 207 | # User-specific stuff 208 | 209 | # Generated files 210 | 211 | # Sensitive or high-churn files 212 | 213 | # Gradle 214 | 215 | # Gradle and Maven with auto-import 216 | # When using Gradle or Maven with auto-import, you should exclude module files, 217 | # since they will be recreated, and may cause churn. Uncomment if using 218 | # auto-import. 219 | # .idea/artifacts 220 | # .idea/compiler.xml 221 | # .idea/jarRepositories.xml 222 | # .idea/modules.xml 223 | # .idea/*.iml 224 | # .idea/modules 225 | # *.iml 226 | # *.ipr 227 | 228 | # CMake 229 | 230 | # Mongo Explorer plugin 231 | 232 | # File-based project format 233 | 234 | # IntelliJ 235 | 236 | # mpeltonen/sbt-idea plugin 237 | 238 | # JIRA plugin 239 | 240 | # Cursive Clojure plugin 241 | 242 | # Crashlytics plugin (for Android Studio and IntelliJ) 243 | 244 | # Editor-based Rest Client 245 | 246 | # Android studio 3.1+ serialized cache file 247 | 248 | ### PyCharm+iml Patch ### 249 | # Reason: https://github.com/joeblau/gitignore.io/issues/186#issuecomment-249601023 250 | 251 | 252 | ### Spark ### 253 | *#*# 254 | *.#* 255 | *.pyc 256 | *.pyo 257 | *.swp 258 | *~ 259 | .DS_Store 260 | .cache 261 | .classpath 262 | .ensime 263 | .ensime_cache/ 264 | .ensime_lucene 265 | .generated-mima* 266 | .project 267 | .pydevproject 268 | .scala_dependencies 269 | .settings 270 | /lib/ 271 | R-unit-tests.log 272 | R/unit-tests.out 273 | R/cran-check.out 274 | R/pkg/vignettes/sparkr-vignettes.html 275 | R/pkg/tests/fulltests/Rplots.pdf 276 | build/*.jar 277 | build/apache-maven* 278 | build/scala* 279 | build/zinc* 280 | cache 281 | checkpoint 282 | conf/*.cmd 283 | conf/*.conf 284 | conf/*.properties 285 | conf/*.sh 286 | conf/*.xml 287 | conf/java-opts 288 | conf/slaves 289 | dependency-reduced-pom.xml 290 | derby.log 291 | dev/create-release/*final 292 | dev/create-release/*txt 293 | dev/pr-deps/ 294 | dist/ 295 | docs/_site 296 | docs/api 297 | sql/docs 298 | sql/site 299 | lib_managed/ 300 | lint-r-report.log 301 | log/ 302 | logs/ 303 | project/boot/ 304 | project/build/target/ 305 | project/plugins/lib_managed/ 306 | project/plugins/project/build.properties 307 | project/plugins/src_managed/ 308 | project/plugins/target/ 309 | python/lib/pyspark.zip 310 | python/deps 311 | 
python/test_coverage/coverage_data 312 | python/test_coverage/htmlcov 313 | python/pyspark/python 314 | reports/ 315 | scalastyle-on-compile.generated.xml 316 | scalastyle-output.xml 317 | scalastyle.txt 318 | spark-*-bin-*.tgz 319 | spark-tests.log 320 | src_managed/ 321 | streaming-tests.log 322 | target/ 323 | unit-tests.log 324 | work/ 325 | docs/.jekyll-metadata 326 | 327 | # For Hive 328 | TempStatsStore/ 329 | metastore/ 330 | metastore_db/ 331 | sql/hive-thriftserver/test_warehouses 332 | warehouse/ 333 | spark-warehouse/ 334 | 335 | # For R session data 336 | .RData 337 | .RHistory 338 | .Rhistory 339 | *.Rproj 340 | *.Rproj.* 341 | 342 | .Rproj.user 343 | 344 | # For SBT 345 | .jvmopts 346 | 347 | 348 | # End of https://www.toptal.com/developers/gitignore/api/spark,jupyternotebooks,pycharm,pycharm+iml,pycharm+all -------------------------------------------------------------------------------- /.vscode/settings.json: -------------------------------------------------------------------------------- 1 | { 2 | "python.envFile": "${workspaceFolder}/.databricks/.databricks.env", 3 | "databricks.python.envFile": "${workspaceFolder}/.env", 4 | "jupyter.interactiveWindow.cellMarker.codeRegex": "^# COMMAND ----------|^# Databricks notebook source|^(#\\s*%%|#\\s*\\|#\\s*In\\[\\d*?\\]|#\\s*In\\[ \\])", 5 | "jupyter.interactiveWindow.cellMarker.default": "# COMMAND ----------" 6 | } 7 | -------------------------------------------------------------------------------- /Chapter01/1.7 write-data.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "id": "f6e19347", 7 | "metadata": { 8 | "tags": [] 9 | }, 10 | "outputs": [ 11 | { 12 | "name": "stderr", 13 | "output_type": "stream", 14 | "text": [ 15 | "Setting default log level to \"WARN\".\n", 16 | "To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).\n" 17 | ] 18 | }, 19 | { 20 | "name": "stdout", 21 | "output_type": "stream", 22 | "text": [ 23 | "23/05/19 12:19:40 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... 
using builtin-java classes where applicable\n" 24 | ] 25 | } 26 | ], 27 | "source": [ 28 | "from pyspark.sql import SparkSession\n", 29 | "\n", 30 | "spark = (SparkSession.builder\n", 31 | " .appName(\"write-data\")\n", 32 | " .master(\"spark://spark-master:7077\")\n", 33 | " .config(\"spark.executor.memory\", \"512m\")\n", 34 | " .getOrCreate())\n", 35 | "\n", 36 | "spark.sparkContext.setLogLevel(\"ERROR\")" 37 | ] 38 | }, 39 | { 40 | "cell_type": "code", 41 | "execution_count": 3, 42 | "id": "b20b7da2-b027-478b-aa5a-377277fd12f9", 43 | "metadata": { 44 | "tags": [] 45 | }, 46 | "outputs": [ 47 | { 48 | "name": "stderr", 49 | "output_type": "stream", 50 | "text": [ 51 | " \r" 52 | ] 53 | } 54 | ], 55 | "source": [ 56 | "from pyspark.sql.types import StructType, StructField, StringType, IntegerType, DateType\n", 57 | "\n", 58 | "df = (spark.read.format(\"csv\")\n", 59 | " .option(\"header\", \"true\")\n", 60 | " .option(\"nullValue\", \"null\")\n", 61 | " .option(\"dateFormat\", \"LLLL d, y\")\n", 62 | " .load(\"../data/netflix_titles.csv\"))" 63 | ] 64 | }, 65 | { 66 | "cell_type": "code", 67 | "execution_count": 7, 68 | "id": "e653265c-a151-4917-82ab-ccf8df808fcd", 69 | "metadata": { 70 | "tags": [] 71 | }, 72 | "outputs": [ 73 | { 74 | "name": "stderr", 75 | "output_type": "stream", 76 | "text": [ 77 | " \r" 78 | ] 79 | } 80 | ], 81 | "source": [ 82 | "(df.write.format(\"csv\")\n", 83 | " .option(\"header\", \"true\") \n", 84 | " .mode(\"overwrite\")\n", 85 | " .option(\"delimiter\", \",\")\n", 86 | " .save(\"../data/data_lake/netflix_csv_data\"))" 87 | ] 88 | }, 89 | { 90 | "cell_type": "code", 91 | "execution_count": 8, 92 | "id": "353d95f3-0f85-4802-b8b1-7ae6ef4250f3", 93 | "metadata": { 94 | "tags": [] 95 | }, 96 | "outputs": [ 97 | { 98 | "name": "stderr", 99 | "output_type": "stream", 100 | "text": [ 101 | " \r" 102 | ] 103 | } 104 | ], 105 | "source": [ 106 | "(df.write.format(\"json\") \n", 107 | " .mode(\"overwrite\") \n", 108 | " .save(\"../data/data_lake/netflix_json_data\"))" 109 | ] 110 | }, 111 | { 112 | "cell_type": "code", 113 | "execution_count": 9, 114 | "id": "2d409bc4-41ec-4c60-a5e9-36cec7bc7354", 115 | "metadata": { 116 | "tags": [] 117 | }, 118 | "outputs": [ 119 | { 120 | "name": "stderr", 121 | "output_type": "stream", 122 | "text": [ 123 | " \r" 124 | ] 125 | } 126 | ], 127 | "source": [ 128 | "(df.write.format(\"parquet\") \n", 129 | " .mode(\"overwrite\") \n", 130 | " .save(\"../data/data_lake/netflix_parquet_data\"))" 131 | ] 132 | }, 133 | { 134 | "cell_type": "markdown", 135 | "id": "5ab23f75-9f3d-4f61-ae7c-88894fc1c89d", 136 | "metadata": {}, 137 | "source": [ 138 | "### Write Compressed Data" 139 | ] 140 | }, 141 | { 142 | "cell_type": "code", 143 | "execution_count": 12, 144 | "id": "8c15d6d9-da07-4f51-a26e-d1e37c935505", 145 | "metadata": { 146 | "tags": [] 147 | }, 148 | "outputs": [ 149 | { 150 | "name": "stderr", 151 | "output_type": "stream", 152 | "text": [ 153 | " \r" 154 | ] 155 | } 156 | ], 157 | "source": [ 158 | "(df.write\n", 159 | " .format(\"csv\")\n", 160 | " .mode(\"overwrite\") \n", 161 | " .option(\"header\", \"true\")\n", 162 | " .option(\"delimiter\", \",\")\n", 163 | " .option(\"codec\", \"org.apache.hadoop.io.compress.GzipCodec\")\n", 164 | " .save(\"../data/data_lake/netflix_csv_data.gz\"))" 165 | ] 166 | }, 167 | { 168 | "cell_type": "markdown", 169 | "id": "3fda5914-2979-4064-a0ad-58e8e63515de", 170 | "metadata": {}, 171 | "source": [ 172 | "### Specify the Number of Partitions" 173 | ] 174 | }, 175 | { 176 | "cell_type": "code", 
177 | "execution_count": 14, 178 | "id": "1d842078-d65b-4337-829a-d282a9705ebe", 179 | "metadata": { 180 | "tags": [] 181 | }, 182 | "outputs": [ 183 | { 184 | "name": "stderr", 185 | "output_type": "stream", 186 | "text": [ 187 | " \r" 188 | ] 189 | } 190 | ], 191 | "source": [ 192 | "(df.repartition(4) \n", 193 | " .write.format(\"csv\") \n", 194 | " .mode(\"overwrite\") \n", 195 | " .option(\"header\", \"true\") \n", 196 | " .option(\"delimiter\", \",\") \n", 197 | " .save(\"../data/data_lake/netflix_csv_data_4_part\")) " 198 | ] 199 | }, 200 | { 201 | "cell_type": "markdown", 202 | "id": "a81d1655-d1f2-433b-be5c-21d793b92cb4", 203 | "metadata": {}, 204 | "source": [ 205 | "### Use `coalesce()` to Reduce number fo Partitions" 206 | ] 207 | }, 208 | { 209 | "cell_type": "code", 210 | "execution_count": 15, 211 | "id": "6f3b93ee-33cc-48df-9869-d825c85e9c4f", 212 | "metadata": { 213 | "tags": [] 214 | }, 215 | "outputs": [ 216 | { 217 | "name": "stderr", 218 | "output_type": "stream", 219 | "text": [ 220 | " \r" 221 | ] 222 | } 223 | ], 224 | "source": [ 225 | "(df.coalesce(1)\n", 226 | " .write.format(\"csv\")\n", 227 | " .mode(\"overwrite\") \n", 228 | " .option(\"header\", \"true\")\n", 229 | " .option(\"delimiter\", \",\")\n", 230 | " .save(\"../data/data_lake/netflix_csv_data_whole\"))" 231 | ] 232 | }, 233 | { 234 | "cell_type": "markdown", 235 | "id": "5c472365-efe7-4e0c-83fe-5cd6cf0fd16c", 236 | "metadata": {}, 237 | "source": [ 238 | "### Use `partitionBy()` to write partitions based on a column" 239 | ] 240 | }, 241 | { 242 | "cell_type": "code", 243 | "execution_count": 16, 244 | "id": "dc9cc49d-4f32-47ee-80f9-3a723aabcc90", 245 | "metadata": { 246 | "tags": [] 247 | }, 248 | "outputs": [ 249 | { 250 | "name": "stderr", 251 | "output_type": "stream", 252 | "text": [ 253 | " \r" 254 | ] 255 | } 256 | ], 257 | "source": [ 258 | "# partition the CSV data by the 'release_year' column\n", 259 | "(df.write.format('csv')\n", 260 | " .option('header', 'true')\n", 261 | " .option('delimiter', ',')\n", 262 | " .mode('overwrite')\n", 263 | " .partitionBy('release_year')\n", 264 | " .save(\"../data/data_lake/netflix_csv_data_partitioned\"))" 265 | ] 266 | }, 267 | { 268 | "cell_type": "code", 269 | "execution_count": 17, 270 | "id": "a4e7e70c-23e8-468e-81c9-7f57640aa511", 271 | "metadata": { 272 | "tags": [] 273 | }, 274 | "outputs": [], 275 | "source": [ 276 | "spark.stop()" 277 | ] 278 | } 279 | ], 280 | "metadata": { 281 | "kernelspec": { 282 | "display_name": "Python 3 (ipykernel)", 283 | "language": "python", 284 | "name": "python3" 285 | }, 286 | "language_info": { 287 | "codemirror_mode": { 288 | "name": "ipython", 289 | "version": 3 290 | }, 291 | "file_extension": ".py", 292 | "mimetype": "text/x-python", 293 | "name": "python", 294 | "nbconvert_exporter": "python", 295 | "pygments_lexer": "ipython3", 296 | "version": "3.10.6" 297 | }, 298 | "vscode": { 299 | "interpreter": { 300 | "hash": "916dbcbb3f70747c44a77c7bcd40155683ae19c65e1c03b4aa3499c5328201f1" 301 | } 302 | } 303 | }, 304 | "nbformat": 4, 305 | "nbformat_minor": 5 306 | } 307 | -------------------------------------------------------------------------------- /Chapter03/3.1 create-delta-tables.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 9, 6 | "id": "54f45c54-0b95-4fd9-a180-fe3be96ab99d", 7 | "metadata": {}, 8 | "outputs": [], 9 | "source": [ 10 | "from delta import configure_spark_with_delta_pip, 
DeltaTable\n", 11 | "from pyspark.sql import SparkSession" 12 | ] 13 | }, 14 | { 15 | "cell_type": "code", 16 | "execution_count": 10, 17 | "id": "e7850eab-3759-491d-a70a-7a02977db101", 18 | "metadata": {}, 19 | "outputs": [], 20 | "source": [ 21 | "builder = (SparkSession.builder\n", 22 | " .appName(\"create-delta-table\")\n", 23 | " .master(\"spark://spark-master:7077\")\n", 24 | " .config(\"spark.executor.memory\", \"512m\") \n", 25 | " .config(\"spark.sql.extensions\", \"io.delta.sql.DeltaSparkSessionExtension\")\n", 26 | " .config(\"spark.sql.catalog.spark_catalog\", \"org.apache.spark.sql.delta.catalog.DeltaCatalog\"))\n", 27 | "\n", 28 | "spark = configure_spark_with_delta_pip(builder).getOrCreate()\n", 29 | "spark.sparkContext.setLogLevel(\"ERROR\")" 30 | ] 31 | }, 32 | { 33 | "cell_type": "code", 34 | "execution_count": 11, 35 | "id": "5786a610-0463-45af-a229-1cfb8181cad8", 36 | "metadata": {}, 37 | "outputs": [ 38 | { 39 | "name": "stdout", 40 | "output_type": "stream", 41 | "text": [ 42 | "The sparksql_magic extension is already loaded. To reload it, use:\n", 43 | " %reload_ext sparksql_magic\n" 44 | ] 45 | } 46 | ], 47 | "source": [ 48 | "get_ipython().run_line_magic('load_ext', 'sparksql_magic')\n", 49 | "get_ipython().run_line_magic('config', 'SparkSql.limit=20')" 50 | ] 51 | }, 52 | { 53 | "cell_type": "code", 54 | "execution_count": 12, 55 | "id": "228e8f56-4fd9-497a-947f-8ce968889dc7", 56 | "metadata": { 57 | "tags": [] 58 | }, 59 | "outputs": [ 60 | { 61 | "name": "stderr", 62 | "output_type": "stream", 63 | "text": [ 64 | " \r" 65 | ] 66 | }, 67 | { 68 | "data": { 69 | "text/html": [ 70 | "
" 71 | ], 72 | "text/plain": [ 73 | "" 74 | ] 75 | }, 76 | "execution_count": 12, 77 | "metadata": {}, 78 | "output_type": "execute_result" 79 | } 80 | ], 81 | "source": [ 82 | "%%sparksql\n", 83 | "CREATE OR REPLACE TABLE default.netflix_titles (\n", 84 | " show_id STRING,\n", 85 | " type STRING,\n", 86 | " title STRING,\n", 87 | " director STRING,\n", 88 | " cast STRING,\n", 89 | " country STRING,\n", 90 | " date_added STRING,\n", 91 | " release_year STRING,\n", 92 | " rating STRING,\n", 93 | " duration STRING,\n", 94 | " listed_in STRING,\n", 95 | " description STRING \n", 96 | ") USING DELTA LOCATION '/opt/workspace/data/delta_lake/netflix_titles';" 97 | ] 98 | }, 99 | { 100 | "cell_type": "code", 101 | "execution_count": 13, 102 | "id": "a0e26449-e04a-4cf5-a42f-5594943bd3fa", 103 | "metadata": { 104 | "tags": [] 105 | }, 106 | "outputs": [], 107 | "source": [ 108 | "# Read CSV file into a DataFrame\n", 109 | "df = (spark.read\n", 110 | " .format(\"csv\")\n", 111 | " .option(\"header\", \"true\")\n", 112 | " .load(\"../data/netflix_titles.csv\"))" 113 | ] 114 | }, 115 | { 116 | "cell_type": "code", 117 | "execution_count": 14, 118 | "id": "f8d3d68c-c3b0-4a8a-91b0-32366f2199fc", 119 | "metadata": { 120 | "tags": [] 121 | }, 122 | "outputs": [ 123 | { 124 | "name": "stdout", 125 | "output_type": "stream", 126 | "text": [ 127 | "root\n", 128 | " |-- show_id: string (nullable = true)\n", 129 | " |-- type: string (nullable = true)\n", 130 | " |-- title: string (nullable = true)\n", 131 | " |-- director: string (nullable = true)\n", 132 | " |-- cast: string (nullable = true)\n", 133 | " |-- country: string (nullable = true)\n", 134 | " |-- date_added: string (nullable = true)\n", 135 | " |-- release_year: string (nullable = true)\n", 136 | " |-- rating: string (nullable = true)\n", 137 | " |-- duration: string (nullable = true)\n", 138 | " |-- listed_in: string (nullable = true)\n", 139 | " |-- description: string (nullable = true)\n", 140 | "\n" 141 | ] 142 | } 143 | ], 144 | "source": [ 145 | "df.printSchema()" 146 | ] 147 | }, 148 | { 149 | "cell_type": "code", 150 | "execution_count": 15, 151 | "id": "5c447d82-f3f3-4b78-98b9-cb8e1e79512c", 152 | "metadata": { 153 | "tags": [] 154 | }, 155 | "outputs": [ 156 | { 157 | "name": "stderr", 158 | "output_type": "stream", 159 | "text": [ 160 | " \r" 161 | ] 162 | } 163 | ], 164 | "source": [ 165 | "df.write.format(\"delta\").mode(\"overwrite\").saveAsTable(\"default.netflix_titles\")" 166 | ] 167 | }, 168 | { 169 | "cell_type": "code", 170 | "execution_count": 16, 171 | "id": "23e92c59-b032-49be-8081-471c3dfa79e3", 172 | "metadata": {}, 173 | "outputs": [ 174 | { 175 | "name": "stderr", 176 | "output_type": "stream", 177 | "text": [ 178 | " \r" 179 | ] 180 | }, 181 | { 182 | "data": { 183 | "text/html": [ 184 | "
show_id | type | title | director | cast | country | date_added | release_year | rating | duration | listed_in | description
s1 | Movie | Dick Johnson Is Dead | Kirsten Johnson | null | United States | September 25, 2021 | 2020 | PG-13 | 90 min | Documentaries | As her father nears the end of his life, filmmaker Kirsten Johnson stages his death in inventive and comical ways to help them both face the inevitable.
s2 | TV Show | Blood & Water | null | Ama Qamata, Khosi Ngema, Gail Mabalane, Thabang Molaba, Dillon Windvogel, Natasha Thahane, Arno Greeff, Xolile Tshabalala, Getmore Sithole, Cindy Mahlangu, Ryle De Morny, Greteli Fincham, Sello Maake Ka-Ncube, Odwa Gwanya, Mekaila Mathys, Sandi Schultz, Duane Williams, Shamilla Miller, Patrick Mofokeng | South Africa | September 24, 2021 | 2021 | TV-MA | 2 Seasons | International TV Shows, TV Dramas, TV Mysteries | After crossing paths at a party, a Cape Town teen sets out to prove whether a private-school swimming star is her sister who was abducted at birth.
s3 | TV Show | Ganglands | Julien Leclercq | Sami Bouajila, Tracy Gotoas, Samuel Jouy, Nabiha Akkari, Sofia Lesaffre, Salim Kechiouche, Noureddine Farihi, Geert Van Rampelberg, Bakary Diombera | null | September 24, 2021 | 2021 | TV-MA | 1 Season | Crime TV Shows, International TV Shows, TV Action & Adventure | To protect his family from a powerful drug lord, skilled thief Mehdi and his expert team of robbers are pulled into a violent and deadly turf war.
" 185 | ], 186 | "text/plain": [ 187 | "" 188 | ] 189 | }, 190 | "execution_count": 16, 191 | "metadata": {}, 192 | "output_type": "execute_result" 193 | } 194 | ], 195 | "source": [ 196 | "%%sparksql \n", 197 | "SELECT * FROM default.netflix_titles LIMIT 3;" 198 | ] 199 | }, 200 | { 201 | "cell_type": "code", 202 | "execution_count": 17, 203 | "id": "bbdd7547-1607-4aa8-9e44-9e82f7356ef4", 204 | "metadata": {}, 205 | "outputs": [], 206 | "source": [ 207 | "spark.stop()" 208 | ] 209 | }, 210 | { 211 | "cell_type": "code", 212 | "execution_count": null, 213 | "id": "bcf3399a-fd4b-4f7e-98cc-658629165132", 214 | "metadata": {}, 215 | "outputs": [], 216 | "source": [] 217 | } 218 | ], 219 | "metadata": { 220 | "kernelspec": { 221 | "display_name": "Python 3 (ipykernel)", 222 | "language": "python", 223 | "name": "python3" 224 | }, 225 | "language_info": { 226 | "codemirror_mode": { 227 | "name": "ipython", 228 | "version": 3 229 | }, 230 | "file_extension": ".py", 231 | "mimetype": "text/x-python", 232 | "name": "python", 233 | "nbconvert_exporter": "python", 234 | "pygments_lexer": "ipython3", 235 | "version": "3.10.12" 236 | } 237 | }, 238 | "nbformat": 4, 239 | "nbformat_minor": 5 240 | } 241 | -------------------------------------------------------------------------------- /Chapter04/4.0 events-gen-kafka.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 2, 6 | "id": "8393e835-14fb-4aa3-833f-d60aa5464018", 7 | "metadata": {}, 8 | "outputs": [ 9 | { 10 | "name": "stdout", 11 | "output_type": "stream", 12 | "text": [ 13 | "{'user_id': 94, 'event_type': 'click', 'event_time': '02/04/2024, 18:14:31', 'processing_time': '02/04/2024, 18:14:47'}\n", 14 | "{'user_id': 71, 'event_type': 'like', 'event_time': '02/04/2024, 18:14:55', 'processing_time': '02/04/2024, 18:14:57'}\n", 15 | "{'user_id': 75, 'event_type': 'share', 'event_time': '02/04/2024, 18:15:04', 'processing_time': '02/04/2024, 18:15:07'}\n", 16 | "{'user_id': 85, 'event_type': 'purchase', 'event_time': '02/04/2024, 18:14:55', 'processing_time': '02/04/2024, 18:15:17'}\n", 17 | "{'user_id': 87, 'event_type': 'share', 'event_time': '02/04/2024, 18:15:07', 'processing_time': '02/04/2024, 18:15:27'}\n", 18 | "{'user_id': 16, 'event_type': 'like', 'event_time': '02/04/2024, 18:15:24', 'processing_time': '02/04/2024, 18:15:37'}\n", 19 | "{'user_id': 2, 'event_type': 'purchase', 'event_time': '02/04/2024, 18:15:44', 'processing_time': '02/04/2024, 18:15:47'}\n", 20 | "{'user_id': 19, 'event_type': 'share', 'event_time': '02/04/2024, 18:15:51', 'processing_time': '02/04/2024, 18:15:57'}\n", 21 | "{'user_id': 82, 'event_type': 'view', 'event_time': '02/04/2024, 18:16:01', 'processing_time': '02/04/2024, 18:16:07'}\n", 22 | "{'user_id': 1, 'event_type': 'like', 'event_time': '02/04/2024, 18:16:06', 'processing_time': '02/04/2024, 18:16:17'}\n", 23 | "{'user_id': 45, 'event_type': 'view', 'event_time': '02/04/2024, 18:16:17', 'processing_time': '02/04/2024, 18:16:27'}\n", 24 | "{'user_id': 57, 'event_type': 'click', 'event_time': '02/04/2024, 18:16:24', 'processing_time': '02/04/2024, 18:16:37'}\n", 25 | "{'user_id': 13, 'event_type': 'click', 'event_time': '02/04/2024, 18:16:37', 'processing_time': '02/04/2024, 18:16:47'}\n", 26 | "{'user_id': 96, 'event_type': 'click', 'event_time': '02/04/2024, 18:16:56', 'processing_time': '02/04/2024, 18:16:57'}\n" 27 | ] 28 | }, 29 | { 30 | "ename": "KeyboardInterrupt", 31 | "evalue": "", 32 | 
"output_type": "error", 33 | "traceback": [ 34 | "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", 35 | "\u001b[0;31mKeyboardInterrupt\u001b[0m Traceback (most recent call last)", 36 | "Cell \u001b[0;32mIn[2], line 40\u001b[0m\n\u001b[1;32m 37\u001b[0m \u001b[38;5;66;03m# Send the event to the Kafka topic\u001b[39;00m\n\u001b[1;32m 39\u001b[0m producer\u001b[38;5;241m.\u001b[39msend(topic, value\u001b[38;5;241m=\u001b[39mjson\u001b[38;5;241m.\u001b[39mdumps(event)\u001b[38;5;241m.\u001b[39mencode(\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mutf-8\u001b[39m\u001b[38;5;124m'\u001b[39m))\n\u001b[0;32m---> 40\u001b[0m \u001b[43mtime\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43msleep\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m10\u001b[39;49m\u001b[43m)\u001b[49m\n", 37 | "\u001b[0;31mKeyboardInterrupt\u001b[0m: " 38 | ] 39 | } 40 | ], 41 | "source": [ 42 | "# Import the required modules\n", 43 | "import random\n", 44 | "import json\n", 45 | "from kafka import KafkaProducer\n", 46 | "import time\n", 47 | "\n", 48 | "# using datetime module\n", 49 | "import datetime;\n", 50 | "\n", 51 | "# Define the bootstrap servers and the topic name\n", 52 | "bootstrap_servers = \"kafka:9092\"\n", 53 | "topic = \"events\"\n", 54 | "\n", 55 | "# Create a Kafka producer with JSON value serializer\n", 56 | "producer = KafkaProducer(bootstrap_servers=bootstrap_servers)\n", 57 | "\n", 58 | "# Define a function to generate random event data\n", 59 | "def generate_event():\n", 60 | " # Generate a random user id from 1 to 100\n", 61 | " current_time = time.time()\n", 62 | " user_id = random.randint(1, 100)\n", 63 | " # Generate a random event type from a list of options\n", 64 | " event_type = random.choice([\"click\", \"view\", \"purchase\", \"like\", \"share\"])\n", 65 | " # Generate a random event time from 0 to 9999\n", 66 | " event_time = datetime.datetime.fromtimestamp(current_time- abs(random.normalvariate(0, 10))).strftime(\"%m/%d/%Y, %H:%M:%S\")\n", 67 | " # Generate a random event time from 0 to 9999\n", 68 | " processing_time =datetime.datetime.fromtimestamp(current_time).strftime(\"%m/%d/%Y, %H:%M:%S\")\n", 69 | " # Return a dictionary with the event data\n", 70 | " return {\"user_id\": user_id, \"event_type\": event_type, \"event_time\": event_time, \"processing_time\": processing_time}\n", 71 | "\n", 72 | "# Loop to generate and send events\n", 73 | "while True:\n", 74 | " # Generate a random event\n", 75 | " event = generate_event()\n", 76 | " # Print the event to the console\n", 77 | " print(event)\n", 78 | " # Send the event to the Kafka topic\n", 79 | " \n", 80 | " producer.send(topic, value=json.dumps(event).encode('utf-8'))\n", 81 | " time.sleep(10)\n" 82 | ] 83 | }, 84 | { 85 | "cell_type": "markdown", 86 | "id": "87081f46-4053-4df2-b262-aec86df50970", 87 | "metadata": { 88 | "jp-MarkdownHeadingCollapsed": true 89 | }, 90 | "source": [ 91 | "### " 92 | ] 93 | } 94 | ], 95 | "metadata": { 96 | "kernelspec": { 97 | "display_name": "Python 3 (ipykernel)", 98 | "language": "python", 99 | "name": "python3" 100 | }, 101 | "language_info": { 102 | "codemirror_mode": { 103 | "name": "ipython", 104 | "version": 3 105 | }, 106 | "file_extension": ".py", 107 | "mimetype": "text/x-python", 108 | "name": "python", 109 | "nbconvert_exporter": "python", 110 | "pygments_lexer": "ipython3", 111 | "version": "3.10.12" 112 | } 113 | }, 114 | "nbformat": 4, 115 | "nbformat_minor": 5 116 | } 117 | 
-------------------------------------------------------------------------------- /Chapter04/4.0 user-gen-kafka.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "id": "8393e835-14fb-4aa3-833f-d60aa5464018", 7 | "metadata": {}, 8 | "outputs": [ 9 | { 10 | "name": "stdout", 11 | "output_type": "stream", 12 | "text": [ 13 | "{'id': 46, 'name': 'user87', 'age': 56, 'gender': 'F', 'country': 'India', 'timestamp': '02/04/2024, 17:57:37'}\n", 14 | "{'id': 43, 'name': 'user54', 'age': 45, 'gender': 'F', 'country': 'Brazil', 'timestamp': '02/04/2024, 17:57:47'}\n", 15 | "{'id': 64, 'name': 'user43', 'age': 49, 'gender': 'M', 'country': 'Brazil', 'timestamp': '02/04/2024, 17:57:57'}\n", 16 | "{'id': 60, 'name': 'user8', 'age': 48, 'gender': 'F', 'country': 'USA', 'timestamp': '02/04/2024, 17:58:07'}\n", 17 | "{'id': 12, 'name': 'user41', 'age': 29, 'gender': 'F', 'country': 'Canada', 'timestamp': '02/04/2024, 17:58:17'}\n", 18 | "{'id': 12, 'name': 'user78', 'age': 28, 'gender': 'F', 'country': 'China', 'timestamp': '02/04/2024, 17:58:27'}\n", 19 | "{'id': 63, 'name': 'user39', 'age': 57, 'gender': 'M', 'country': 'Australia', 'timestamp': '02/04/2024, 17:58:37'}\n", 20 | "{'id': 32, 'name': 'user23', 'age': 54, 'gender': 'M', 'country': 'China', 'timestamp': '02/04/2024, 17:58:47'}\n", 21 | "{'id': 32, 'name': 'user53', 'age': 19, 'gender': 'M', 'country': 'USA', 'timestamp': '02/04/2024, 17:58:57'}\n", 22 | "{'id': 58, 'name': 'user52', 'age': 48, 'gender': 'F', 'country': 'India', 'timestamp': '02/04/2024, 17:59:07'}\n", 23 | "{'id': 65, 'name': 'user65', 'age': 44, 'gender': 'F', 'country': 'Australia', 'timestamp': '02/04/2024, 17:59:17'}\n", 24 | "{'id': 60, 'name': 'user20', 'age': 33, 'gender': 'F', 'country': 'USA', 'timestamp': '02/04/2024, 17:59:27'}\n", 25 | "{'id': 71, 'name': 'user61', 'age': 26, 'gender': 'F', 'country': 'Canada', 'timestamp': '02/04/2024, 17:59:37'}\n", 26 | "{'id': 69, 'name': 'user74', 'age': 28, 'gender': 'F', 'country': 'India', 'timestamp': '02/04/2024, 17:59:47'}\n", 27 | "{'id': 85, 'name': 'user29', 'age': 56, 'gender': 'F', 'country': 'Brazil', 'timestamp': '02/04/2024, 17:59:57'}\n", 28 | "{'id': 90, 'name': 'user18', 'age': 29, 'gender': 'F', 'country': 'UK', 'timestamp': '02/04/2024, 18:00:07'}\n", 29 | "{'id': 52, 'name': 'user15', 'age': 36, 'gender': 'M', 'country': 'UK', 'timestamp': '02/04/2024, 18:00:17'}\n", 30 | "{'id': 60, 'name': 'user59', 'age': 41, 'gender': 'F', 'country': 'Australia', 'timestamp': '02/04/2024, 18:00:27'}\n", 31 | "{'id': 99, 'name': 'user19', 'age': 18, 'gender': 'M', 'country': 'Canada', 'timestamp': '02/04/2024, 18:00:37'}\n", 32 | "{'id': 95, 'name': 'user62', 'age': 63, 'gender': 'M', 'country': 'USA', 'timestamp': '02/04/2024, 18:00:47'}\n", 33 | "{'id': 9, 'name': 'user14', 'age': 46, 'gender': 'F', 'country': 'Canada', 'timestamp': '02/04/2024, 18:00:57'}\n", 34 | "{'id': 79, 'name': 'user90', 'age': 22, 'gender': 'M', 'country': 'India', 'timestamp': '02/04/2024, 18:01:07'}\n", 35 | "{'id': 8, 'name': 'user54', 'age': 25, 'gender': 'M', 'country': 'India', 'timestamp': '02/04/2024, 18:01:17'}\n", 36 | "{'id': 18, 'name': 'user44', 'age': 25, 'gender': 'F', 'country': 'China', 'timestamp': '02/04/2024, 18:01:27'}\n", 37 | "{'id': 64, 'name': 'user92', 'age': 63, 'gender': 'F', 'country': 'Australia', 'timestamp': '02/04/2024, 18:01:37'}\n", 38 | "{'id': 13, 'name': 'user9', 'age': 60, 'gender': 
'M', 'country': 'Canada', 'timestamp': '02/04/2024, 18:01:47'}\n", 39 | "{'id': 63, 'name': 'user91', 'age': 45, 'gender': 'M', 'country': 'China', 'timestamp': '02/04/2024, 18:01:57'}\n", 40 | "{'id': 29, 'name': 'user34', 'age': 61, 'gender': 'F', 'country': 'UK', 'timestamp': '02/04/2024, 18:02:07'}\n", 41 | "{'id': 48, 'name': 'user93', 'age': 58, 'gender': 'M', 'country': 'Canada', 'timestamp': '02/04/2024, 18:02:17'}\n", 42 | "{'id': 26, 'name': 'user9', 'age': 57, 'gender': 'M', 'country': 'India', 'timestamp': '02/04/2024, 18:02:27'}\n", 43 | "{'id': 97, 'name': 'user94', 'age': 44, 'gender': 'F', 'country': 'Brazil', 'timestamp': '02/04/2024, 18:02:37'}\n", 44 | "{'id': 86, 'name': 'user68', 'age': 44, 'gender': 'F', 'country': 'UK', 'timestamp': '02/04/2024, 18:02:47'}\n", 45 | "{'id': 47, 'name': 'user45', 'age': 20, 'gender': 'M', 'country': 'Canada', 'timestamp': '02/04/2024, 18:02:57'}\n", 46 | "{'id': 41, 'name': 'user59', 'age': 47, 'gender': 'M', 'country': 'Canada', 'timestamp': '02/04/2024, 18:03:07'}\n", 47 | "{'id': 59, 'name': 'user79', 'age': 65, 'gender': 'M', 'country': 'Australia', 'timestamp': '02/04/2024, 18:03:17'}\n", 48 | "{'id': 76, 'name': 'user41', 'age': 50, 'gender': 'M', 'country': 'USA', 'timestamp': '02/04/2024, 18:03:27'}\n", 49 | "{'id': 18, 'name': 'user11', 'age': 57, 'gender': 'F', 'country': 'Australia', 'timestamp': '02/04/2024, 18:03:37'}\n", 50 | "{'id': 51, 'name': 'user70', 'age': 56, 'gender': 'F', 'country': 'India', 'timestamp': '02/04/2024, 18:03:47'}\n", 51 | "{'id': 69, 'name': 'user15', 'age': 22, 'gender': 'F', 'country': 'Canada', 'timestamp': '02/04/2024, 18:03:57'}\n", 52 | "{'id': 13, 'name': 'user83', 'age': 51, 'gender': 'F', 'country': 'Brazil', 'timestamp': '02/04/2024, 18:04:07'}\n", 53 | "{'id': 84, 'name': 'user75', 'age': 53, 'gender': 'M', 'country': 'USA', 'timestamp': '02/04/2024, 18:04:17'}\n", 54 | "{'id': 34, 'name': 'user79', 'age': 30, 'gender': 'F', 'country': 'Canada', 'timestamp': '02/04/2024, 18:04:27'}\n", 55 | "{'id': 45, 'name': 'user25', 'age': 20, 'gender': 'M', 'country': 'Brazil', 'timestamp': '02/04/2024, 18:04:37'}\n", 56 | "{'id': 34, 'name': 'user95', 'age': 50, 'gender': 'M', 'country': 'China', 'timestamp': '02/04/2024, 18:04:47'}\n", 57 | "{'id': 46, 'name': 'user52', 'age': 54, 'gender': 'M', 'country': 'UK', 'timestamp': '02/04/2024, 18:04:57'}\n", 58 | "{'id': 46, 'name': 'user68', 'age': 51, 'gender': 'M', 'country': 'India', 'timestamp': '02/04/2024, 18:05:07'}\n" 59 | ] 60 | } 61 | ], 62 | "source": [ 63 | "import json\n", 64 | "import random\n", 65 | "import time\n", 66 | "from kafka import KafkaProducer\n", 67 | "# using datetime module\n", 68 | "import datetime;\n", 69 | "\n", 70 | "producer = KafkaProducer(bootstrap_servers='kafka:9092')\n", 71 | "countries = ['USA', 'UK', 'India', 'China', 'Brazil', 'Canada', 'Australia']\n", 72 | "genders = ['M', 'F']\n", 73 | "\n", 74 | "while True:\n", 75 | " current_time = time.time()\n", 76 | " message = {\n", 77 | " 'id': random.randint(1, 100),\n", 78 | " 'name': f'user{random.randint(1, 100)}',\n", 79 | " 'age': random.randint(18, 65),\n", 80 | " 'gender': random.choice(genders),\n", 81 | " 'country': random.choice(countries),\n", 82 | " 'timestamp':datetime.datetime.fromtimestamp(current_time).strftime(\"%m/%d/%Y, %H:%M:%S\")\n", 83 | " }\n", 84 | " producer.send('users', value=json.dumps(message).encode('utf-8'))\n", 85 | " print(message)\n", 86 | " time.sleep(10)" 87 | ] 88 | }, 89 | { 90 | "cell_type": "markdown", 91 | "id": 
"87081f46-4053-4df2-b262-aec86df50970", 92 | "metadata": { 93 | "jp-MarkdownHeadingCollapsed": true 94 | }, 95 | "source": [ 96 | "### " 97 | ] 98 | } 99 | ], 100 | "metadata": { 101 | "kernelspec": { 102 | "display_name": "Python 3 (ipykernel)", 103 | "language": "python", 104 | "name": "python3" 105 | }, 106 | "language_info": { 107 | "codemirror_mode": { 108 | "name": "ipython", 109 | "version": 3 110 | }, 111 | "file_extension": ".py", 112 | "mimetype": "text/x-python", 113 | "name": "python", 114 | "nbconvert_exporter": "python", 115 | "pygments_lexer": "ipython3", 116 | "version": "3.10.12" 117 | } 118 | }, 119 | "nbformat": 4, 120 | "nbformat_minor": 5 121 | } 122 | -------------------------------------------------------------------------------- /Chapter04/4.1 config-streaming.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "id": "2cb9c58f-e466-48b4-81ab-07872b03c918", 6 | "metadata": {}, 7 | "source": [ 8 | "For running this recipe, we first need to set up incoming streaming data. We will feed data by opening a terminal window in Jupyter labs UI and run the following command that uses the nc (netcat) utility to create a socket connection on port 9999 and listen for incoming data: \n", 9 | "\n", 10 | "`nc -lk 9999 `\n", 11 | "\n", 12 | "Once the previous command is running, you can start typing any text on the command line. \n", 13 | "\n", 14 | "For example, you can enter the following text: \n", 15 | "\n", 16 | "Fundamentals of Data Engineering: Plan and Build Robust Data Systems by Joe Reis and Matt Housley. This book provides a concise overview of the data engineering landscape and a framework of best practices to assess and solve data engineering problems. It also helps you choose the best technologies and architectures for your data needs. \n", 17 | " \n", 18 | "Designing Data-Intensive Applications: The Big Ideas Behind Reliable, Scalable, and Maintainable Systems** by Martin Kleppmann. This book explains the fundamental principles and trade-offs behind the design of distributed data systems. It covers topics such as replication, partitioning, consistency, fault tolerance, batch and stream processing, and data model" 19 | ] 20 | }, 21 | { 22 | "cell_type": "code", 23 | "execution_count": 1, 24 | "id": "54f45c54-0b95-4fd9-a180-fe3be96ab99d", 25 | "metadata": {}, 26 | "outputs": [], 27 | "source": [ 28 | "from pyspark.sql import SparkSession\n", 29 | "from pyspark.sql.functions import explode, split" 30 | ] 31 | }, 32 | { 33 | "cell_type": "code", 34 | "execution_count": 2, 35 | "id": "e7850eab-3759-491d-a70a-7a02977db101", 36 | "metadata": {}, 37 | "outputs": [ 38 | { 39 | "name": "stderr", 40 | "output_type": "stream", 41 | "text": [ 42 | "Setting default log level to \"WARN\".\n", 43 | "To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).\n", 44 | "24/02/04 17:37:37 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... 
using builtin-java classes where applicable\n" 45 | ] 46 | } 47 | ], 48 | "source": [ 49 | "spark = (SparkSession.builder\n", 50 | " .appName(\"config-streaming\")\n", 51 | " .master(\"spark://spark-master:7077\")\n", 52 | " .config(\"spark.executor.memory\", \"512m\")\n", 53 | " .getOrCreate())\n", 54 | "spark.sparkContext.setLogLevel(\"ERROR\")" 55 | ] 56 | }, 57 | { 58 | "cell_type": "code", 59 | "execution_count": 3, 60 | "id": "228e8f56-4fd9-497a-947f-8ce968889dc7", 61 | "metadata": { 62 | "tags": [] 63 | }, 64 | "outputs": [], 65 | "source": [ 66 | "# Create DataFrame representing the stream of input lines from connection to localhost:9999\n", 67 | "lines = (spark.readStream\n", 68 | " .format(\"socket\")\n", 69 | " .option(\"host\", \"localhost\")\n", 70 | " .option(\"port\", 9999)\n", 71 | " .load())" 72 | ] 73 | }, 74 | { 75 | "cell_type": "code", 76 | "execution_count": 4, 77 | "id": "a0e26449-e04a-4cf5-a42f-5594943bd3fa", 78 | "metadata": { 79 | "tags": [] 80 | }, 81 | "outputs": [], 82 | "source": [ 83 | "# Split the lines into words\n", 84 | "words = lines.select(\n", 85 | " explode(split(lines.value, \" \")).alias(\"word\"))" 86 | ] 87 | }, 88 | { 89 | "cell_type": "code", 90 | "execution_count": 5, 91 | "id": "f8d3d68c-c3b0-4a8a-91b0-32366f2199fc", 92 | "metadata": { 93 | "tags": [] 94 | }, 95 | "outputs": [], 96 | "source": [ 97 | "# Generate running word count\n", 98 | "wordCounts = words.groupBy(\"word\").count()" 99 | ] 100 | }, 101 | { 102 | "cell_type": "code", 103 | "execution_count": 6, 104 | "id": "5c447d82-f3f3-4b78-98b9-cb8e1e79512c", 105 | "metadata": { 106 | "tags": [] 107 | }, 108 | "outputs": [ 109 | { 110 | "name": "stderr", 111 | "output_type": "stream", 112 | "text": [ 113 | " \r" 114 | ] 115 | }, 116 | { 117 | "name": "stdout", 118 | "output_type": "stream", 119 | "text": [ 120 | "-------------------------------------------\n", 121 | "Batch: 0\n", 122 | "-------------------------------------------\n", 123 | "+----+-----+\n", 124 | "|word|count|\n", 125 | "+----+-----+\n", 126 | "+----+-----+\n", 127 | "\n" 128 | ] 129 | }, 130 | { 131 | "name": "stderr", 132 | "output_type": "stream", 133 | "text": [ 134 | " \r" 135 | ] 136 | }, 137 | { 138 | "name": "stdout", 139 | "output_type": "stream", 140 | "text": [ 141 | "-------------------------------------------\n", 142 | "Batch: 1\n", 143 | "-------------------------------------------\n", 144 | "+------------+-----+\n", 145 | "| word|count|\n", 146 | "+------------+-----+\n", 147 | "| Data| 2|\n", 148 | "| overview| 1|\n", 149 | "|Fundamentals| 1|\n", 150 | "| stream| 1|\n", 151 | "| by| 2|\n", 152 | "| solve| 1|\n", 153 | "| you| 1|\n", 154 | "| landscape| 1|\n", 155 | "| systems.| 1|\n", 156 | "|replication,| 1|\n", 157 | "| for| 1|\n", 158 | "| Joe| 1|\n", 159 | "| tolerance,| 1|\n", 160 | "| provides| 1|\n", 161 | "| Reis| 1|\n", 162 | "| topics| 1|\n", 163 | "| practices| 1|\n", 164 | "| model| 1|\n", 165 | "| concise| 1|\n", 166 | "| distributed| 1|\n", 167 | "+------------+-----+\n", 168 | "only showing top 20 rows\n", 169 | "\n" 170 | ] 171 | }, 172 | { 173 | "name": "stderr", 174 | "output_type": "stream", 175 | "text": [ 176 | " \r" 177 | ] 178 | }, 179 | { 180 | "name": "stdout", 181 | "output_type": "stream", 182 | "text": [ 183 | "-------------------------------------------\n", 184 | "Batch: 2\n", 185 | "-------------------------------------------\n", 186 | "+------------+-----+\n", 187 | "| word|count|\n", 188 | "+------------+-----+\n", 189 | "| Dynamical| 1|\n", 190 | "| Data| 2|\n", 191 | "| 
complex| 1|\n", 192 | "| overview| 1|\n", 193 | "| Science| 1|\n", 194 | "|Fundamentals| 1|\n", 195 | "| stream| 1|\n", 196 | "| Nathan| 1|\n", 197 | "| by| 3|\n", 198 | "| solve| 2|\n", 199 | "| you| 2|\n", 200 | "| landscape| 1|\n", 201 | "| L.| 1|\n", 202 | "| systems.| 1|\n", 203 | "| apply| 1|\n", 204 | "|replication,| 1|\n", 205 | "| for| 1|\n", 206 | "| Joe| 1|\n", 207 | "| how| 1|\n", 208 | "| reduction,| 1|\n", 209 | "+------------+-----+\n", 210 | "only showing top 20 rows\n", 211 | "\n" 212 | ] 213 | } 214 | ], 215 | "source": [ 216 | " # Start running the query that prints the running counts to the console\n", 217 | "query = (wordCounts.writeStream\n", 218 | " .outputMode(\"complete\")\n", 219 | " .format(\"console\")\n", 220 | " .start())" 221 | ] 222 | }, 223 | { 224 | "cell_type": "markdown", 225 | "id": "b1330621-ab2a-4e08-b5c2-41312d472fd5", 226 | "metadata": {}, 227 | "source": [ 228 | "Open the terminal and add more data to the netcat listener. See the following example text: \n", 229 | "\n", 230 | "__Data-Driven Science and Engineering: Machine Learning, Dynamical Systems, and Control by Steven L. Brunton and J. Nathan Kutz13. This book teaches you how to apply machine learning and data analytics techniques to solve complex engineering and scientific problems. It covers topics such as dimensionality reduction, sparse sensing, system identification, and control design.__\n", 231 | "\n", 232 | "A new batch for the stream query is triggered and the output is updated as shown: " 233 | ] 234 | }, 235 | { 236 | "cell_type": "code", 237 | "execution_count": 7, 238 | "id": "bbdd7547-1607-4aa8-9e44-9e82f7356ef4", 239 | "metadata": {}, 240 | "outputs": [], 241 | "source": [ 242 | "query.stop()" 243 | ] 244 | }, 245 | { 246 | "cell_type": "code", 247 | "execution_count": 8, 248 | "id": "3fb39cc3-e38d-4bcb-96d3-7493e83f8c42", 249 | "metadata": {}, 250 | "outputs": [], 251 | "source": [ 252 | "spark.stop()" 253 | ] 254 | }, 255 | { 256 | "cell_type": "code", 257 | "execution_count": null, 258 | "id": "6410a564-9653-4ece-aeb4-e56df9bd881d", 259 | "metadata": {}, 260 | "outputs": [], 261 | "source": [] 262 | } 263 | ], 264 | "metadata": { 265 | "kernelspec": { 266 | "display_name": "Python 3 (ipykernel)", 267 | "language": "python", 268 | "name": "python3" 269 | }, 270 | "language_info": { 271 | "codemirror_mode": { 272 | "name": "ipython", 273 | "version": 3 274 | }, 275 | "file_extension": ".py", 276 | "mimetype": "text/x-python", 277 | "name": "python", 278 | "nbconvert_exporter": "python", 279 | "pygments_lexer": "ipython3", 280 | "version": "3.10.12" 281 | } 282 | }, 283 | "nbformat": 4, 284 | "nbformat_minor": 5 285 | } 286 | -------------------------------------------------------------------------------- /Chapter05/5.0 events-gen-kafka.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "id": "8393e835-14fb-4aa3-833f-d60aa5464018", 7 | "metadata": {}, 8 | "outputs": [ 9 | { 10 | "name": "stdout", 11 | "output_type": "stream", 12 | "text": [ 13 | "{'user_id': 47, 'event_type': 'purchase', 'event_time': '08/26/2023, 12:15:57', 'processing_time': '08/26/2023, 12:16:13'}\n", 14 | "{'user_id': 44, 'event_type': 'click', 'event_time': '08/26/2023, 12:16:14', 'processing_time': '08/26/2023, 12:16:24'}\n", 15 | "{'user_id': 63, 'event_type': 'click', 'event_time': '08/26/2023, 12:16:21', 'processing_time': '08/26/2023, 12:16:34'}\n", 16 | "{'user_id': 68, 'event_type': 
'purchase', 'event_time': '08/26/2023, 12:16:32', 'processing_time': '08/26/2023, 12:16:44'}\n", 17 | "{'user_id': 89, 'event_type': 'click', 'event_time': '08/26/2023, 12:16:32', 'processing_time': '08/26/2023, 12:16:54'}\n", 18 | "{'user_id': 15, 'event_type': 'view', 'event_time': '08/26/2023, 12:16:55', 'processing_time': '08/26/2023, 12:17:04'}\n", 19 | "{'user_id': 94, 'event_type': 'view', 'event_time': '08/26/2023, 12:16:54', 'processing_time': '08/26/2023, 12:17:14'}\n", 20 | "{'user_id': 45, 'event_type': 'view', 'event_time': '08/26/2023, 12:17:23', 'processing_time': '08/26/2023, 12:17:24'}\n", 21 | "{'user_id': 17, 'event_type': 'purchase', 'event_time': '08/26/2023, 12:17:23', 'processing_time': '08/26/2023, 12:17:34'}\n", 22 | "{'user_id': 48, 'event_type': 'share', 'event_time': '08/26/2023, 12:17:36', 'processing_time': '08/26/2023, 12:17:44'}\n", 23 | "{'user_id': 86, 'event_type': 'purchase', 'event_time': '08/26/2023, 12:17:48', 'processing_time': '08/26/2023, 12:17:54'}\n", 24 | "{'user_id': 92, 'event_type': 'like', 'event_time': '08/26/2023, 12:18:01', 'processing_time': '08/26/2023, 12:18:04'}\n", 25 | "{'user_id': 50, 'event_type': 'purchase', 'event_time': '08/26/2023, 12:18:12', 'processing_time': '08/26/2023, 12:18:14'}\n", 26 | "{'user_id': 94, 'event_type': 'share', 'event_time': '08/26/2023, 12:18:22', 'processing_time': '08/26/2023, 12:18:24'}\n", 27 | "{'user_id': 84, 'event_type': 'purchase', 'event_time': '08/26/2023, 12:18:30', 'processing_time': '08/26/2023, 12:18:34'}\n", 28 | "{'user_id': 54, 'event_type': 'click', 'event_time': '08/26/2023, 12:18:34', 'processing_time': '08/26/2023, 12:18:44'}\n", 29 | "{'user_id': 24, 'event_type': 'share', 'event_time': '08/26/2023, 12:18:41', 'processing_time': '08/26/2023, 12:18:54'}\n", 30 | "{'user_id': 92, 'event_type': 'share', 'event_time': '08/26/2023, 12:19:00', 'processing_time': '08/26/2023, 12:19:04'}\n", 31 | "{'user_id': 66, 'event_type': 'click', 'event_time': '08/26/2023, 12:18:55', 'processing_time': '08/26/2023, 12:19:14'}\n", 32 | "{'user_id': 72, 'event_type': 'view', 'event_time': '08/26/2023, 12:19:17', 'processing_time': '08/26/2023, 12:19:24'}\n", 33 | "{'user_id': 67, 'event_type': 'click', 'event_time': '08/26/2023, 12:19:23', 'processing_time': '08/26/2023, 12:19:34'}\n", 34 | "{'user_id': 80, 'event_type': 'purchase', 'event_time': '08/26/2023, 12:19:38', 'processing_time': '08/26/2023, 12:19:44'}\n", 35 | "{'user_id': 61, 'event_type': 'share', 'event_time': '08/26/2023, 12:19:39', 'processing_time': '08/26/2023, 12:19:54'}\n", 36 | "{'user_id': 54, 'event_type': 'purchase', 'event_time': '08/26/2023, 12:20:00', 'processing_time': '08/26/2023, 12:20:04'}\n", 37 | "{'user_id': 57, 'event_type': 'view', 'event_time': '08/26/2023, 12:20:13', 'processing_time': '08/26/2023, 12:20:14'}\n", 38 | "{'user_id': 21, 'event_type': 'like', 'event_time': '08/26/2023, 12:20:20', 'processing_time': '08/26/2023, 12:20:24'}\n", 39 | "{'user_id': 55, 'event_type': 'like', 'event_time': '08/26/2023, 12:20:21', 'processing_time': '08/26/2023, 12:20:34'}\n", 40 | "{'user_id': 55, 'event_type': 'purchase', 'event_time': '08/26/2023, 12:20:37', 'processing_time': '08/26/2023, 12:20:44'}\n", 41 | "{'user_id': 83, 'event_type': 'purchase', 'event_time': '08/26/2023, 12:20:51', 'processing_time': '08/26/2023, 12:20:54'}\n", 42 | "{'user_id': 46, 'event_type': 'share', 'event_time': '08/26/2023, 12:20:44', 'processing_time': '08/26/2023, 12:21:04'}\n", 43 | "{'user_id': 38, 'event_type': 'click', 'event_time': 
'08/26/2023, 12:21:09', 'processing_time': '08/26/2023, 12:21:14'}\n", 44 | "{'user_id': 84, 'event_type': 'like', 'event_time': '08/26/2023, 12:21:17', 'processing_time': '08/26/2023, 12:21:24'}\n", 45 | "{'user_id': 63, 'event_type': 'purchase', 'event_time': '08/26/2023, 12:21:10', 'processing_time': '08/26/2023, 12:21:34'}\n" 46 | ] 47 | }, 48 | { 49 | "ename": "KeyboardInterrupt", 50 | "evalue": "", 51 | "output_type": "error", 52 | "traceback": [ 53 | "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", 54 | "\u001b[0;31mKeyboardInterrupt\u001b[0m Traceback (most recent call last)", 55 | "Cell \u001b[0;32mIn[1], line 40\u001b[0m\n\u001b[1;32m 37\u001b[0m \u001b[38;5;66;03m# Send the event to the Kafka topic\u001b[39;00m\n\u001b[1;32m 39\u001b[0m producer\u001b[38;5;241m.\u001b[39msend(topic, value\u001b[38;5;241m=\u001b[39mjson\u001b[38;5;241m.\u001b[39mdumps(event)\u001b[38;5;241m.\u001b[39mencode(\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mutf-8\u001b[39m\u001b[38;5;124m'\u001b[39m))\n\u001b[0;32m---> 40\u001b[0m \u001b[43mtime\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43msleep\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m10\u001b[39;49m\u001b[43m)\u001b[49m\n", 56 | "\u001b[0;31mKeyboardInterrupt\u001b[0m: " 57 | ] 58 | } 59 | ], 60 | "source": [ 61 | "# Import the required modules\n", 62 | "import random\n", 63 | "import json\n", 64 | "from kafka import KafkaProducer\n", 65 | "import time\n", 66 | "\n", 67 | "# using datetime module\n", 68 | "import datetime;\n", 69 | "\n", 70 | "# Define the bootstrap servers and the topic name\n", 71 | "bootstrap_servers = \"kafka:9092\"\n", 72 | "topic = \"events\"\n", 73 | "\n", 74 | "# Create a Kafka producer with JSON value serializer\n", 75 | "producer = KafkaProducer(bootstrap_servers=bootstrap_servers)\n", 76 | "\n", 77 | "# Define a function to generate random event data\n", 78 | "def generate_event():\n", 79 | " # Generate a random user id from 1 to 100\n", 80 | " current_time = time.time()\n", 81 | " user_id = random.randint(1, 100)\n", 82 | " # Generate a random event type from a list of options\n", 83 | " event_type = random.choice([\"click\", \"view\", \"purchase\", \"like\", \"share\"])\n", 84 | " # Generate a random event time from 0 to 9999\n", 85 | " event_time = datetime.datetime.fromtimestamp(current_time- abs(random.normalvariate(0, 10))).strftime(\"%m/%d/%Y, %H:%M:%S\")\n", 86 | " # Generate a random event time from 0 to 9999\n", 87 | " processing_time =datetime.datetime.fromtimestamp(current_time).strftime(\"%m/%d/%Y, %H:%M:%S\")\n", 88 | " # Return a dictionary with the event data\n", 89 | " return {\"user_id\": user_id, \"event_type\": event_type, \"event_time\": event_time, \"processing_time\": processing_time}\n", 90 | "\n", 91 | "# Loop to generate and send events\n", 92 | "while True:\n", 93 | " # Generate a random event\n", 94 | " event = generate_event()\n", 95 | " # Print the event to the console\n", 96 | " print(event)\n", 97 | " # Send the event to the Kafka topic\n", 98 | " \n", 99 | " producer.send(topic, value=json.dumps(event).encode('utf-8'))\n", 100 | " time.sleep(10)\n" 101 | ] 102 | }, 103 | { 104 | "cell_type": "markdown", 105 | "id": "87081f46-4053-4df2-b262-aec86df50970", 106 | "metadata": { 107 | "jp-MarkdownHeadingCollapsed": true 108 | }, 109 | "source": [ 110 | "### " 111 | ] 112 | } 113 | ], 114 | "metadata": { 115 | "kernelspec": { 116 | "display_name": "Python 3 (ipykernel)", 117 | "language": "python", 118 | "name": "python3" 119 | }, 
120 | "language_info": { 121 | "codemirror_mode": { 122 | "name": "ipython", 123 | "version": 3 124 | }, 125 | "file_extension": ".py", 126 | "mimetype": "text/x-python", 127 | "name": "python", 128 | "nbconvert_exporter": "python", 129 | "pygments_lexer": "ipython3", 130 | "version": "3.10.12" 131 | } 132 | }, 133 | "nbformat": 4, 134 | "nbformat_minor": 5 135 | } 136 | -------------------------------------------------------------------------------- /Chapter06/6.1 monitor-spark-ui.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "id": "6bfbe105-2fd1-43f1-95e9-525d85226a13", 7 | "metadata": {}, 8 | "outputs": [ 9 | { 10 | "name": "stderr", 11 | "output_type": "stream", 12 | "text": [ 13 | "Setting default log level to \"WARN\".\n", 14 | "To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).\n", 15 | "23/09/03 11:00:25 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable\n" 16 | ] 17 | } 18 | ], 19 | "source": [ 20 | "from pyspark.sql import SparkSession\n", 21 | "\n", 22 | "# Create a new SparkSession\n", 23 | "spark = (SparkSession\n", 24 | " .builder\n", 25 | " .appName(\"monitor-spark-ui\")\n", 26 | " .master(\"spark://spark-master:7077\")\n", 27 | " .config(\"spark.executor.memory\", \"512m\")\n", 28 | " .getOrCreate())\n", 29 | "\n", 30 | "# Set log level to ERROR\n", 31 | "spark.sparkContext.setLogLevel(\"ERROR\")" 32 | ] 33 | }, 34 | { 35 | "cell_type": "code", 36 | "execution_count": 2, 37 | "id": "d348a886-776d-42db-936c-0d7339969642", 38 | "metadata": {}, 39 | "outputs": [], 40 | "source": [ 41 | "from pyspark.sql.types import StructType, StructField, StringType, IntegerType, DateType\n", 42 | "\n", 43 | "# Define a Schema\n", 44 | "schema = StructType([\n", 45 | " StructField(\"show_id\", StringType(), True),\n", 46 | " StructField(\"type\", StringType(), True),\n", 47 | " StructField(\"title\", StringType(), True),\n", 48 | " StructField(\"director\", StringType(), True),\n", 49 | " StructField(\"cast\", StringType(), True),\n", 50 | " StructField(\"country\", StringType(), True),\n", 51 | " StructField(\"date_added\", DateType(), True),\n", 52 | " StructField(\"release_year\", IntegerType(), True),\n", 53 | " StructField(\"rating\", StringType(), True),\n", 54 | " StructField(\"duration\", StringType(), True),\n", 55 | " StructField(\"listed_in\", StringType(), True),\n", 56 | " StructField(\"description\", StringType(), True)])\n" 57 | ] 58 | }, 59 | { 60 | "cell_type": "code", 61 | "execution_count": 3, 62 | "id": "8670a1c7-4876-4870-8a8c-aedb67ee703e", 63 | "metadata": {}, 64 | "outputs": [], 65 | "source": [ 66 | "# Read CSV file into a DataFrame\n", 67 | "df = (spark.read.format(\"csv\")\n", 68 | " .option(\"header\", \"true\")\n", 69 | " .schema(schema)\n", 70 | " .load(\"../data/netflix_titles.csv\"))" 71 | ] 72 | }, 73 | { 74 | "cell_type": "code", 75 | "execution_count": 4, 76 | "id": "0d196aa1-1af0-4100-9fee-17ad1edff93c", 77 | "metadata": {}, 78 | "outputs": [], 79 | "source": [ 80 | "# Filter rows where release_year ge is greater than 2020\n", 81 | "df = df.filter(df.release_year > 2020)" 82 | ] 83 | }, 84 | { 85 | "cell_type": "code", 86 | "execution_count": 5, 87 | "id": "cf6e81ec-736f-4955-b953-a9210cf55112", 88 | "metadata": {}, 89 | "outputs": [], 90 | "source": [ 91 | "# Group by country and count\n", 92 | "df = 
df.groupBy(\"country\").count()" 93 | ] 94 | }, 95 | { 96 | "cell_type": "code", 97 | "execution_count": 6, 98 | "id": "831d1ff5-cd30-47ba-91ee-d4e664c67917", 99 | "metadata": {}, 100 | "outputs": [ 101 | { 102 | "name": "stderr", 103 | "output_type": "stream", 104 | "text": [ 105 | "[Stage 0:> (0 + 1) / 1]\r" 106 | ] 107 | }, 108 | { 109 | "name": "stdout", 110 | "output_type": "stream", 111 | "text": [ 112 | "+--------------------+-----+\n", 113 | "| country|count|\n", 114 | "+--------------------+-----+\n", 115 | "|India, United Kin...| 1|\n", 116 | "|France, United St...| 3|\n", 117 | "| Sweden| 3|\n", 118 | "| Turkey| 5|\n", 119 | "|China, United Sta...| 1|\n", 120 | "| Germany| 5|\n", 121 | "| Jordan| 1|\n", 122 | "| France| 7|\n", 123 | "| Uruguay, Germany| 1|\n", 124 | "|United States, India| 1|\n", 125 | "|Belgium, United K...| 1|\n", 126 | "| null| 208|\n", 127 | "| Argentina| 2|\n", 128 | "|Mexico, United St...| 1|\n", 129 | "| Belgium| 2|\n", 130 | "| India| 31|\n", 131 | "| United States| 137|\n", 132 | "| China| 4|\n", 133 | "|United States, Cz...| 1|\n", 134 | "|United States, Japan| 2|\n", 135 | "+--------------------+-----+\n", 136 | "only showing top 20 rows\n", 137 | "\n" 138 | ] 139 | }, 140 | { 141 | "name": "stderr", 142 | "output_type": "stream", 143 | "text": [ 144 | " \r" 145 | ] 146 | } 147 | ], 148 | "source": [ 149 | "# Show the result\n", 150 | "df.show()" 151 | ] 152 | }, 153 | { 154 | "cell_type": "code", 155 | "execution_count": null, 156 | "id": "030c5f42-391f-4b5f-93ff-efa79ae4bf36", 157 | "metadata": {}, 158 | "outputs": [], 159 | "source": [] 160 | } 161 | ], 162 | "metadata": { 163 | "kernelspec": { 164 | "display_name": "Python 3 (ipykernel)", 165 | "language": "python", 166 | "name": "python3" 167 | }, 168 | "language_info": { 169 | "codemirror_mode": { 170 | "name": "ipython", 171 | "version": 3 172 | }, 173 | "file_extension": ".py", 174 | "mimetype": "text/x-python", 175 | "name": "python", 176 | "nbconvert_exporter": "python", 177 | "pygments_lexer": "ipython3", 178 | "version": "3.10.12" 179 | } 180 | }, 181 | "nbformat": 4, 182 | "nbformat_minor": 5 183 | } 184 | -------------------------------------------------------------------------------- /Chapter06/6.2 broadcast-variables.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 12, 6 | "id": "3d05d7a1-fe97-491a-a177-c1886a5f8baf", 7 | "metadata": {}, 8 | "outputs": [], 9 | "source": [ 10 | "from pyspark.sql import SparkSession \n", 11 | "from pyspark.sql.functions import rand, when, pandas_udf, PandasUDFType\n", 12 | "from pyspark.sql.types import BooleanType\n", 13 | "import pandas as pd" 14 | ] 15 | }, 16 | { 17 | "cell_type": "code", 18 | "execution_count": 2, 19 | "id": "6bfbe105-2fd1-43f1-95e9-525d85226a13", 20 | "metadata": {}, 21 | "outputs": [ 22 | { 23 | "name": "stderr", 24 | "output_type": "stream", 25 | "text": [ 26 | "Setting default log level to \"WARN\".\n", 27 | "To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).\n", 28 | "24/02/21 12:33:51 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... 
using builtin-java classes where applicable\n" 29 | ] 30 | } 31 | ], 32 | "source": [ 33 | "# Create a new SparkSession\n", 34 | "spark = (SparkSession\n", 35 | " .builder\n", 36 | " .appName(\"broadcast-variables\")\n", 37 | " .master(\"spark://spark-master:7077\")\n", 38 | " .config(\"spark.executor.memory\", \"512m\")\n", 39 | " .getOrCreate())\n", 40 | "\n", 41 | "# Set log level to ERROR\n", 42 | "spark.sparkContext.setLogLevel(\"ERROR\")" 43 | ] 44 | }, 45 | { 46 | "cell_type": "code", 47 | "execution_count": 7, 48 | "id": "2ae2e1f7-45c3-486a-970e-0727eb303197", 49 | "metadata": {}, 50 | "outputs": [ 51 | { 52 | "name": "stderr", 53 | "output_type": "stream", 54 | "text": [ 55 | "[Stage 0:> (0 + 1) / 1]\r" 56 | ] 57 | }, 58 | { 59 | "name": "stdout", 60 | "output_type": "stream", 61 | "text": [ 62 | "+---+------+------+------------+\n", 63 | "| id|salary|gender|country_code|\n", 64 | "+---+------+------+------------+\n", 65 | "| 0| 8000| M| US|\n", 66 | "| 1| 3500| F| null|\n", 67 | "| 2| 9700| F| null|\n", 68 | "| 3| 4800| F| null|\n", 69 | "| 4| 9100| F| null|\n", 70 | "+---+------+------+------------+\n", 71 | "only showing top 5 rows\n", 72 | "\n" 73 | ] 74 | }, 75 | { 76 | "name": "stderr", 77 | "output_type": "stream", 78 | "text": [ 79 | " \r" 80 | ] 81 | } 82 | ], 83 | "source": [ 84 | "# Create some sample data frames\n", 85 | "# A large data frame with 1 million rows\n", 86 | "large_df = (spark.range(0, 1000000)\n", 87 | " .withColumn(\"salary\", 100*(rand() * 100).cast(\"int\"))\n", 88 | " .withColumn(\"gender\", when((rand() * 2).cast(\"int\") == 0, \"M\").otherwise(\"F\"))\n", 89 | " .withColumn(\"country_code\", \n", 90 | " when((rand() * 4).cast(\"int\") == 0, \"US\")\n", 91 | " .when((rand() * 4).cast(\"int\") == 1, \"CN\")\n", 92 | " .when((rand() * 4).cast(\"int\") == 2, \"IN\")\n", 93 | " .when((rand() * 4).cast(\"int\") == 3, \"BR\")))\n", 94 | "large_df.show(5)" 95 | ] 96 | }, 97 | { 98 | "cell_type": "code", 99 | "execution_count": 8, 100 | "id": "5e1b2f38-375a-4c39-94ab-45597162caf2", 101 | "metadata": {}, 102 | "outputs": [], 103 | "source": [ 104 | "# Define lookup table\n", 105 | "lookup = {\"US\": \"United States\", \"CN\": \"China\", \"IN\": \"India\", \"BR\": \"Brazil\", \"RU\": \"Russia\"}\n", 106 | "\n", 107 | "# Create broadcast variable\n", 108 | "broadcast_lookup = spark.sparkContext.broadcast(lookup)" 109 | ] 110 | }, 111 | { 112 | "cell_type": "code", 113 | "execution_count": 9, 114 | "id": "11da2278-39a4-43a3-a650-413edce1ba0c", 115 | "metadata": {}, 116 | "outputs": [ 117 | { 118 | "name": "stderr", 119 | "output_type": "stream", 120 | "text": [ 121 | "/usr/local/lib/python3.10/dist-packages/pyspark/sql/pandas/functions.py:399: UserWarning: In Python 3.6+ and Spark 3.0+, it is preferred to specify type hints for pandas UDF instead of specifying pandas UDF type which will be deprecated in the future releases. 
See SPARK-28264 for more details.\n", 122 | " warnings.warn(\n" 123 | ] 124 | } 125 | ], 126 | "source": [ 127 | "@pandas_udf('string', PandasUDFType.SCALAR)\n", 128 | "def country_convert(s):\n", 129 | " return s.map(broadcast_lookup.value)" 130 | ] 131 | }, 132 | { 133 | "cell_type": "code", 134 | "execution_count": 10, 135 | "id": "70902427-3e46-48af-9f3b-f1b3a3619cb7", 136 | "metadata": {}, 137 | "outputs": [ 138 | { 139 | "name": "stderr", 140 | "output_type": "stream", 141 | "text": [ 142 | "[Stage 1:> (0 + 1) / 1]\r" 143 | ] 144 | }, 145 | { 146 | "name": "stdout", 147 | "output_type": "stream", 148 | "text": [ 149 | "+---+------+------+------------+-------------+\n", 150 | "| id|salary|gender|country_code| country_name|\n", 151 | "+---+------+------+------------+-------------+\n", 152 | "| 0| 8000| M| US|United States|\n", 153 | "| 1| 3500| F| null| null|\n", 154 | "| 2| 9700| F| null| null|\n", 155 | "| 3| 4800| F| null| null|\n", 156 | "| 4| 9100| F| null| null|\n", 157 | "+---+------+------+------------+-------------+\n", 158 | "only showing top 5 rows\n", 159 | "\n" 160 | ] 161 | }, 162 | { 163 | "name": "stderr", 164 | "output_type": "stream", 165 | "text": [ 166 | " \r" 167 | ] 168 | } 169 | ], 170 | "source": [ 171 | "large_df.withColumn(\"country_name\", country_convert(large_df.country_code)).show(5)" 172 | ] 173 | }, 174 | { 175 | "cell_type": "code", 176 | "execution_count": 13, 177 | "id": "d2492f49-744d-44ea-a253-1b73c1d04710", 178 | "metadata": {}, 179 | "outputs": [], 180 | "source": [ 181 | "@pandas_udf(BooleanType(), PandasUDFType.SCALAR)\n", 182 | "def filter_unknown_country(s):\n", 183 | " return s.isin(broadcast_lookup.value)" 184 | ] 185 | }, 186 | { 187 | "cell_type": "code", 188 | "execution_count": 14, 189 | "id": "480aba43-0e1d-4f84-82e0-c7cdaa9debb5", 190 | "metadata": {}, 191 | "outputs": [ 192 | { 193 | "name": "stderr", 194 | "output_type": "stream", 195 | "text": [ 196 | "[Stage 2:> (0 + 1) / 1]\r" 197 | ] 198 | }, 199 | { 200 | "name": "stdout", 201 | "output_type": "stream", 202 | "text": [ 203 | "+---+------+------+------------+\n", 204 | "| id|salary|gender|country_code|\n", 205 | "+---+------+------+------------+\n", 206 | "| 0| 8000| M| US|\n", 207 | "| 6| 3400| F| US|\n", 208 | "| 7| 8400| M| CN|\n", 209 | "| 8| 1100| F| US|\n", 210 | "| 9| 2900| M| CN|\n", 211 | "+---+------+------+------------+\n", 212 | "only showing top 5 rows\n", 213 | "\n" 214 | ] 215 | }, 216 | { 217 | "name": "stderr", 218 | "output_type": "stream", 219 | "text": [ 220 | " \r" 221 | ] 222 | } 223 | ], 224 | "source": [ 225 | "large_df.filter(filter_unknown_country(large_df.country_code)).show(5)" 226 | ] 227 | }, 228 | { 229 | "cell_type": "code", 230 | "execution_count": 15, 231 | "id": "526788c3-4a1d-4314-b2cc-b8f3c13683c4", 232 | "metadata": {}, 233 | "outputs": [], 234 | "source": [ 235 | "spark.stop()" 236 | ] 237 | }, 238 | { 239 | "cell_type": "code", 240 | "execution_count": null, 241 | "id": "875d3099-a7e6-48d2-ac28-09985403102d", 242 | "metadata": {}, 243 | "outputs": [], 244 | "source": [] 245 | } 246 | ], 247 | "metadata": { 248 | "kernelspec": { 249 | "display_name": "Python 3 (ipykernel)", 250 | "language": "python", 251 | "name": "python3" 252 | }, 253 | "language_info": { 254 | "codemirror_mode": { 255 | "name": "ipython", 256 | "version": 3 257 | }, 258 | "file_extension": ".py", 259 | "mimetype": "text/x-python", 260 | "name": "python", 261 | "nbconvert_exporter": "python", 262 | "pygments_lexer": "ipython3", 263 | "version": "3.10.12" 264 | } 265 | }, 
266 | "nbformat": 4, 267 | "nbformat_minor": 5 268 | } 269 | -------------------------------------------------------------------------------- /Chapter06/6.5 cache-and-persist.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 6, 6 | "id": "6020138a-0dc8-4b7e-bad8-9cf2ef7133aa", 7 | "metadata": {}, 8 | "outputs": [], 9 | "source": [ 10 | "from pyspark.sql import SparkSession \n", 11 | "from pyspark import StorageLevel \n", 12 | "from pyspark.sql.functions import rand, current_date, date_sub" 13 | ] 14 | }, 15 | { 16 | "cell_type": "code", 17 | "execution_count": 2, 18 | "id": "2b24dc6e-9c01-4a8b-9b6e-0d71b953bd82", 19 | "metadata": {}, 20 | "outputs": [ 21 | { 22 | "name": "stderr", 23 | "output_type": "stream", 24 | "text": [ 25 | "Setting default log level to \"WARN\".\n", 26 | "To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).\n", 27 | "24/02/21 13:44:30 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable\n" 28 | ] 29 | } 30 | ], 31 | "source": [ 32 | "spark = (SparkSession.builder\n", 33 | " .appName(\"cache-and-persist\")\n", 34 | " .master(\"spark://spark-master:7077\")\n", 35 | " .config(\"spark.executor.memory\", \"512m\")\n", 36 | " .getOrCreate())\n", 37 | "\n", 38 | "spark.sparkContext.setLogLevel(\"ERROR\")" 39 | ] 40 | }, 41 | { 42 | "cell_type": "code", 43 | "execution_count": 9, 44 | "id": "8e6169b6-ba60-4544-bf4f-b58cbb2e3d22", 45 | "metadata": {}, 46 | "outputs": [], 47 | "source": [ 48 | "# Define a function to measure the execution time of a query\n", 49 | "import time\n", 50 | "\n", 51 | "def measure_time(query):\n", 52 | " start = time.time()\n", 53 | " query.collect() # Force the query execution by calling an action\n", 54 | " end = time.time()\n", 55 | " print(f\"Execution time: {end - start} seconds\")" 56 | ] 57 | }, 58 | { 59 | "cell_type": "code", 60 | "execution_count": 3, 61 | "id": "1d243df0-aa88-4ea7-a363-76e00eafc3cc", 62 | "metadata": {}, 63 | "outputs": [ 64 | { 65 | "name": "stderr", 66 | "output_type": "stream", 67 | "text": [ 68 | "[Stage 0:> (0 + 1) / 1]\r" 69 | ] 70 | }, 71 | { 72 | "name": "stdout", 73 | "output_type": "stream", 74 | "text": [ 75 | "+---+----------+---------+\n", 76 | "| id| date|ProductId|\n", 77 | "+---+----------+---------+\n", 78 | "| 0|2024-02-10| 67|\n", 79 | "| 1|2023-07-12| 39|\n", 80 | "| 2|2023-08-10| 8|\n", 81 | "| 3|2023-05-22| 29|\n", 82 | "| 4|2023-06-22| 63|\n", 83 | "+---+----------+---------+\n", 84 | "only showing top 5 rows\n", 85 | "\n" 86 | ] 87 | }, 88 | { 89 | "name": "stderr", 90 | "output_type": "stream", 91 | "text": [ 92 | " \r" 93 | ] 94 | } 95 | ], 96 | "source": [ 97 | "# Create some sample data frames\n", 98 | "# A large data frame with 10 million rows and two columns: id and value\n", 99 | "large_df = (spark.range(0, 10000000)\n", 100 | " .withColumn(\"date\", date_sub(current_date(), (rand() * 365).cast(\"int\")))\n", 101 | " .withColumn(\"ProductId\", (rand() * 100).cast(\"int\")))\n", 102 | "large_df.show(5)" 103 | ] 104 | }, 105 | { 106 | "cell_type": "code", 107 | "execution_count": 4, 108 | "id": "1bb8e6b1-94fa-4d45-9b95-ef6887b8e4c1", 109 | "metadata": {}, 110 | "outputs": [ 111 | { 112 | "name": "stdout", 113 | "output_type": "stream", 114 | "text": [ 115 | "Disk Memory Deserialized 1x Replicated\n" 116 | ] 117 | } 118 | ], 119 | "source": [ 120 | "# Cache the DataFrame 
using cache() method\n", 121 | "large_df.cache()\n", 122 | "# Check the storage level of the cached DataFrame\n", 123 | "print(large_df.storageLevel)" 124 | ] 125 | }, 126 | { 127 | "cell_type": "code", 128 | "execution_count": 7, 129 | "id": "f7d80802-2095-47fa-83be-672a56f8c71c", 130 | "metadata": {}, 131 | "outputs": [ 132 | { 133 | "name": "stdout", 134 | "output_type": "stream", 135 | "text": [ 136 | "Disk Memory Deserialized 1x Replicated\n" 137 | ] 138 | } 139 | ], 140 | "source": [ 141 | "# Persist the DataFrame using persist() method with a different storage level\n", 142 | "large_df.persist(StorageLevel.MEMORY_AND_DISK_DESER)\n", 143 | "# Check the storage level of the persisted DataFrame\n", 144 | "print(large_df.storageLevel)" 145 | ] 146 | }, 147 | { 148 | "cell_type": "code", 149 | "execution_count": 10, 150 | "id": "760553dd-2781-4b03-9e68-7c7442e6dc4b", 151 | "metadata": {}, 152 | "outputs": [ 153 | { 154 | "name": "stderr", 155 | "output_type": "stream", 156 | "text": [ 157 | " \r" 158 | ] 159 | }, 160 | { 161 | "name": "stdout", 162 | "output_type": "stream", 163 | "text": [ 164 | "Execution time: 8.600075006484985 seconds\n", 165 | "+---------+---------+\n", 166 | "|ProductId|count(Id)|\n", 167 | "+---------+---------+\n", 168 | "| 31| 99961|\n", 169 | "| 85| 99746|\n", 170 | "| 65| 100023|\n", 171 | "| 53| 100615|\n", 172 | "| 78| 99985|\n", 173 | "+---------+---------+\n", 174 | "only showing top 5 rows\n", 175 | "\n" 176 | ] 177 | } 178 | ], 179 | "source": [ 180 | "results_df = large_df.groupBy(\"ProductId\").agg({\"Id\": \"count\"}) \n", 181 | "measure_time(results_df)\n", 182 | "# Show the result\n", 183 | "results_df.show(5)" 184 | ] 185 | }, 186 | { 187 | "cell_type": "code", 188 | "execution_count": 11, 189 | "id": "58fe1655-0f67-481c-8fc0-061ea5360e7c", 190 | "metadata": {}, 191 | "outputs": [ 192 | { 193 | "name": "stdout", 194 | "output_type": "stream", 195 | "text": [ 196 | "Execution time: 0.984121561050415 seconds\n" 197 | ] 198 | }, 199 | { 200 | "name": "stderr", 201 | "output_type": "stream", 202 | "text": [ 203 | "[Stage 10:=============================> (1 + 1) / 2]\r" 204 | ] 205 | }, 206 | { 207 | "name": "stdout", 208 | "output_type": "stream", 209 | "text": [ 210 | "+---------+---------+\n", 211 | "|ProductId|count(Id)|\n", 212 | "+---------+---------+\n", 213 | "| 31| 99961|\n", 214 | "| 85| 99746|\n", 215 | "| 65| 100023|\n", 216 | "| 53| 100615|\n", 217 | "| 78| 99985|\n", 218 | "+---------+---------+\n", 219 | "only showing top 5 rows\n", 220 | "\n" 221 | ] 222 | }, 223 | { 224 | "name": "stderr", 225 | "output_type": "stream", 226 | "text": [ 227 | " \r" 228 | ] 229 | } 230 | ], 231 | "source": [ 232 | "results_df = large_df.groupBy(\"ProductId\").agg({\"Id\": \"count\"}) \n", 233 | "measure_time(results_df)\n", 234 | "# Show the result\n", 235 | "results_df.show(5)" 236 | ] 237 | }, 238 | { 239 | "cell_type": "code", 240 | "execution_count": 12, 241 | "id": "e6b5a908-2260-467c-96a3-6245ebba3198", 242 | "metadata": {}, 243 | "outputs": [ 244 | { 245 | "name": "stdout", 246 | "output_type": "stream", 247 | "text": [ 248 | "Serialized 1x Replicated\n" 249 | ] 250 | } 251 | ], 252 | "source": [ 253 | "# Unpersist the DataFrame using unpersist() method\n", 254 | "large_df.unpersist()\n", 255 | "# Check the storage level of the unpersisted DataFrame\n", 256 | "print(large_df.storageLevel)" 257 | ] 258 | }, 259 | { 260 | "cell_type": "code", 261 | "execution_count": 13, 262 | "id": "e360de96-05cf-4575-92a4-567200d91f06", 263 | "metadata": {}, 264 | 
"outputs": [], 265 | "source": [ 266 | "spark.stop()" 267 | ] 268 | }, 269 | { 270 | "cell_type": "code", 271 | "execution_count": null, 272 | "id": "3e55fe9f-b1e7-4885-92f2-d48e758928c6", 273 | "metadata": {}, 274 | "outputs": [], 275 | "source": [] 276 | } 277 | ], 278 | "metadata": { 279 | "kernelspec": { 280 | "display_name": "Python 3 (ipykernel)", 281 | "language": "python", 282 | "name": "python3" 283 | }, 284 | "language_info": { 285 | "codemirror_mode": { 286 | "name": "ipython", 287 | "version": 3 288 | }, 289 | "file_extension": ".py", 290 | "mimetype": "text/x-python", 291 | "name": "python", 292 | "nbconvert_exporter": "python", 293 | "pygments_lexer": "ipython3", 294 | "version": "3.10.12" 295 | } 296 | }, 297 | "nbformat": 4, 298 | "nbformat_minor": 5 299 | } 300 | -------------------------------------------------------------------------------- /Chapter08/8.1 building-databricks-workflow.yml: -------------------------------------------------------------------------------- 1 | resources: 2 | jobs: 3 | On_Shelf_Availability_Workflow: 4 | name: On-Shelf-Availability Workflow 5 | schedule: 6 | quartz_cron_expression: 36 0 0 * * ? 7 | timezone_id: UTC 8 | pause_status: PAUSED 9 | tasks: 10 | - task_key: Setup 11 | notebook_task: 12 | notebook_path: /Repos/pulkit.chadha.packt@gmail.com/Data-Engineering-with-Apache-Spark-and-Delta-Lake-Cookbook/Chapter08/Setup 13 | base_parameters: 14 | catalog: main 15 | schema: on_shelf_availability 16 | source: WORKSPACE 17 | job_cluster_key: Job_cluster 18 | - task_key: Download_Inventory_Data 19 | depends_on: 20 | - task_key: Setup 21 | notebook_task: 22 | notebook_path: /Repos/pulkit.chadha.packt@gmail.com/Data-Engineering-with-Apache-Spark-and-Delta-Lake-Cookbook/Chapter08/Download 23 | Inventory Data 24 | base_parameters: 25 | catalog: main 26 | schema: on_shelf_availability 27 | source: WORKSPACE 28 | job_cluster_key: Job_cluster 29 | job_clusters: 30 | - job_cluster_key: Job_cluster 31 | new_cluster: 32 | cluster_name: "" 33 | spark_version: 14.2.x-scala2.12 34 | aws_attributes: 35 | first_on_demand: 1 36 | availability: SPOT_WITH_FALLBACK 37 | zone_id: us-west-2a 38 | spot_bid_price_percent: 100 39 | ebs_volume_count: 0 40 | node_type_id: i4i.large 41 | enable_elastic_disk: false 42 | data_security_mode: SINGLE_USER 43 | runtime_engine: STANDARD 44 | num_workers: 2 -------------------------------------------------------------------------------- /Chapter08/8.4 conditional-branching.yml: -------------------------------------------------------------------------------- 1 | resources: 2 | jobs: 3 | On_Shelf_Availability_Workflow: 4 | name: On-Shelf-Availability Workflow 5 | schedule: 6 | quartz_cron_expression: 36 0 0 * * ? 
7 | timezone_id: UTC 8 | pause_status: PAUSED 9 | tasks: 10 | - task_key: Setup 11 | notebook_task: 12 | notebook_path: /Repos/pulkit.chadha.packt@gmail.com/Data-Engineering-with-Apache-Spark-and-Delta-Lake-Cookbook/Chapter08/Setup 13 | base_parameters: 14 | catalog: main 15 | schema: on_shelf_availability 16 | source: WORKSPACE 17 | job_cluster_key: Job_cluster 18 | - task_key: Download_Inventory_Data 19 | depends_on: 20 | - task_key: Setup 21 | notebook_task: 22 | notebook_path: /Repos/pulkit.chadha.packt@gmail.com/Data-Engineering-with-Apache-Spark-and-Delta-Lake-Cookbook/Chapter08/Download 23 | Inventory Data 24 | base_parameters: 25 | catalog: main 26 | schema: on_shelf_availability 27 | source: WORKSPACE 28 | job_cluster_key: Job_cluster 29 | - task_key: Check_File_Size 30 | depends_on: 31 | - task_key: Download_Inventory_Data 32 | condition_task: 33 | op: GREATER_THAN 34 | left: "{{tasks.Download_Inventory_Data.values.file_size}}" 35 | right: "0" 36 | - task_key: OSA_Data_Preparation_DLT 37 | depends_on: 38 | - task_key: Check_File_Size 39 | outcome: "true" 40 | pipeline_task: 41 | pipeline_id: 970bb4d4-35f0-4169-91b9-bf522f95076c 42 | full_refresh: true 43 | - task_key: Cleanup 44 | depends_on: 45 | - task_key: Download_Inventory_Data 46 | - task_key: Setup 47 | - task_key: Check_File_Size 48 | outcome: "false" 49 | run_if: AT_LEAST_ONE_FAILED 50 | notebook_task: 51 | notebook_path: /Repos/pulkit.chadha.packt@gmail.com/Data-Engineering-with-Apache-Spark-and-Delta-Lake-Cookbook/Chapter08/Clean 52 | Up 53 | base_parameters: 54 | catalog: main 55 | schema: on_shelf_availability 56 | source: WORKSPACE 57 | job_cluster_key: Job_cluster 58 | job_clusters: 59 | - job_cluster_key: Job_cluster 60 | new_cluster: 61 | cluster_name: "" 62 | spark_version: 14.2.x-scala2.12 63 | aws_attributes: 64 | first_on_demand: 1 65 | availability: SPOT_WITH_FALLBACK 66 | zone_id: us-west-2a 67 | spot_bid_price_percent: 100 68 | ebs_volume_count: 0 69 | node_type_id: i4i.large 70 | enable_elastic_disk: false 71 | data_security_mode: SINGLE_USER 72 | runtime_engine: STANDARD 73 | num_workers: 2 74 | -------------------------------------------------------------------------------- /Chapter08/Clean Up.sql: -------------------------------------------------------------------------------- 1 | -- Databricks notebook source 2 | CREATE WIDGET TEXT catalog DEFAULT "main"; 3 | CREATE WIDGET TEXT schema DEFAULT "on_shelf_availability"; 4 | 5 | -- COMMAND ---------- 6 | 7 | USE CATALOG ${catalog} 8 | USE SCHEMA ${schema}; 9 | 10 | -- COMMAND ---------- 11 | 12 | DROP VOLUME IF NOT EXISTS data; 13 | DROP SCHEMA IF NOT EXISTS ${schema} CASCADE; 14 | -------------------------------------------------------------------------------- /Chapter08/Data Preparation DLT.sql: -------------------------------------------------------------------------------- 1 | -- Databricks notebook source 2 | CREATE 3 | OR REFRESH STREAMING LIVE TABLE inventory_raw AS 4 | SELECT 5 | * 6 | FROM 7 | cloud_files( 8 | "/Volumes/dbdemos/on-shelf-availability/data/osa_raw_data*.csv", 9 | "csv", 10 | map( 11 | "cloudFiles.inferColumnTypes", 12 | "true", 13 | "dateFormat", 14 | "yyyyMMdd", 15 | "cloudFiles.schemaHints", 16 | "date DATE" 17 | ) 18 | ) 19 | 20 | -- COMMAND ---------- 21 | 22 | CREATE 23 | OR REFRESH STREAMING LIVE TABLE vendor AS 24 | SELECT 25 | * 26 | FROM 27 | cloud_files( 28 | "/Volumes/dbdemos/on-shelf-availability/data/vendor_leadtime_info*.csv", 29 | "csv", 30 | map("cloudFiles.inferColumnTypes", "true") 31 | ) 32 | 33 | -- COMMAND 
---------- 34 | 35 | 36 | 37 | -- COMMAND ---------- 38 | 39 | CREATE 40 | OR REFRESH LIVE TABLE inventory AS 41 | SELECT 42 | cross_view.date, 43 | cross_view.store_id, 44 | cross_view.sku, 45 | int_data.product_category, 46 | int_data.total_sales_units, 47 | int_data.on_hand_inventory_units, 48 | int_data.replenishment_units, 49 | int_data.inventory_pipeline, 50 | int_data.units_in_transit, 51 | int_data.units_in_dc, 52 | int_data.units_on_order, 53 | int_data.units_under_promotion, 54 | int_data.shelf_capacity, 55 | CASE WHEN int_data.units_under_promotion > 0 THEN 1 ELSE 0 END as promotion_flag, 56 | CASE WHEN int_data.replenishment_units > 0 THEN 1 ELSE 0 END as replenishment_flag 57 | FROM 58 | ( 59 | SELECT 60 | to_date( 61 | date_add('2019-01-01', cast(abs(t.id) as int)), 62 | 'yy-MM-dd' 63 | ) as date, 64 | store_id, 65 | sku 66 | FROM 67 | range(datediff('2019-01-01', '2021-05-03'), 1) AS t 68 | CROSS JOIN ( 69 | SELECT 70 | store_id, 71 | sku 72 | FROM 73 | live.inventory_raw 74 | GROUP BY 75 | ALL 76 | ) 77 | ) cross_view 78 | LEFT OUTER JOIN live.inventory_raw int_data ON cross_view.date = int_data.date 79 | AND cross_view.store_id = int_data.store_id 80 | AND cross_view.sku = int_data.sku 81 | 82 | -- COMMAND ---------- 83 | 84 | 85 | -------------------------------------------------------------------------------- /Chapter08/Download Inventory Data.sql: -------------------------------------------------------------------------------- 1 | -- Databricks notebook source 2 | CREATE WIDGET TEXT catalog DEFAULT "main"; 3 | CREATE WIDGET TEXT schema DEFAULT "on_shelf_availability"; 4 | 5 | 6 | -- COMMAND ---------- 7 | 8 | -- MAGIC %python 9 | -- MAGIC import os 10 | -- MAGIC os.environ['catalog']=dbutils.widgets.get("catalog") 11 | -- MAGIC os.environ['schema']=dbutils.widgets.get("schema") 12 | -- MAGIC os.environ['volumeName']=dbutils.jobs.taskValues.get(taskKey = "Setup", key = "volumeName", default = "data", debugValue = "data") 13 | 14 | -- COMMAND ---------- 15 | 16 | -- MAGIC %sh 17 | -- MAGIC cd /Volumes/$catalog/$schema/${volumeName} 18 | -- MAGIC wget https://raw.githubusercontent.com/tredenceofficial/OSA-Data/main/osa_raw_data.csv 19 | -- MAGIC wget https://raw.githubusercontent.com/tredenceofficial/OSA-Data/main/vendor_leadtime_info.csv 20 | 21 | -- COMMAND ---------- 22 | 23 | -- MAGIC %python 24 | -- MAGIC import os 25 | -- MAGIC file_path = f'/Volumes/{dbutils.widgets.get("catalog")}/{dbutils.widgets.get("schema")}/{os.environ["volumeName"]}/osa_raw_data.csv' 26 | -- MAGIC if os.path.exists(file_path): 27 | -- MAGIC file_size = os.path.getsize(file_path) 28 | -- MAGIC dbutils.jobs.taskValues.set(key='file_size', value=file_size) 29 | -- MAGIC print(file_size) 30 | -- MAGIC else: 31 | -- MAGIC print(f"{file_path} does not exist") 32 | -- MAGIC dbutils.jobs.taskValues.set(key='file_size', value=0) 33 | 34 | -- COMMAND ---------- 35 | 36 | 37 | -------------------------------------------------------------------------------- /Chapter08/Setup.sql: -------------------------------------------------------------------------------- 1 | -- Databricks notebook source 2 | CREATE WIDGET TEXT catalog DEFAULT "main"; 3 | CREATE WIDGET TEXT schema DEFAULT "on_shelf_availability"; 4 | 5 | -- COMMAND ---------- 6 | 7 | USE CATALOG ${catalog} 8 | 9 | -- COMMAND ---------- 10 | 11 | CREATE SCHEMA IF NOT EXISTS ${schema}; 12 | 13 | -- COMMAND ---------- 14 | 15 | USE SCHEMA ${schema}; 16 | 17 | -- COMMAND ---------- 18 | 19 | CREATE VOLUME IF NOT EXISTS data; 20 | 21 | -- COMMAND 
---------- 22 | 23 | -- MAGIC %python 24 | -- MAGIC dbutils.jobs.taskValues.set(key = 'volumeName', value = 'data') 25 | -------------------------------------------------------------------------------- /Chapter09/9.1 create-medallion-arch-DLT.sql: -------------------------------------------------------------------------------- 1 | -- Databricks notebook source 2 | CREATE 3 | OR REFRESH STREAMING TABLE device_data AS 4 | SELECT 5 | * 6 | FROM 7 | cloud_files( 8 | "/databricks-datasets/iot-stream/data-device", 9 | "json", 10 | map("cloudFiles.inferColumnTypes", "true") 11 | ) 12 | 13 | -- COMMAND ---------- 14 | 15 | CREATE 16 | OR REFRESH STREAMING TABLE user_data AS 17 | SELECT 18 | * 19 | FROM 20 | cloud_files( 21 | "/databricks-datasets/iot-stream/data-user", 22 | "csv", 23 | map("cloudFiles.inferColumnTypes", "true") 24 | ) 25 | 26 | -- COMMAND ---------- 27 | 28 | CREATE 29 | OR REFRESH STREAMING TABLE user_data_prepared ( 30 | CONSTRAINT valid_user EXPECT (user_id IS NOT NULL) ON VIOLATION DROP ROW 31 | ) -- COMMENT "" 32 | AS 33 | SELECT 34 | users.userid as user_id, 35 | CASE 36 | WHEN users.gender = 'F' THEN 'Female' 37 | WHEN users.gender = 'M' THEN 'Male' 38 | END AS gender, 39 | users.age, 40 | users.height, 41 | users.weight, 42 | CAST(users.smoker as BOOLEAN) AS isSmoker, 43 | CAST(users.familyhistory as BOOLEAN) AS hasFamilyHistory, 44 | users.cholestlevs AS cholestrolLevels, 45 | users.bp AS bloodPressure, 46 | users.risk 47 | FROM 48 | STREAM(live.user_data) users; 49 | 50 | -- COMMAND ---------- 51 | 52 | CREATE 53 | OR REFRESH STREAMING TABLE device_data_prepared ( 54 | CONSTRAINT valid_timestamp EXPECT (timestamp IS NOT NULL) 55 | ) -- COMMENT "" 56 | AS 57 | SELECT 58 | device.id, 59 | device.device_id, 60 | device.user_id, 61 | device.calories_burnt, 62 | device.miles_walked, 63 | device.num_steps, 64 | CAST(device.timestamp as TIMESTAMP) AS timestamp 65 | FROM 66 | STREAM(live.device_data) device 67 | 68 | -- COMMAND ---------- 69 | 70 | CREATE 71 | OR REFRESH LIVE TABLE user_metrics AS 72 | SELECT 73 | users.user_id, 74 | users.gender, 75 | users.age, 76 | users.height, 77 | users.weight, 78 | users.isSmoker, 79 | users.hasFamilyHistory, 80 | users.cholestrolLevels, 81 | users.bloodPressure, 82 | users.risk, 83 | SUM(devices.calories_burnt) AS totalCaloriesBurnt, 84 | SUM(devices.miles_walked) AS totalMilesWalked, 85 | SUM(devices.num_steps) AS totalNumberOfSteps 86 | FROM 87 | live.user_data_prepared users 88 | LEFT OUTER JOIN LIVE.device_data_prepared devices on devices.user_id = users.user_id 89 | GROUP BY 90 | ALL; 91 | -------------------------------------------------------------------------------- /Chapter09/9.3 data-quality-and-validation.sql: -------------------------------------------------------------------------------- 1 | -- Databricks notebook source 2 | CREATE 3 | OR REFRESH LIVE TABLE customers ( 4 | CONSTRAINT valid_customer_key EXPECT (c_custkey IS NOT NULL) ON VIOLATION DROP ROW 5 | ) AS 6 | SELECT 7 | * 8 | FROM 9 | samples.tpch.customer 10 | 11 | -- COMMAND ---------- 12 | 13 | CREATE TEMPORARY LIVE TABLE duplicate_customers_test ( 14 | CONSTRAINT unique_customer_key EXPECT (cnt = 1) ON VIOLATION DROP ROW 15 | ) AS 16 | SELECT 17 | c_custkey, count(*) as cnt 18 | FROM 19 | live.customers 20 | GROUP BY ALL; 21 | 22 | -- COMMAND ---------- 23 | 24 | CREATE 25 | OR REFRESH LIVE TABLE orders ( 26 | CONSTRAINT valid_order_key EXPECT (o_orderkey IS NOT NULL) ON VIOLATION DROP ROW, 27 | CONSTRAINT valid_customer_key EXPECT (o_custkey IS NOT NULL) ON 
VIOLATION DROP ROW, 28 | CONSTRAINT valid_reference_customer EXPECT (cust.c_custkey IS NOT NULL) ON VIOLATION DROP ROW 29 | ) AS 30 | SELECT 31 | ord.*, 32 | cust.c_custkey 33 | FROM 34 | samples.tpch.orders ord 35 | LEFT OUTER JOIN live.customers cust on cust.c_custkey = ord.o_custkey 36 | 37 | -- COMMAND ---------- 38 | 39 | CREATE TEMPORARY LIVE TABLE duplicate_orders_test ( 40 | CONSTRAINT unique_order_key EXPECT (cnt = 1) ON VIOLATION DROP ROW 41 | ) AS 42 | SELECT 43 | o_orderkey, count(*) as cnt 44 | FROM 45 | live.orders 46 | GROUP BY ALL; 47 | -------------------------------------------------------------------------------- /Chapter09/9.4 quarantine-bad-data-dlt.sql: -------------------------------------------------------------------------------- 1 | -- Databricks notebook source 2 | CREATE 3 | OR REFRESH STREAMING LIVE TABLE raw_farmers_market AS 4 | SELECT 5 | * 6 | FROM 7 | cloud_files( 8 | "/databricks-datasets/data.gov/farmers_markets_geographic_data/data-001/", 9 | "csv", 10 | map( 11 | "cloudFiles.inferColumnTypes", 12 | "true" 13 | ) 14 | ) 15 | 16 | -- COMMAND ---------- 17 | 18 | CREATE 19 | OR REFRESH STREAMING LIVE TABLE farmers_market_clean ( 20 | CONSTRAINT valid_website EXPECT (Website IS NOT NULL) ON VIOLATION DROP ROW, 21 | CONSTRAINT valid_location EXPECT (Location IS NOT NULL) ON VIOLATION DROP ROW 22 | ) AS 23 | SELECT 24 | * 25 | FROM 26 | STREAM(live.raw_farmers_market) 27 | 28 | -- COMMAND ---------- 29 | 30 | CREATE 31 | OR REFRESH STREAMING LIVE TABLE farmers_market_quarantine ( 32 | CONSTRAINT valid_website EXPECT (NOT(Website IS NOT NULL)) ON VIOLATION DROP ROW, 33 | CONSTRAINT valid_location EXPECT (NOT(Location IS NOT NULL)) ON VIOLATION DROP ROW 34 | ) AS 35 | SELECT 36 | * 37 | FROM 38 | STREAM(live.raw_farmers_market) 39 | -------------------------------------------------------------------------------- /Chapter09/9.5 monitor-delta-live-table-pipelines.sql: -------------------------------------------------------------------------------- 1 | -- Databricks notebook source 2 | CREATE OR REPLACE VIEW main.pkc_farmers_market.event_log_raw AS SELECT * FROM event_log("5f5e0278-f9c8-49dc-bfd2-ade8c07b4453"); 3 | 4 | -- COMMAND ---------- 5 | 6 | -- DBTITLE 1,Query data quality from the event log 7 | SELECT 8 | update_id, 9 | row_expectations.dataset as dataset, 10 | row_expectations.name as expectation, 11 | SUM(row_expectations.passed_records) as passing_records, 12 | SUM(row_expectations.failed_records) as failing_records 13 | FROM 14 | ( 15 | SELECT 16 | origin.update_id,explode( 17 | from_json( 18 | details :flow_progress :data_quality :expectations, 19 | "array>" 20 | ) 21 | ) row_expectations 22 | FROM 23 | main.pkc_farmers_market.event_log_raw 24 | WHERE 25 | event_type = 'flow_progress' 26 | ) 27 | GROUP BY 28 | update_id, 29 | row_expectations.dataset, 30 | row_expectations.name 31 | 32 | -- COMMAND ---------- 33 | 34 | -- DBTITLE 1,Monitor compute resource utilization 35 | SELECT 36 | origin.update_id, 37 | timestamp, 38 | Double(details :cluster_resources.avg_num_queued_tasks) as queue_size, 39 | Double( 40 | details :cluster_resources.avg_task_slot_utilization 41 | ) as utilization, 42 | Double(details :cluster_resources.num_executors) as current_executors, 43 | Double( 44 | details :cluster_resources.latest_requested_num_executors 45 | ) as latest_requested_num_executors, 46 | Double(details :cluster_resources.optimal_num_executors) as optimal_num_executors, 47 | details :cluster_resources.state as autoscaling_state 48 | FROM 49 | 
main.pkc_farmers_market.event_log_raw 50 | WHERE 51 | event_type = 'cluster_resources' 52 | ORDER BY 53 | origin.update_id, 54 | timestamp 55 | 56 | -- COMMAND ---------- 57 | 58 | -- DBTITLE 1,Query user actions in the event log 59 | SELECT 60 | timestamp, 61 | details :user_action :action, 62 | details :user_action :user_name 63 | FROM 64 | main.pkc_farmers_market.event_log_raw 65 | WHERE 66 | event_type = 'user_action' 67 | -------------------------------------------------------------------------------- /Chapter09/9.6 dlt-dabs-cicd/9.6 create-medallion-arch-DLT.sql: -------------------------------------------------------------------------------- 1 | -- Databricks notebook source 2 | CREATE 3 | OR REFRESH STREAMING TABLE device_data AS 4 | SELECT 5 | * 6 | FROM 7 | cloud_files( 8 | "/databricks-datasets/iot-stream/data-device", 9 | "json", 10 | map("cloudFiles.inferColumnTypes", "true") 11 | ) 12 | 13 | -- COMMAND ---------- 14 | 15 | CREATE 16 | OR REFRESH STREAMING TABLE user_data AS 17 | SELECT 18 | * 19 | FROM 20 | cloud_files( 21 | "/databricks-datasets/iot-stream/data-user", 22 | "csv", 23 | map("cloudFiles.inferColumnTypes", "true") 24 | ) 25 | 26 | -- COMMAND ---------- 27 | 28 | CREATE 29 | OR REFRESH STREAMING TABLE user_data_prepared ( 30 | CONSTRAINT valid_user EXPECT (user_id IS NOT NULL) ON VIOLATION DROP ROW 31 | ) -- COMMENT "" 32 | AS 33 | SELECT 34 | users.userid as user_id, 35 | CASE 36 | WHEN users.gender = 'F' THEN 'Female' 37 | WHEN users.gender = 'M' THEN 'Male' 38 | END AS gender, 39 | users.age, 40 | users.height, 41 | users.weight, 42 | CAST(users.smoker as BOOLEAN) AS isSmoker, 43 | CAST(users.familyhistory as BOOLEAN) AS hasFamilyHistory, 44 | users.cholestlevs AS cholestrolLevels, 45 | users.bp AS bloodPressure, 46 | users.risk 47 | FROM 48 | STREAM(live.user_data) users; 49 | 50 | -- COMMAND ---------- 51 | 52 | CREATE 53 | OR REFRESH STREAMING TABLE device_data_prepared ( 54 | CONSTRAINT valid_timestamp EXPECT (timestamp IS NOT NULL) 55 | ) -- COMMENT "" 56 | AS 57 | SELECT 58 | device.id, 59 | device.device_id, 60 | device.user_id, 61 | device.calories_burnt, 62 | device.miles_walked, 63 | device.num_steps, 64 | CAST(device.timestamp as TIMESTAMP) AS timestamp 65 | FROM 66 | STREAM(live.device_data) device 67 | 68 | -- COMMAND ---------- 69 | 70 | CREATE 71 | OR REFRESH LIVE TABLE user_metrics AS 72 | SELECT 73 | users.user_id, 74 | users.gender, 75 | users.age, 76 | users.height, 77 | users.weight, 78 | users.isSmoker, 79 | users.hasFamilyHistory, 80 | users.cholestrolLevels, 81 | users.bloodPressure, 82 | users.risk, 83 | SUM(devices.calories_burnt) AS totalCaloriesBurnt, 84 | SUM(devices.miles_walked) AS totalMilesWalked, 85 | SUM(devices.num_steps) AS totalNumberOfSteps 86 | FROM 87 | live.user_data_prepared users 88 | LEFT OUTER JOIN LIVE.device_data_prepared devices on devices.user_id = users.user_id 89 | GROUP BY 90 | ALL; 91 | -------------------------------------------------------------------------------- /Chapter09/9.6 dlt-dabs-cicd/databricks.yml: -------------------------------------------------------------------------------- 1 | # This is a Databricks asset bundle definition for dlt_dabs_cicd. 2 | # See https://docs.databricks.com/dev-tools/bundles/index.html for documentation. 3 | bundle: 4 | name: dlt_dabs_cicd 5 | 6 | include: 7 | - dlt_dabs_cicd_pipeline.yml 8 | 9 | targets: 10 | # The 'dev' target, used for development purposes. 11 | # Whenever a developer deploys using 'dev', they get their own copy. 
12 | dev: 13 | # We use 'mode: development' to make sure everything deployed to this target gets a prefix 14 | # like '[dev my_user_name]'. Setting this mode also disables any schedules and 15 | # automatic triggers for jobs and enables the 'development' mode for Delta Live Tables pipelines. 16 | mode: development 17 | default: true 18 | workspace: 19 | host: https://adb-7934447987890817.7.azuredatabricks.net/ 20 | 21 | # The 'prod' target, used for production deployment. 22 | prod: 23 | # For production deployments, we only have a single copy, so we override the 24 | # workspace.root_path default of /Users/${workspace.current_user.userName}/.bundle/${bundle.target}/${bundle.name} 25 | # to a path that is not specific to the current user. 26 | mode: production 27 | workspace: 28 | host: https://adb-7934447987890817.7.azuredatabricks.net/ 29 | root_path: /Shared/.bundle/prod/${bundle.name} 30 | run_as: 31 | # Use service principal could be used here using service_principal_name 32 | # (see Databricks documentation). 33 | user_name: pulkit.chadha.packt@gmail.com 34 | -------------------------------------------------------------------------------- /Chapter09/9.6 dlt-dabs-cicd/dlt_dabs_cicd_pipeline.yml: -------------------------------------------------------------------------------- 1 | # The main pipeline for dlt_dabs_cicd 2 | resources: 3 | pipelines: 4 | dlt_dabs_cicd_pipeline: 5 | name: dlt_dabs_cicd_pipeline 6 | target: dlt_dabs_cicd_${bundle.environment} 7 | continuous: false 8 | channel: CURRENT 9 | photon: false 10 | libraries: 11 | - notebook: 12 | path: 9.6 create-medallion-arch-DLT.sql 13 | clusters: 14 | - label: default 15 | autoscale: 16 | min_workers: 1 17 | max_workers: 1 18 | mode: ENHANCED 19 | -------------------------------------------------------------------------------- /Chapter09/9.7 apply-changes_into-dlt.sql: -------------------------------------------------------------------------------- 1 | -- Databricks notebook source 2 | CREATE TEMPORARY STREAMING LIVE VIEW movie_and_show_titles AS 3 | SELECT 4 | *, 5 | now() as ts 6 | FROM 7 | cloud_files( 8 | "/Volumes/main/netflix/data", 9 | "csv", 10 | map("header", "true") 11 | ); 12 | 13 | -- COMMAND ---------- 14 | 15 | CREATE 16 | OR REFRESH STREAMING TABLE movie_and_show_titles_scd_1; 17 | 18 | APPLY CHANGES INTO 19 | live.movie_and_show_titles_scd_1 20 | FROM 21 | STREAM(LIVE.movie_and_show_titles) 22 | KEYS 23 | (type, title, director) 24 | SEQUENCE BY 25 | ts 26 | COLUMNS * EXCEPT (ts) 27 | STORED AS 28 | SCD TYPE 1; 29 | 30 | -- COMMAND ---------- 31 | 32 | CREATE 33 | OR REFRESH STREAMING TABLE movie_and_show_titles_scd_2; 34 | 35 | APPLY CHANGES INTO 36 | live.movie_and_show_titles_scd_2 37 | FROM 38 | STREAM(LIVE.movie_and_show_titles) 39 | KEYS 40 | (type, title, director) 41 | SEQUENCE BY 42 | ts 43 | COLUMNS * EXCEPT (ts) 44 | STORED AS 45 | SCD TYPE 2; 46 | -------------------------------------------------------------------------------- /Chapter10/10.2 uc_object_hierarchy.sql: -------------------------------------------------------------------------------- 1 | -- Databricks notebook source 2 | CREATE CATALOG de_book MANAGED LOCATION 's3://data-lake/de-book-ext-data'; 3 | 4 | -- COMMAND ---------- 5 | 6 | USE CATALOG de_book; 7 | CREATE SCHEMA credit_card; 8 | 9 | -- COMMAND ---------- 10 | 11 | USE CATALOG de_book; 12 | USE SCHEMA credit_card; 13 | CREATE TABLE IF NOT EXISTS transactions_table ( 14 | Transaction_ID STRING, 15 | Transaction_Date STRING, 16 | Credit_Card_ID STRING, 17 | Transaction_Value FLOAT, 
18 | Transaction_Segment STRING 19 | ); 20 | INSERT INTO 21 | transactions_table 22 | VALUES 23 | ( 'CTID28830551', '24-Apr-16', '1629-9566-3285-2123', 23649, 'SEG25' ), 24 | ( 'CTID45504917', '11-Feb-16', '3697-6001-4909-5350', 26726, 'SEG16' ), 25 | ( 'CTID47312290', '1-Nov-16', '5864-4475-3659-1440', 22012, 'SEG14' ), 26 | ( 'CTID25637718', '28-Jan-16', '5991-4421-8476-3804', 37637, 'SEG17' ); 27 | 28 | -- COMMAND ---------- 29 | 30 | USE CATALOG de_book; 31 | USE SCHEMA credit_card; 32 | CREATE OR REPLACE VIEW transactions_view (Credit_Card_ID, total_Transaction_Value) 33 | COMMENT 'A view that shows the total transaction value by credit card' 34 | AS SELECT Credit_Card_ID, SUM(Transaction_Value) AS total_Transaction_Value FROM de_book.credit_card.transactions_table GROUP BY Credit_Card_ID; 35 | 36 | -- COMMAND ---------- 37 | 38 | SELECT * FROM transactions_table WHERE Transaction_Value > 25000; 39 | 40 | -- COMMAND ---------- 41 | 42 | SELECT * FROM transactions_view; 43 | 44 | -- COMMAND ---------- 45 | 46 | CREATE EXTERNAL VOLUME de_book.credit_card.files 47 | LOCATION 's3://data-lake/de-book-ext-data/files'; 48 | 49 | -- COMMAND ---------- 50 | 51 | 52 | -------------------------------------------------------------------------------- /Chapter10/10.4 tags_comments_metadata.sql: -------------------------------------------------------------------------------- 1 | -- Databricks notebook source 2 | COMMENT ON TABLE de_book.credit_card.transactions_table IS 'This table contains transaction information from the credit_card database'; 3 | 4 | -- COMMAND ---------- 5 | 6 | ALTER TABLE 7 | de_book.credit_card.transactions_table 8 | SET 9 | TAGS ( 10 | 'business_unit' = 'finance', 11 | 'data_sensitivity' = 'medium', 12 | 'data_quality' = 'high' 13 | ); 14 | 15 | -- COMMAND ---------- 16 | 17 | ALTER TABLE 18 | de_book.credit_card.transactions_table 19 | ALTER COLUMN 20 | Transaction_ID COMMENT 'A unique identifier for the transaction.'; 21 | 22 | -- COMMAND ---------- 23 | 24 | ALTER TABLE 25 | de_book.credit_card.transactions_table 26 | ALTER COLUMN 27 | Transaction_ID 28 | SET 29 | TAGS ( 30 | 'data_protection' = 'non-PII', 31 | 'isIdentifier' = 'true' 32 | ); 33 | 34 | -- COMMAND ---------- 35 | 36 | DESCRIBE DETAIL de_book.credit_card.transactions_table; 37 | 38 | -- COMMAND ---------- 39 | 40 | SELECT 41 | catalog_name, 42 | schema_name, 43 | table_name, 44 | tag_name, 45 | tag_value 46 | FROM 47 | de_book.information_schema.table_tags 48 | WHERE 49 | catalog_name = 'de_book' 50 | and schema_name = 'credit_card' 51 | and table_name = 'transactions_table'; 52 | 53 | -- COMMAND ---------- 54 | 55 | ALTER TABLE 56 | de_book.credit_card.transactions_table UNSET TAGS ('business_unit', 'data_sensitivity'); 57 | 58 | -- COMMAND ---------- 59 | 60 | ALTER TABLE 61 | de_book.credit_card.transactions_table 62 | ALTER COLUMN 63 | Transaction_ID UNSET TAGS ('data_type'); 64 | 65 | -- COMMAND ---------- 66 | 67 | 68 | -------------------------------------------------------------------------------- /Chapter10/10.5 filter_sensitive_data.sql: -------------------------------------------------------------------------------- 1 | -- Databricks notebook source 2 | -- Create a sample table with customer information 3 | USE CATALOG de_book; 4 | USE SCHEMA credit_card; 5 | 6 | CREATE TABLE customer ( 7 | id INT, 8 | name STRING, 9 | email STRING, 10 | phone STRING, 11 | ssn STRING, 12 | country STRING 13 | ); 14 | 15 | -- Insert some sample data into the table 16 | INSERT INTO customer VALUES 17 | (1, 'Alice', 
'alice@example.com', '+1-111-1111', '111--111-1111','USA'), 18 | (2, 'Bob', 'bob@example.com', '+1-222-2222', '222-222-2222','USA'), 19 | (3, 'Charlie', 'charlie@example.com', '+1-333-3333', '333-333-3333','USA'), 20 | (4, 'David', 'david@example.com', '+44-444-4444','444-444-4444', 'UK'), 21 | (5, 'Eve', 'eve@example.com', '+44-555-5555', '+555-555-5555','UK'); 22 | 23 | 24 | -- COMMAND ---------- 25 | 26 | CREATE FUNCTION country_filter(country STRING) 27 | RETURN IF(IS_ACCOUNT_GROUP_MEMBER('admin'), true, country='USA'); 28 | 29 | -- COMMAND ---------- 30 | 31 | ALTER TABLE customer SET ROW FILTER country_filter ON (country); 32 | 33 | -- COMMAND ---------- 34 | 35 | CREATE 36 | OR REPLACE FUNCTION country_filter(country STRING) RETURN IF( 37 | IS_ACCOUNT_GROUP_MEMBER('admin'), 38 | true, 39 | IF( 40 | IS_ACCOUNT_GROUP_MEMBER('usteam') 41 | AND country = 'USA', 42 | true, 43 | IF( 44 | IS_ACCOUNT_GROUP_MEMBER('ukteam') 45 | AND country = 'UK', 46 | true, 47 | false 48 | ) 49 | ) 50 | ); 51 | 52 | -- COMMAND ---------- 53 | 54 | ALTER TABLE customer DROP ROW FILTER; 55 | 56 | -- COMMAND ---------- 57 | 58 | DROP FUNCTION country_filter; 59 | 60 | -- COMMAND ---------- 61 | 62 | -- Create a UDF that masks the email column by replacing the domain part with '***' 63 | CREATE FUNCTION mask_email (email STRING) RETURN CASE 64 | WHEN is_account_group_member('hr_dept') THEN email 65 | ELSE CONCAT(SPLIT(email, '@')[0], '@***') 66 | END; 67 | 68 | -- COMMAND ---------- 69 | 70 | ALTER TABLE customer ALTER COLUMN email SET MASK mask_email; 71 | 72 | -- COMMAND ---------- 73 | 74 | ALTER TABLE customer ALTER COLUMN email DROP MASK; 75 | 76 | -- COMMAND ---------- 77 | 78 | CREATE 79 | OR REPLACE FUNCTION mask_email (email STRING) RETURN CASE 80 | WHEN is_account_group_member('hr_dept') 81 | OR is_account_group_member('finance_dept') THEN email 82 | ELSE CONCAT(SPLIT(email, '@') [0], '@***') 83 | END; 84 | 85 | -- COMMAND ---------- 86 | 87 | ALTER TABLE customer ALTER COLUMN email DROP MASK; 88 | DROP FUNCTION mask_email; 89 | 90 | -- COMMAND ---------- 91 | 92 | 93 | -------------------------------------------------------------------------------- /Chapter10/10.6 lineage_view.sql: -------------------------------------------------------------------------------- 1 | -- Databricks notebook source 2 | USE CATALOG de_book; 3 | USE SCHEMA credit_card; 4 | 5 | -- COMMAND ---------- 6 | 7 | CREATE 8 | OR REPLACE TABLE usa_customers AS 9 | SELECT 10 | * 11 | FROM 12 | customer 13 | WHERE 14 | country = 'USA'; 15 | 16 | -- COMMAND ---------- 17 | 18 | CREATE 19 | OR REPLACE TABLE uk_customers AS 20 | SELECT 21 | * 22 | FROM 23 | customer 24 | WHERE 25 | country = 'UK'; 26 | 27 | -- COMMAND ---------- 28 | 29 | 30 | -------------------------------------------------------------------------------- /Chapter10/10.7 system_tables.sql: -------------------------------------------------------------------------------- 1 | -- Databricks notebook source 2 | -- MAGIC %sh 3 | -- MAGIC curl -v -X GET -H "Authorization: Bearer " "https://.cloud.databricks.com/api/2.0/unity-catalog/metastores//systemschemas" 4 | 5 | -- COMMAND ---------- 6 | 7 | -- MAGIC %sh 8 | -- MAGIC curl -v -X POST -H "Authorization: Bearer " "https://.cloud.databricks.com/api/2.0/unity-catalog/metastores//systemschemas//enable" 9 | 10 | -- COMMAND ---------- 11 | 12 | -- MAGIC %sh 13 | -- MAGIC curl -v -X POST -H "Authorization: Bearer " "https://.cloud.databricks.com/api/2.0/unity-catalog/metastores//systemschemas/system.access/enable" 14 | 15 | -- 
COMMAND ---------- 16 | 17 | GRANT SELECT ON TABLE system.access.audit TO analysts 18 | 19 | -- COMMAND ---------- 20 | 21 | REVOKE SELECT ON TABLE system.billing.usage FROM developers 22 | 23 | -- COMMAND ---------- 24 | 25 | SELECT 26 | user_identity.email as user_id, 27 | COUNT(*) AS event_count 28 | FROM 29 | system.access.audit 30 | WHERE 31 | event_time >= current_date - interval 30 days 32 | GROUP BY 33 | user_id 34 | ORDER BY 35 | event_count DESC 36 | LIMIT 37 | 10 38 | 39 | -- COMMAND ---------- 40 | 41 | 42 | 43 | -- COMMAND ---------- 44 | 45 | SELECT 46 | b.sku_name, 47 | SUM(b.usage_quantity) AS usage_hours, 48 | SUM(b.usage_quantity * p.pricing.default) AS cost 49 | FROM 50 | system.billing.usage AS b 51 | JOIN system.billing.list_prices AS p ON b.sku_name = p.sku_name 52 | AND b.usage_date BETWEEN p.price_start_time 53 | AND coalesce(p.price_end_time, current_timestamp()) 54 | WHERE 55 | b.usage_start_time >= date_trunc('month', current_date) - interval 1 month 56 | AND b.usage_start_time < date_trunc('month', current_date) 57 | GROUP BY 58 | b.sku_name 59 | ORDER BY 60 | cost DESC 61 | 62 | -- COMMAND ---------- 63 | 64 | SELECT 65 | source_table_full_name, 66 | target_table_full_name, 67 | event_time, 68 | created_by, 69 | entity_type, 70 | entity_id, 71 | entity_run_id 72 | FROM 73 | system.access.table_lineage 74 | WHERE 75 | target_table_full_name = 'de_book.credit_card.usa_customers' 76 | ORDER BY 77 | event_time DESC 78 | -------------------------------------------------------------------------------- /Chapter11/11.1 connect_to_git_repo.py: -------------------------------------------------------------------------------- 1 | # Databricks notebook source 2 | 3 | -------------------------------------------------------------------------------- /Chapter11/11.3 using_databricks_sdk.py: -------------------------------------------------------------------------------- 1 | from databricks.sdk import WorkspaceClient 2 | 3 | w = WorkspaceClient() 4 | 5 | # List the existing clusters 6 | clusters = w.clusters.list() 7 | 8 | # Loop through the clusters and print their names and states 9 | for cluster in clusters: 10 | print(f"Cluster name: {cluster.cluster_name} - {cluster.state}") -------------------------------------------------------------------------------- /Chapter11/11.4 databricks_vscode_extension.py: -------------------------------------------------------------------------------- 1 | from pyspark.sql import SparkSession 2 | from pyspark.sql.functions import * 3 | from pyspark.sql.types import * 4 | # Create a new SparkSession 5 | spark = (SparkSession 6 | .builder 7 | .appName("optimize-data-shuffles") 8 | .master("spark://spark-master:7077") 9 | .config("spark.executor.memory", "512m") 10 | .getOrCreate()) 11 | 12 | # Create some sample data frames 13 | # A large data frame with 1 million rows and two columns: id and value 14 | large_df = (spark.range(0, 1000000) 15 | .withColumn("date", date_sub(current_date(), (rand() * 365).cast("int"))) 16 | .withColumn("age", (rand() * 100).cast("int")) 17 | .withColumn("salary", 100*(rand() * 100).cast("int")) 18 | .withColumn("gender", when((rand() * 2).cast("int") == 0, "M").otherwise("F")) 19 | .withColumn("grade", 20 | when((rand() * 5).cast("int") == 0, "IC") 21 | .when((rand() * 5).cast("int") == 1, "IC-2") 22 | .when((rand() * 5).cast("int") == 2, "M1") 23 | .when((rand() * 5).cast("int") == 3, "M2") 24 | .when((rand() * 5).cast("int") == 4, "IC-3") 25 | .otherwise("M3"))) 26 | large_df.show(5) 
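# A minimal follow-up sketch: a small aggregation (which triggers a shuffle) on the
# sample DataFrame built above, useful for confirming end to end that the SparkSession
# created above works. It assumes only the columns defined above (grade, salary) and
# standard PySpark functions; it is not part of the original recipe code.
from pyspark.sql.functions import avg, count

summary_df = (large_df
              .groupBy("grade")
              .agg(count("*").alias("rows"),
                   avg("salary").alias("avg_salary"))
              .orderBy("grade"))
summary_df.show()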
-------------------------------------------------------------------------------- /Chapter11/11.5_databricks_asset_bundles/dabs_cicd_example/.github/workflows/deploy_to_prod_CD.yml: -------------------------------------------------------------------------------- 1 | # This workflow validates, deploys, and runs the specified bundle 2 | # within a production target named "prod". 3 | name: "Production deployment" 4 | 5 | # Ensure that only a single job or workflow using the same concurrency group 6 | # runs at a time. 7 | concurrency: 1 8 | 9 | # Trigger this workflow whenever a commit is pushed to the repo's 10 | # main branch (for example, when a pull request is merged). 11 | on: 12 | push: 13 | branches: 14 | - main 15 | 16 | jobs: 17 | deploy: 18 | name: "Deploy bundle" 19 | runs-on: ubuntu-latest 20 | 21 | steps: 22 | # Check out this repo, so that this workflow can access it. 23 | - uses: actions/checkout@v3 24 | 25 | # Download the Databricks CLI. 26 | # See https://github.com/databricks/setup-cli 27 | - uses: databricks/setup-cli@main 28 | 29 | # Deploy the bundle to the "prod" target as defined 30 | # in the bundle's settings file. 31 | - run: databricks bundle deploy 32 | working-directory: . 33 | env: 34 | DATABRICKS_TOKEN: ${{secrets.DATABRICKS_TOKEN}} 35 | DATABRICKS_BUNDLE_ENV: prod 36 | 37 | # Validate, deploy, and then run the bundle. 38 | pipeline_update: 39 | name: "Run Job" 40 | runs-on: ubuntu-latest 41 | 42 | # Run the "deploy" job first. 43 | needs: 44 | - deploy 45 | 46 | steps: 47 | # Check out this repo, so that this workflow can access it. 48 | - uses: actions/checkout@v3 49 | 50 | # Use the downloaded Databricks CLI. 51 | - uses: databricks/setup-cli@main 52 | 53 | # Run the Databricks workflow named "dabs_cicd_example_job" as defined in the 54 | # bundle that was just deployed. 55 | - run: databricks bundle run dabs_cicd_example_job --refresh-all 56 | working-directory: . 57 | env: 58 | DATABRICKS_TOKEN: ${{secrets.DATABRICKS_TOKEN}} 59 | DATABRICKS_BUNDLE_ENV: prod 60 | -------------------------------------------------------------------------------- /Chapter11/11.5_databricks_asset_bundles/dabs_cicd_example/.github/workflows/deploy_to_qa_CI.yml: -------------------------------------------------------------------------------- 1 | # This workflow validates, deploys, and runs the specified bundle 2 | # within a pre-production target named "qa". 3 | name: "QA deployment" 4 | 5 | # Ensure that only a single job or workflow using the same concurrency group 6 | # runs at a time. 7 | concurrency: 1 8 | 9 | # Trigger this workflow whenever a pull request is opened against the repo's 10 | # main branch or an existing pull request's head branch is updated. 11 | on: 12 | pull_request: 13 | types: 14 | - opened 15 | - synchronize 16 | 17 | jobs: 18 | # Used by the "pipeline_update" job to deploy the bundle. 19 | # Bundle validation is automatically performed as part of this deployment. 20 | # If validation fails, this workflow fails. 21 | deploy: 22 | name: "Deploy bundle" 23 | runs-on: ubuntu-latest 24 | 25 | steps: 26 | # Check out this repo, so that this workflow can access it. 27 | - uses: actions/checkout@v3 28 | 29 | # Download the Databricks CLI. 30 | # See https://github.com/databricks/setup-cli 31 | - uses: databricks/setup-cli@main 32 | 33 | # Deploy the bundle to the "qa" target as defined 34 | # in the bundle's settings file. 35 | - run: databricks bundle deploy 36 | working-directory: . 
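        # NOTE: DATABRICKS_TOKEN is expected to be provided as a GitHub Actions repository
        # secret (for example, a workspace personal access token or a service principal
        # token); without it, the Databricks CLI cannot authenticate in this step.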
37 | env: 38 | DATABRICKS_TOKEN: ${{secrets.DATABRICKS_TOKEN}} 39 | DATABRICKS_BUNDLE_ENV: dev 40 | 41 | # Validate, deploy, and then run the bundle. 42 | pipeline_update: 43 | name: "Run Job" 44 | runs-on: ubuntu-latest 45 | 46 | # Run the "deploy" job first. 47 | needs: 48 | - deploy 49 | 50 | steps: 51 | # Check out this repo, so that this workflow can access it. 52 | - uses: actions/checkout@v3 53 | 54 | # Use the downloaded Databricks CLI. 55 | - uses: databricks/setup-cli@main 56 | 57 | # Run the Databricks workflow named "dabs_cicd_example_job" as defined in the 58 | # bundle that was just deployed. 59 | - run: databricks bundle run dabs_cicd_example_job --refresh-all 60 | working-directory: . 61 | env: 62 | DATABRICKS_TOKEN: ${{secrets.DATABRICKS_TOKEN}} 63 | DATABRICKS_BUNDLE_ENV: dev 64 | -------------------------------------------------------------------------------- /Chapter11/11.5_databricks_asset_bundles/dabs_cicd_example/README.md: -------------------------------------------------------------------------------- 1 | # dabs_cicd_example 2 | 3 | The 'dabs_cicd_example' project was generated by using the default-python template. 4 | 5 | ## Getting started 6 | 7 | 1. Install the Databricks CLI from https://docs.databricks.com/dev-tools/cli/databricks-cli.html 8 | 9 | 2. Authenticate to your Databricks workspace: 10 | ``` 11 | $ databricks configure 12 | ``` 13 | 14 | 3. To deploy a development copy of this project, type: 15 | ``` 16 | $ databricks bundle deploy --target dev 17 | ``` 18 | (Note that "dev" is the default target, so the `--target` parameter 19 | is optional here.) 20 | 21 | This deploys everything that's defined for this project. 22 | For example, the default template would deploy a job called 23 | `[dev yourname] dabs_cicd_example_job` to your workspace. 24 | You can find that job by opening your workspace and clicking on **Workflows**. 25 | 26 | 4. Similarly, to deploy a production copy, type: 27 | ``` 28 | $ databricks bundle deploy --target prod 29 | ``` 30 | 31 | 5. To run a job or pipeline, use the "run" command: 32 | ``` 33 | $ databricks bundle run 34 | ``` 35 | 36 | 6. Optionally, install developer tools such as the Databricks extension for Visual Studio Code from 37 | https://docs.databricks.com/dev-tools/vscode-ext.html. 38 | 39 | 7. For documentation on the Databricks asset bundles format used 40 | for this project, and for CI/CD configuration, see 41 | https://docs.databricks.com/dev-tools/bundles/index.html. 42 | -------------------------------------------------------------------------------- /Chapter11/11.5_databricks_asset_bundles/dabs_cicd_example/databricks.yml: -------------------------------------------------------------------------------- 1 | # This is a Databricks asset bundle definition for dabs_cicd_example. 2 | # See https://docs.databricks.com/dev-tools/bundles/index.html for documentation. 3 | bundle: 4 | name: dabs_cicd_example 5 | 6 | include: 7 | - resources/*.yml 8 | 9 | targets: 10 | # The 'dev' target, used for development purposes. 11 | # Whenever a developer deploys using 'dev', they get their own copy. 12 | dev: 13 | # We use 'mode: development' to make sure everything deployed to this target gets a prefix 14 | # like '[dev my_user_name]'. Setting this mode also disables any schedules and 15 | # automatic triggers for jobs and enables the 'development' mode for Delta Live Tables pipelines. 
16 | mode: development 17 | default: true 18 | workspace: 19 | host: https://adb-7934447987890817.7.azuredatabricks.net/ 20 | 21 | # Optionally, there could be a 'staging' target here. 22 | # (See Databricks docs on CI/CD at https://docs.databricks.com/dev-tools/bundles/index.html.) 23 | # 24 | # staging: 25 | # workspace: 26 | # host: https://adb-7934447987890817.7.azuredatabricks.net/ 27 | 28 | # The 'prod' target, used for production deployment. 29 | prod: 30 | # For production deployments, we only have a single copy, so we override the 31 | # workspace.root_path default of 32 | # /Users/${workspace.current_user.userName}/.bundle/${bundle.target}/${bundle.name} 33 | # to a path that is not specific to the current user. 34 | mode: production 35 | workspace: 36 | host: https://adb-7934447987890817.7.azuredatabricks.net/ 37 | root_path: /Shared/.bundle/prod/${bundle.name} 38 | run_as: 39 | # This runs as pulkit.chadha.packt@gmail.com in production. Alternatively, 40 | # a service principal could be used here using service_principal_name 41 | # (see Databricks documentation). 42 | user_name: pulkit.chadha.packt@gmail.com 43 | -------------------------------------------------------------------------------- /Chapter11/11.5_databricks_asset_bundles/dabs_cicd_example/fixtures/.gitkeep: -------------------------------------------------------------------------------- 1 | # Fixtures 2 | 3 | This folder is reserved for fixtures, such as CSV files. 4 | 5 | Below is an example of how to load fixtures as a data frame: 6 | 7 | ``` 8 | import pandas as pd 9 | import os 10 | 11 | def get_absolute_path(*relative_parts): 12 | if 'dbutils' in globals(): 13 | base_dir = os.path.dirname(dbutils.notebook.entry_point.getDbutils().notebook().getContext().notebookPath().get()) # type: ignore 14 | path = os.path.normpath(os.path.join(base_dir, *relative_parts)) 15 | return path if path.startswith("/Workspace") else "/Workspace" + path 16 | else: 17 | return os.path.join(*relative_parts) 18 | 19 | csv_file = get_absolute_path("..", "fixtures", "mycsv.csv") 20 | df = pd.read_csv(csv_file) 21 | display(df) 22 | ``` 23 | -------------------------------------------------------------------------------- /Chapter11/11.5_databricks_asset_bundles/dabs_cicd_example/requirements-dev.txt: -------------------------------------------------------------------------------- 1 | ## requirements-dev.txt: dependencies for local development. 2 | ## 3 | ## For defining dependencies used by jobs in Databricks Workflows, see 4 | ## https://docs.databricks.com/dev-tools/bundles/library-dependencies.html 5 | 6 | ## pytest is the default package used for testing 7 | pytest 8 | 9 | ## databricks-connect can be used to run parts of this project locally. 10 | ## See https://docs.databricks.com/dev-tools/databricks-connect.html. 11 | ## 12 | ## databricks-connect is automatically installed if you're using Databricks 13 | ## extension for Visual Studio Code 14 | ## (https://docs.databricks.com/dev-tools/vscode-ext/dev-tasks/databricks-connect.html). 15 | ## 16 | ## To manually install databricks-connect, either follow the instructions 17 | ## at https://docs.databricks.com/dev-tools/databricks-connect.html 18 | ## to install the package system-wide. Or uncomment the line below to install a 19 | ## version of db-connect that corresponds to the Databricks Runtime version used 20 | ## for this project. 
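## A minimal usage sketch (assuming databricks-connect 13.x or later is installed and a
## Databricks authentication profile is configured); from a local script or test, a remote
## Spark session can be created roughly like this:
##
##   from databricks.connect import DatabricksSession
##   spark = DatabricksSession.builder.getOrCreate()
##   spark.range(5).show()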
21 | # 22 | # databricks-connect>=13.3,<13.4 23 | -------------------------------------------------------------------------------- /Chapter11/11.5_databricks_asset_bundles/dabs_cicd_example/resources/dabs_cicd_example_job.yml: -------------------------------------------------------------------------------- 1 | # The main job for dabs_cicd_example 2 | resources: 3 | jobs: 4 | dabs_cicd_example_job: 5 | name: dabs_cicd_example_job 6 | 7 | schedule: 8 | quartz_cron_expression: '44 37 8 * * ?' 9 | timezone_id: Europe/Amsterdam 10 | 11 | email_notifications: 12 | on_failure: 13 | - pulkit.chadha.packt@gmail.com 14 | 15 | tasks: 16 | - task_key: notebook_task 17 | job_cluster_key: job_cluster 18 | notebook_task: 19 | notebook_path: ../src/notebook.ipynb 20 | 21 | - task_key: refresh_pipeline 22 | depends_on: 23 | - task_key: notebook_task 24 | pipeline_task: 25 | pipeline_id: ${resources.pipelines.dabs_cicd_example_pipeline.id} 26 | 27 | job_clusters: 28 | - job_cluster_key: job_cluster 29 | new_cluster: 30 | spark_version: 13.3.x-scala2.12 31 | node_type_id: i3.xlarge 32 | autoscale: 33 | min_workers: 1 34 | max_workers: 4 35 | -------------------------------------------------------------------------------- /Chapter11/11.5_databricks_asset_bundles/dabs_cicd_example/resources/dabs_cicd_example_pipeline.yml: -------------------------------------------------------------------------------- 1 | # The main pipeline for dabs_cicd_example 2 | resources: 3 | pipelines: 4 | dabs_cicd_example_pipeline: 5 | name: dabs_cicd_example_pipeline 6 | target: dabs_cicd_example_${bundle.environment} 7 | libraries: 8 | - notebook: 9 | path: ../src/dlt_pipeline.ipynb 10 | 11 | configuration: 12 | bundle.sourcePath: /Workspace/${workspace.file_path}/src 13 | -------------------------------------------------------------------------------- /Chapter11/11.5_databricks_asset_bundles/dabs_cicd_example/scratch/README.md: -------------------------------------------------------------------------------- 1 | # scratch 2 | 3 | This folder is reserved for personal, exploratory notebooks. 4 | By default these are not committed to Git, as 'scratch' is listed in .gitignore. 
5 | -------------------------------------------------------------------------------- /Chapter11/11.5_databricks_asset_bundles/dabs_cicd_example/scratch/exploration.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "metadata": { 7 | "application/vnd.databricks.v1+cell": { 8 | "cellMetadata": { 9 | "byteLimit": 2048000, 10 | "rowLimit": 10000 11 | }, 12 | "inputWidgets": {}, 13 | "nuid": "6bca260b-13d1-448f-8082-30b60a85c9ae", 14 | "showTitle": false, 15 | "title": "" 16 | } 17 | }, 18 | "outputs": [], 19 | "source": [ 20 | "spark.range(10)" 21 | ] 22 | } 23 | ], 24 | "metadata": { 25 | "application/vnd.databricks.v1+notebook": { 26 | "dashboards": [], 27 | "language": "python", 28 | "notebookMetadata": { 29 | "pythonIndentUnit": 2 30 | }, 31 | "notebookName": "ipynb-notebook", 32 | "widgets": {} 33 | }, 34 | "kernelspec": { 35 | "display_name": "Python 3", 36 | "language": "python", 37 | "name": "python3" 38 | }, 39 | "language_info": { 40 | "name": "python", 41 | "version": "3.11.4" 42 | } 43 | }, 44 | "nbformat": 4, 45 | "nbformat_minor": 0 46 | } 47 | -------------------------------------------------------------------------------- /Chapter11/11.5_databricks_asset_bundles/dabs_cicd_example/src/dlt_pipeline.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": { 6 | "application/vnd.databricks.v1+cell": { 7 | "cellMetadata": {}, 8 | "inputWidgets": {}, 9 | "nuid": "9a626959-61c8-4bba-84d2-2a4ecab1f7ec", 10 | "showTitle": false, 11 | "title": "" 12 | } 13 | }, 14 | "source": [ 15 | "# DLT pipeline\n", 16 | "\n", 17 | "This Delta Live Tables (DLT) definition is executed using a pipeline defined in resources/dabs_cicd_example_pipeline.yml." 
18 | ] 19 | }, 20 | { 21 | "cell_type": "code", 22 | "execution_count": 0, 23 | "metadata": { 24 | "application/vnd.databricks.v1+cell": { 25 | "cellMetadata": {}, 26 | "inputWidgets": {}, 27 | "nuid": "9198e987-5606-403d-9f6d-8f14e6a4017f", 28 | "showTitle": false, 29 | "title": "" 30 | } 31 | }, 32 | "outputs": [], 33 | "source": [ 34 | "import dlt\n", 35 | "from pyspark.sql.functions import expr\n", 36 | "from pyspark.sql import SparkSession\n", 37 | "spark = SparkSession.builder.getOrCreate()" 38 | ] 39 | }, 40 | { 41 | "cell_type": "code", 42 | "execution_count": 0, 43 | "metadata": { 44 | "application/vnd.databricks.v1+cell": { 45 | "cellMetadata": {}, 46 | "inputWidgets": {}, 47 | "nuid": "3fc19dba-61fd-4a89-8f8c-24fee63bfb14", 48 | "showTitle": false, 49 | "title": "" 50 | } 51 | }, 52 | "outputs": [], 53 | "source": [ 54 | "\n", 55 | "@dlt.view\n", 56 | "def taxi_raw():\n", 57 | " return spark.read.format(\"json\").load(\"/databricks-datasets/nyctaxi/sample/json/\")\n", 58 | "\n", 59 | "@dlt.table\n", 60 | "def filtered_taxis():\n", 61 | " return dlt.read(\"taxi_raw\").filter(expr(\"fare_amount < 30\"))" 62 | ] 63 | } 64 | ], 65 | "metadata": { 66 | "application/vnd.databricks.v1+notebook": { 67 | "dashboards": [], 68 | "language": "python", 69 | "notebookMetadata": { 70 | "pythonIndentUnit": 2 71 | }, 72 | "notebookName": "dlt_pipeline", 73 | "widgets": {} 74 | }, 75 | "kernelspec": { 76 | "display_name": "Python 3", 77 | "language": "python", 78 | "name": "python3" 79 | }, 80 | "language_info": { 81 | "name": "python", 82 | "version": "3.11.4" 83 | } 84 | }, 85 | "nbformat": 4, 86 | "nbformat_minor": 0 87 | } 88 | -------------------------------------------------------------------------------- /Chapter11/11.5_databricks_asset_bundles/dabs_cicd_example/src/notebook.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": { 6 | "application/vnd.databricks.v1+cell": { 7 | "cellMetadata": {}, 8 | "inputWidgets": {}, 9 | "nuid": "ee353e42-ff58-4955-9608-12865bd0950e", 10 | "showTitle": false, 11 | "title": "" 12 | } 13 | }, 14 | "source": [ 15 | "# Default notebook\n", 16 | "\n", 17 | "This default notebook is executed using Databricks Workflows as defined in resources/dabs_cicd_example_job.yml." 
18 | ] 19 | }, 20 | { 21 | "cell_type": "code", 22 | "execution_count": 0, 23 | "metadata": { 24 | "application/vnd.databricks.v1+cell": { 25 | "cellMetadata": { 26 | "byteLimit": 2048000, 27 | "rowLimit": 10000 28 | }, 29 | "inputWidgets": {}, 30 | "nuid": "6bca260b-13d1-448f-8082-30b60a85c9ae", 31 | "showTitle": false, 32 | "title": "" 33 | } 34 | }, 35 | "outputs": [], 36 | "source": [ 37 | "spark.range(10)" 38 | ] 39 | } 40 | ], 41 | "metadata": { 42 | "application/vnd.databricks.v1+notebook": { 43 | "dashboards": [], 44 | "language": "python", 45 | "notebookMetadata": { 46 | "pythonIndentUnit": 2 47 | }, 48 | "notebookName": "notebook", 49 | "widgets": {} 50 | }, 51 | "kernelspec": { 52 | "display_name": "Python 3", 53 | "language": "python", 54 | "name": "python3" 55 | }, 56 | "language_info": { 57 | "name": "python", 58 | "version": "3.11.4" 59 | } 60 | }, 61 | "nbformat": 4, 62 | "nbformat_minor": 0 63 | } 64 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2023 Packt 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Data Engineering with Databricks Cookbook 2 | 3 | no-image 4 | 5 | This is the code repository for [Data Engineering with Databricks Cookbook](https://www.packtpub.com/product/data-engineering-with-databricks-cookbook/9781837633357), published by Packt. 6 | 7 | **Build effective data and AI solutions using Apache Spark, Databricks, and Delta Lake** 8 | 9 | ## What is this book about? 10 | This book shows you how to use Apache Spark, Delta Lake, and Databricks to build data pipelines, manage and transform data, optimize performance, and more. Additionally, you’ll implement DataOps and DevOps practices, and orchestrate data workflows. 
11 | 12 | This book covers the following exciting features: 13 | * Perform data loading, ingestion, and processing with Apache Spark 14 | * Discover data transformation techniques and custom user-defined functions (UDFs) in Apache Spark 15 | * Manage and optimize Delta tables with Apache Spark and Delta Lake APIs 16 | * Use Spark Structured Streaming for real-time data processing 17 | * Optimize Apache Spark application and Delta table query performance 18 | * Implement DataOps and DevOps practices on Databricks 19 | * Orchestrate data pipelines with Delta Live Tables and Databricks Workflows 20 | * Implement data governance policies with Unity Catalog 21 | 22 | If you feel this book is for you, get your [copy](https://www.amazon.com/Engineering-Apache-Spark-Delta-Cookbook/dp/1837633355) today! 23 | 24 | https://www.packtpub.com/ 26 | 27 | ## Instructions and Navigations 28 | All of the code is organized into folders. For example, Chapter01. 29 | 30 | The code will look like the following: 31 | ``` 32 | from pyspark.sql import SparkSession 33 | 34 | spark = (SparkSession.builder 35 | .appName("read-csv-data") 36 | .master("spark://spark-master:7077") 37 | .config("spark.executor.memory", "512m") 38 | .getOrCreate()) 39 | 40 | spark.sparkContext.setLogLevel("ERROR") 41 | ``` 42 | 43 | **Following is what you need for this book:** 44 | This book is for data engineers, data scientists, and data practitioners who want to learn how to build efficient and scalable data pipelines using Apache Spark, Delta Lake, and Databricks. To get the most out of this book, you should have basic knowledge of data architecture, SQL, and Python programming. 45 | 46 | With the following software and hardware list you can run all code files present in the book (Chapter 1-11). 47 | ### Software and Hardware List 48 | | Chapter | Software required | OS required | 49 | | -------- | ------------------------------------ | ----------------------------------- | 50 | | 1-11 | Docker Engine version 18.02.0+ | Windows, Mac OS X, and Linux (any) | 51 | | 1-11 | Docker Compose version 1.25.5+ | Windows, Mac OS X, and Linux (any) | 52 | | 1-11 | Docker Desktop | Windows, Mac OS X, and Linux (any) | 53 | | 1-11 | Git | Windows, Mac OS X, and Linux (any) | 54 | 55 | ### Related products 56 | * Business Intelligence with Databricks SQL [[Packt]](https://www.packtpub.com/product/business-intelligence-with-databricks-sql/9781803235332) [[Amazon]](https://www.amazon.com/Business-Intelligence-Databricks-SQL-intelligence/dp/1803235330/ref=sr_1_1?crid=1QYCAOZP9E3NH&dib=eyJ2IjoiMSJ9.nKZ7dRFPdDZyRvWwKM_NiTSZyweCLZ8g9JdktemcYzaWNiGWg9PuoxY2yb2jogGyK8hgRliKebDQfdHu2rRnTZTWZbsWOJAN33k65RFkAgdFX-csS8HgTFfjZj-SFKLpp4FC6LHwQvWr9Nq6f5x6eg.jh99qre-Hl4OHA9rypXLmSGsQp4exBvaZ2xUOPDQ0mM&dib_tag=se&keywords=Business+Intelligence+with+Databricks+SQL&qid=1718173191&s=books&sprefix=business+intelligence+with+databricks+sql%2Cstripbooks-intl-ship%2C553&sr=1-1) 57 | 58 | * Optimizing Databricks Workloads [[Packt]](https://www.packtpub.com/product/optimizing-databricks-workloads/9781801819077) [[Amazon]](https://www.amazon.com/Optimizing-Databricks-Workloads-performance-workloads/dp/1801819076/ref=tmm_pap_swatch_0?_encoding=UTF8&dib_tag=se&dib=eyJ2IjoiMSJ9.cskfrEglx5gEbJF-FnhxlA.rCtKm1bO6Fi1mXUpq1Oai0kjAhGseGT2cCZ2Ccgxaak&qid=1718173341&sr=1-1) 59 | 60 | ## Get to Know the Author 61 | **Pulkit Chadha** 62 | is a seasoned technologist with over 15 years of experience in data engineering. 
His proficiency in crafting and refining data pipelines has been instrumental in driving success across diverse sectors such as healthcare, media and entertainment, hi-tech, and manufacturing. Pulkit’s tailored data engineering solutions are designed to address the unique challenges and aspirations of each enterprise he collaborates with. 63 | 64 | -------------------------------------------------------------------------------- /build.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # 3 | # -- Build Apache Spark Standalone Cluster Docker Images 4 | 5 | # ---------------------------------------------------------------------------------------------------------------------- 6 | # -- Variables --------------------------------------------------------------------------------------------------------- 7 | # ---------------------------------------------------------------------------------------------------------------------- 8 | 9 | BUILD_DATE="$(date -u +'%Y-%m-%d')" 10 | SPARK_VERSION="3.4.1" 11 | HADOOP_VERSION="3" 12 | DELTA_SPARK_VERSION="2.4.0" 13 | DELTALAKE_VERSION="0.10.0" 14 | JUPYTERLAB_VERSION="4.0.2" 15 | PANDAS_VERSION="2.0.1" 16 | DELTA_PACKAGE_VERSION="delta-core_2.12:2.4.0" 17 | SPARK_VERSION_MAJOR=${SPARK_VERSION:0:1} 18 | SPARK_XML_PACKAGE_VERSION="spark-xml_2.12:0.16.0" 19 | SPARKSQL_MAGIC_VERSION="0.0.3" 20 | KAFKA_PYTHON_VERSION="2.0.2" 21 | 22 | # ---------------------------------------------------------------------------------------------------------------------- 23 | # -- Functions---------------------------------------------------------------------------------------------------------- 24 | # ---------------------------------------------------------------------------------------------------------------------- 25 | 26 | function cleanContainers() { 27 | 28 | container="$(docker ps -a | grep 'jupyterlab' | awk '{print $1}')" 29 | docker stop "${container}" 30 | docker rm "${container}" 31 | 32 | container="$(docker ps -a | grep 'spark-worker' -m 1 | awk '{print $1}')" 33 | while [ -n "${container}" ]; 34 | do 35 | docker stop "${container}" 36 | docker rm "${container}" 37 | container="$(docker ps -a | grep 'spark-worker' -m 1 | awk '{print $1}')" 38 | done 39 | 40 | container="$(docker ps -a | grep 'spark-master' | awk '{print $1}')" 41 | docker stop "${container}" 42 | docker rm "${container}" 43 | 44 | container="$(docker ps -a | grep 'spark-base' | awk '{print $1}')" 45 | docker stop "${container}" 46 | docker rm "${container}" 47 | 48 | container="$(docker ps -a | grep 'base' | awk '{print $1}')" 49 | docker stop "${container}" 50 | docker rm "${container}" 51 | 52 | } 53 | 54 | function cleanImages() { 55 | 56 | docker rmi -f "$(docker images | grep -m 1 'jupyterlab' | awk '{print $3}')" 57 | 58 | docker rmi -f "$(docker images | grep -m 1 'spark-worker' | awk '{print $3}')" 59 | docker rmi -f "$(docker images | grep -m 1 'spark-master' | awk '{print $3}')" 60 | docker rmi -f "$(docker images | grep -m 1 'spark-base' | awk '{print $3}')" 61 | 62 | docker rmi -f "$(docker images | grep -m 1 'base' | awk '{print $3}')" 63 | 64 | } 65 | 66 | function cleanVolume() { 67 | docker volume rm "distributed-file-system" 68 | } 69 | 70 | function buildImages() { 71 | 72 | 73 | docker build \ 74 | --build-arg build_date="${BUILD_DATE}" \ 75 | --build-arg scala_version="${SCALA_VERSION}" \ 76 | --build-arg delta_spark_version="${DELTA_SPARK_VERSION}" \ 77 | --build-arg deltalake_version="${DELTALAKE_VERSION}" \ 78 | --build-arg 
pandas_version="${PANDAS_VERSION}" \ 79 | -f docker/base/Dockerfile \ 80 | -t base:latest . 81 | 82 | docker build \ 83 | --build-arg build_date="${BUILD_DATE}" \ 84 | --build-arg scala_version="${SCALA_VERSION}" \ 85 | --build-arg delta_spark_version="${DELTA_SPARK_VERSION}" \ 86 | --build-arg deltalake_version="${DELTALAKE_VERSION}" \ 87 | --build-arg pandas_version="${PANDAS_VERSION}" \ 88 | --build-arg spark_version="${SPARK_VERSION}" \ 89 | --build-arg hadoop_version="${HADOOP_VERSION}" \ 90 | --build-arg delta_package_version="${DELTA_PACKAGE_VERSION}" \ 91 | --build-arg spark_xml_package_version="${SPARK_XML_PACKAGE_VERSION}" \ 92 | -f docker/spark-base/Dockerfile \ 93 | -t spark-base:${SPARK_VERSION} . 94 | 95 | docker build \ 96 | --build-arg build_date="${BUILD_DATE}" \ 97 | --build-arg spark_version="${SPARK_VERSION}" \ 98 | -f docker/spark-master/Dockerfile \ 99 | -t spark-master:${SPARK_VERSION} . 100 | 101 | docker build \ 102 | --build-arg build_date="${BUILD_DATE}" \ 103 | --build-arg spark_version="${SPARK_VERSION}" \ 104 | -f docker/spark-worker/Dockerfile \ 105 | -t spark-worker:${SPARK_VERSION} . 106 | 107 | docker build \ 108 | --build-arg build_date="${BUILD_DATE}" \ 109 | --build-arg scala_version="${SCALA_VERSION}" \ 110 | --build-arg delta_spark_version="${DELTA_SPARK_VERSION}" \ 111 | --build-arg deltalake_version="${DELTALAKE_VERSION}" \ 112 | --build-arg pandas_version="${PANDAS_VERSION}" \ 113 | --build-arg spark_version="${SPARK_VERSION}" \ 114 | --build-arg jupyterlab_version="${JUPYTERLAB_VERSION}" \ 115 | --build-arg sparksql_magic_version="${SPARKSQL_MAGIC_VERSION}" \ 116 | --build-arg kafka_python_version="${KAFKA_PYTHON_VERSION}" \ 117 | -f docker/jupyterlab/Dockerfile \ 118 | -t jupyterlab:${JUPYTERLAB_VERSION}-spark-${SPARK_VERSION} . 
119 | 120 | } 121 | 122 | # ---------------------------------------------------------------------------------------------------------------------- 123 | # -- Main -------------------------------------------------------------------------------------------------------------- 124 | # ---------------------------------------------------------------------------------------------------------------------- 125 | 126 | cleanContainers; 127 | cleanImages; 128 | cleanVolume; 129 | buildImages; 130 | -------------------------------------------------------------------------------- /data/Credit Card/FraudBase.csv: -------------------------------------------------------------------------------- 1 | Transaction_ID,Fraud_Flag 2 | CTID50558449,1 3 | CTID55936882,1 4 | CTID63762180,1 5 | CTID76723439,1 6 | CTID21246201,1 7 | CTID26555772,1 8 | CTID20567160,1 9 | CTID54759604,1 10 | CTID44626561,1 11 | CTID73773088,1 12 | CTID62499873,1 13 | CTID70746134,1 14 | CTID29266043,1 15 | CTID97776833,1 16 | CTID16281374,1 17 | CTID81479835,1 18 | CTID20456246,1 19 | CTID28195227,1 20 | CTID42582298,1 21 | CTID90938173,1 22 | CTID41668436,1 23 | CTID89260233,1 24 | CTID60575167,1 25 | CTID78032595,1 26 | CTID22306254,1 27 | CTID25962688,1 28 | CTID95859840,1 29 | CTID29469747,1 30 | CTID45648244,1 31 | CTID56303514,1 32 | CTID97760025,1 33 | CTID56692687,1 34 | CTID92439353,1 35 | CTID84579664,1 36 | CTID60614984,1 37 | CTID49913734,1 38 | CTID26134025,1 39 | CTID38789752,1 40 | CTID81793385,1 41 | CTID73000031,1 42 | CTID36997643,1 43 | CTID55429304,1 44 | CTID68050021,1 45 | CTID12041601,1 46 | CTID57707566,1 47 | CTID95135812,1 48 | CTID39550827,1 49 | CTID38931999,1 50 | CTID95884307,1 51 | CTID73429087,1 52 | CTID80787923,1 53 | CTID42980186,1 54 | CTID52594996,1 55 | CTID36309710,1 56 | CTID70222721,1 57 | CTID80436326,1 58 | CTID13865418,1 59 | CTID40881434,1 60 | CTID57357592,1 61 | CTID45385078,1 62 | CTID94511089,1 63 | CTID54381577,1 64 | CTID71795114,1 65 | CTID26238474,1 66 | CTID23350804,1 67 | CTID89116114,1 68 | CTID30763806,1 69 | CTID99066676,1 70 | CTID32892929,1 71 | CTID53972836,1 72 | CTID68584964,1 73 | CTID69594649,1 74 | CTID84024131,1 75 | CTID44773525,1 76 | CTID85085771,1 77 | CTID24913963,1 78 | CTID59571587,1 79 | CTID63195033,1 80 | CTID32907279,1 81 | CTID37742156,1 82 | CTID18338743,1 83 | CTID96772424,1 84 | CTID66265146,1 85 | CTID56680308,1 86 | CTID41847490,1 87 | CTID31867370,1 88 | CTID12270763,1 89 | CTID43014391,1 90 | CTID87470159,1 91 | CTID98722314,1 92 | CTID69782227,1 93 | CTID70707358,1 94 | CTID88342446,1 95 | CTID34574410,1 96 | CTID55853142,1 97 | CTID65786114,1 98 | CTID66191168,1 99 | CTID33699337,1 100 | CTID21253563,1 101 | CTID89585938,1 102 | CTID15034243,1 103 | CTID85930060,1 104 | CTID99663510,1 105 | CTID15730669,1 106 | CTID57993591,1 107 | CTID91108283,1 108 | CTID30494187,1 109 | CTID51301522,1 110 | CTID49517337,1 111 | -------------------------------------------------------------------------------- /data/partitioned_recipes/._SUCCESS.crc: -------------------------------------------------------------------------------- 1 | crc -------------------------------------------------------------------------------- /data/partitioned_recipes/DatePublished=2019-01-01/.part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet.crc: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/PacktPublishing/Data-Engineering-with-Databricks-Cookbook/fa1657c3808c0d520ea19a49f967df510b4f627d/data/partitioned_recipes/DatePublished=2019-01-01/.part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet.crc -------------------------------------------------------------------------------- /data/partitioned_recipes/DatePublished=2019-01-01/part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Data-Engineering-with-Databricks-Cookbook/fa1657c3808c0d520ea19a49f967df510b4f627d/data/partitioned_recipes/DatePublished=2019-01-01/part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet -------------------------------------------------------------------------------- /data/partitioned_recipes/DatePublished=2019-01-02/.part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet.crc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Data-Engineering-with-Databricks-Cookbook/fa1657c3808c0d520ea19a49f967df510b4f627d/data/partitioned_recipes/DatePublished=2019-01-02/.part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet.crc -------------------------------------------------------------------------------- /data/partitioned_recipes/DatePublished=2019-01-02/part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Data-Engineering-with-Databricks-Cookbook/fa1657c3808c0d520ea19a49f967df510b4f627d/data/partitioned_recipes/DatePublished=2019-01-02/part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet -------------------------------------------------------------------------------- /data/partitioned_recipes/DatePublished=2019-01-03/.part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet.crc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Data-Engineering-with-Databricks-Cookbook/fa1657c3808c0d520ea19a49f967df510b4f627d/data/partitioned_recipes/DatePublished=2019-01-03/.part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet.crc -------------------------------------------------------------------------------- /data/partitioned_recipes/DatePublished=2019-01-03/part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Data-Engineering-with-Databricks-Cookbook/fa1657c3808c0d520ea19a49f967df510b4f627d/data/partitioned_recipes/DatePublished=2019-01-03/part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet -------------------------------------------------------------------------------- /data/partitioned_recipes/DatePublished=2019-01-04/.part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet.crc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Data-Engineering-with-Databricks-Cookbook/fa1657c3808c0d520ea19a49f967df510b4f627d/data/partitioned_recipes/DatePublished=2019-01-04/.part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet.crc 
-------------------------------------------------------------------------------- /data/partitioned_recipes/DatePublished=2019-01-04/part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Data-Engineering-with-Databricks-Cookbook/fa1657c3808c0d520ea19a49f967df510b4f627d/data/partitioned_recipes/DatePublished=2019-01-04/part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet -------------------------------------------------------------------------------- /data/partitioned_recipes/DatePublished=2019-01-05/.part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet.crc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Data-Engineering-with-Databricks-Cookbook/fa1657c3808c0d520ea19a49f967df510b4f627d/data/partitioned_recipes/DatePublished=2019-01-05/.part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet.crc -------------------------------------------------------------------------------- /data/partitioned_recipes/DatePublished=2019-01-05/part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Data-Engineering-with-Databricks-Cookbook/fa1657c3808c0d520ea19a49f967df510b4f627d/data/partitioned_recipes/DatePublished=2019-01-05/part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet -------------------------------------------------------------------------------- /data/partitioned_recipes/DatePublished=2019-01-06/.part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet.crc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Data-Engineering-with-Databricks-Cookbook/fa1657c3808c0d520ea19a49f967df510b4f627d/data/partitioned_recipes/DatePublished=2019-01-06/.part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet.crc -------------------------------------------------------------------------------- /data/partitioned_recipes/DatePublished=2019-01-06/part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Data-Engineering-with-Databricks-Cookbook/fa1657c3808c0d520ea19a49f967df510b4f627d/data/partitioned_recipes/DatePublished=2019-01-06/part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet -------------------------------------------------------------------------------- /data/partitioned_recipes/DatePublished=2019-01-07/.part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet.crc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Data-Engineering-with-Databricks-Cookbook/fa1657c3808c0d520ea19a49f967df510b4f627d/data/partitioned_recipes/DatePublished=2019-01-07/.part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet.crc -------------------------------------------------------------------------------- /data/partitioned_recipes/DatePublished=2019-01-07/part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/PacktPublishing/Data-Engineering-with-Databricks-Cookbook/fa1657c3808c0d520ea19a49f967df510b4f627d/data/partitioned_recipes/DatePublished=2019-01-07/part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet -------------------------------------------------------------------------------- /data/partitioned_recipes/DatePublished=2019-01-08/.part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet.crc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Data-Engineering-with-Databricks-Cookbook/fa1657c3808c0d520ea19a49f967df510b4f627d/data/partitioned_recipes/DatePublished=2019-01-08/.part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet.crc -------------------------------------------------------------------------------- /data/partitioned_recipes/DatePublished=2019-01-08/part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Data-Engineering-with-Databricks-Cookbook/fa1657c3808c0d520ea19a49f967df510b4f627d/data/partitioned_recipes/DatePublished=2019-01-08/part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet -------------------------------------------------------------------------------- /data/partitioned_recipes/DatePublished=2019-01-09/.part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet.crc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Data-Engineering-with-Databricks-Cookbook/fa1657c3808c0d520ea19a49f967df510b4f627d/data/partitioned_recipes/DatePublished=2019-01-09/.part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet.crc -------------------------------------------------------------------------------- /data/partitioned_recipes/DatePublished=2019-01-09/part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Data-Engineering-with-Databricks-Cookbook/fa1657c3808c0d520ea19a49f967df510b4f627d/data/partitioned_recipes/DatePublished=2019-01-09/part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet -------------------------------------------------------------------------------- /data/partitioned_recipes/DatePublished=2019-01-10/.part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet.crc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Data-Engineering-with-Databricks-Cookbook/fa1657c3808c0d520ea19a49f967df510b4f627d/data/partitioned_recipes/DatePublished=2019-01-10/.part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet.crc -------------------------------------------------------------------------------- /data/partitioned_recipes/DatePublished=2019-01-10/part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Data-Engineering-with-Databricks-Cookbook/fa1657c3808c0d520ea19a49f967df510b4f627d/data/partitioned_recipes/DatePublished=2019-01-10/part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet 
-------------------------------------------------------------------------------- /data/partitioned_recipes/DatePublished=2019-01-11/.part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet.crc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Data-Engineering-with-Databricks-Cookbook/fa1657c3808c0d520ea19a49f967df510b4f627d/data/partitioned_recipes/DatePublished=2019-01-11/.part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet.crc -------------------------------------------------------------------------------- /data/partitioned_recipes/DatePublished=2019-01-11/part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Data-Engineering-with-Databricks-Cookbook/fa1657c3808c0d520ea19a49f967df510b4f627d/data/partitioned_recipes/DatePublished=2019-01-11/part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet -------------------------------------------------------------------------------- /data/partitioned_recipes/DatePublished=2019-01-12/.part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet.crc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Data-Engineering-with-Databricks-Cookbook/fa1657c3808c0d520ea19a49f967df510b4f627d/data/partitioned_recipes/DatePublished=2019-01-12/.part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet.crc -------------------------------------------------------------------------------- /data/partitioned_recipes/DatePublished=2019-01-12/part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Data-Engineering-with-Databricks-Cookbook/fa1657c3808c0d520ea19a49f967df510b4f627d/data/partitioned_recipes/DatePublished=2019-01-12/part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet -------------------------------------------------------------------------------- /data/partitioned_recipes/DatePublished=2019-01-13/.part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet.crc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Data-Engineering-with-Databricks-Cookbook/fa1657c3808c0d520ea19a49f967df510b4f627d/data/partitioned_recipes/DatePublished=2019-01-13/.part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet.crc -------------------------------------------------------------------------------- /data/partitioned_recipes/DatePublished=2019-01-13/part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Data-Engineering-with-Databricks-Cookbook/fa1657c3808c0d520ea19a49f967df510b4f627d/data/partitioned_recipes/DatePublished=2019-01-13/part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet -------------------------------------------------------------------------------- /data/partitioned_recipes/DatePublished=2019-01-14/.part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet.crc: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/PacktPublishing/Data-Engineering-with-Databricks-Cookbook/fa1657c3808c0d520ea19a49f967df510b4f627d/data/partitioned_recipes/DatePublished=2019-01-14/.part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet.crc -------------------------------------------------------------------------------- /data/partitioned_recipes/DatePublished=2019-01-14/part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Data-Engineering-with-Databricks-Cookbook/fa1657c3808c0d520ea19a49f967df510b4f627d/data/partitioned_recipes/DatePublished=2019-01-14/part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet -------------------------------------------------------------------------------- /data/partitioned_recipes/DatePublished=2019-01-15/.part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet.crc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Data-Engineering-with-Databricks-Cookbook/fa1657c3808c0d520ea19a49f967df510b4f627d/data/partitioned_recipes/DatePublished=2019-01-15/.part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet.crc -------------------------------------------------------------------------------- /data/partitioned_recipes/DatePublished=2019-01-15/part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Data-Engineering-with-Databricks-Cookbook/fa1657c3808c0d520ea19a49f967df510b4f627d/data/partitioned_recipes/DatePublished=2019-01-15/part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet -------------------------------------------------------------------------------- /data/partitioned_recipes/DatePublished=2019-01-16/.part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet.crc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Data-Engineering-with-Databricks-Cookbook/fa1657c3808c0d520ea19a49f967df510b4f627d/data/partitioned_recipes/DatePublished=2019-01-16/.part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet.crc -------------------------------------------------------------------------------- /data/partitioned_recipes/DatePublished=2019-01-16/part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Data-Engineering-with-Databricks-Cookbook/fa1657c3808c0d520ea19a49f967df510b4f627d/data/partitioned_recipes/DatePublished=2019-01-16/part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet -------------------------------------------------------------------------------- /data/partitioned_recipes/DatePublished=2019-01-17/.part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet.crc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Data-Engineering-with-Databricks-Cookbook/fa1657c3808c0d520ea19a49f967df510b4f627d/data/partitioned_recipes/DatePublished=2019-01-17/.part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet.crc 
-------------------------------------------------------------------------------- /data/partitioned_recipes/DatePublished=2019-01-17/part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Data-Engineering-with-Databricks-Cookbook/fa1657c3808c0d520ea19a49f967df510b4f627d/data/partitioned_recipes/DatePublished=2019-01-17/part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet -------------------------------------------------------------------------------- /data/partitioned_recipes/DatePublished=2019-01-18/.part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet.crc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Data-Engineering-with-Databricks-Cookbook/fa1657c3808c0d520ea19a49f967df510b4f627d/data/partitioned_recipes/DatePublished=2019-01-18/.part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet.crc -------------------------------------------------------------------------------- /data/partitioned_recipes/DatePublished=2019-01-18/part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Data-Engineering-with-Databricks-Cookbook/fa1657c3808c0d520ea19a49f967df510b4f627d/data/partitioned_recipes/DatePublished=2019-01-18/part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet -------------------------------------------------------------------------------- /data/partitioned_recipes/DatePublished=2019-01-19/.part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet.crc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Data-Engineering-with-Databricks-Cookbook/fa1657c3808c0d520ea19a49f967df510b4f627d/data/partitioned_recipes/DatePublished=2019-01-19/.part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet.crc -------------------------------------------------------------------------------- /data/partitioned_recipes/DatePublished=2019-01-19/part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Data-Engineering-with-Databricks-Cookbook/fa1657c3808c0d520ea19a49f967df510b4f627d/data/partitioned_recipes/DatePublished=2019-01-19/part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet -------------------------------------------------------------------------------- /data/partitioned_recipes/DatePublished=2019-01-20/.part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet.crc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Data-Engineering-with-Databricks-Cookbook/fa1657c3808c0d520ea19a49f967df510b4f627d/data/partitioned_recipes/DatePublished=2019-01-20/.part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet.crc -------------------------------------------------------------------------------- /data/partitioned_recipes/DatePublished=2019-01-20/part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/PacktPublishing/Data-Engineering-with-Databricks-Cookbook/fa1657c3808c0d520ea19a49f967df510b4f627d/data/partitioned_recipes/DatePublished=2019-01-20/part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet -------------------------------------------------------------------------------- /data/partitioned_recipes/DatePublished=2019-01-21/.part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet.crc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Data-Engineering-with-Databricks-Cookbook/fa1657c3808c0d520ea19a49f967df510b4f627d/data/partitioned_recipes/DatePublished=2019-01-21/.part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet.crc -------------------------------------------------------------------------------- /data/partitioned_recipes/DatePublished=2019-01-21/part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Data-Engineering-with-Databricks-Cookbook/fa1657c3808c0d520ea19a49f967df510b4f627d/data/partitioned_recipes/DatePublished=2019-01-21/part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet -------------------------------------------------------------------------------- /data/partitioned_recipes/DatePublished=2019-01-22/.part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet.crc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Data-Engineering-with-Databricks-Cookbook/fa1657c3808c0d520ea19a49f967df510b4f627d/data/partitioned_recipes/DatePublished=2019-01-22/.part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet.crc -------------------------------------------------------------------------------- /data/partitioned_recipes/DatePublished=2019-01-22/part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Data-Engineering-with-Databricks-Cookbook/fa1657c3808c0d520ea19a49f967df510b4f627d/data/partitioned_recipes/DatePublished=2019-01-22/part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet -------------------------------------------------------------------------------- /data/partitioned_recipes/DatePublished=2019-01-23/.part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet.crc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Data-Engineering-with-Databricks-Cookbook/fa1657c3808c0d520ea19a49f967df510b4f627d/data/partitioned_recipes/DatePublished=2019-01-23/.part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet.crc -------------------------------------------------------------------------------- /data/partitioned_recipes/DatePublished=2019-01-23/part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Data-Engineering-with-Databricks-Cookbook/fa1657c3808c0d520ea19a49f967df510b4f627d/data/partitioned_recipes/DatePublished=2019-01-23/part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet 
-------------------------------------------------------------------------------- /data/partitioned_recipes/DatePublished=2019-01-24/.part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet.crc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Data-Engineering-with-Databricks-Cookbook/fa1657c3808c0d520ea19a49f967df510b4f627d/data/partitioned_recipes/DatePublished=2019-01-24/.part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet.crc -------------------------------------------------------------------------------- /data/partitioned_recipes/DatePublished=2019-01-24/part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Data-Engineering-with-Databricks-Cookbook/fa1657c3808c0d520ea19a49f967df510b4f627d/data/partitioned_recipes/DatePublished=2019-01-24/part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet -------------------------------------------------------------------------------- /data/partitioned_recipes/DatePublished=2019-01-25/.part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet.crc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Data-Engineering-with-Databricks-Cookbook/fa1657c3808c0d520ea19a49f967df510b4f627d/data/partitioned_recipes/DatePublished=2019-01-25/.part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet.crc -------------------------------------------------------------------------------- /data/partitioned_recipes/DatePublished=2019-01-25/part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Data-Engineering-with-Databricks-Cookbook/fa1657c3808c0d520ea19a49f967df510b4f627d/data/partitioned_recipes/DatePublished=2019-01-25/part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet -------------------------------------------------------------------------------- /data/partitioned_recipes/DatePublished=2019-01-26/.part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet.crc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Data-Engineering-with-Databricks-Cookbook/fa1657c3808c0d520ea19a49f967df510b4f627d/data/partitioned_recipes/DatePublished=2019-01-26/.part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet.crc -------------------------------------------------------------------------------- /data/partitioned_recipes/DatePublished=2019-01-26/part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Data-Engineering-with-Databricks-Cookbook/fa1657c3808c0d520ea19a49f967df510b4f627d/data/partitioned_recipes/DatePublished=2019-01-26/part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet -------------------------------------------------------------------------------- /data/partitioned_recipes/DatePublished=2019-01-27/.part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet.crc: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/PacktPublishing/Data-Engineering-with-Databricks-Cookbook/fa1657c3808c0d520ea19a49f967df510b4f627d/data/partitioned_recipes/DatePublished=2019-01-27/.part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet.crc -------------------------------------------------------------------------------- /data/partitioned_recipes/DatePublished=2019-01-27/part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Data-Engineering-with-Databricks-Cookbook/fa1657c3808c0d520ea19a49f967df510b4f627d/data/partitioned_recipes/DatePublished=2019-01-27/part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet -------------------------------------------------------------------------------- /data/partitioned_recipes/DatePublished=2019-01-28/.part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet.crc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Data-Engineering-with-Databricks-Cookbook/fa1657c3808c0d520ea19a49f967df510b4f627d/data/partitioned_recipes/DatePublished=2019-01-28/.part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet.crc -------------------------------------------------------------------------------- /data/partitioned_recipes/DatePublished=2019-01-28/part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Data-Engineering-with-Databricks-Cookbook/fa1657c3808c0d520ea19a49f967df510b4f627d/data/partitioned_recipes/DatePublished=2019-01-28/part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet -------------------------------------------------------------------------------- /data/partitioned_recipes/DatePublished=2019-01-29/.part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet.crc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Data-Engineering-with-Databricks-Cookbook/fa1657c3808c0d520ea19a49f967df510b4f627d/data/partitioned_recipes/DatePublished=2019-01-29/.part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet.crc -------------------------------------------------------------------------------- /data/partitioned_recipes/DatePublished=2019-01-29/part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Data-Engineering-with-Databricks-Cookbook/fa1657c3808c0d520ea19a49f967df510b4f627d/data/partitioned_recipes/DatePublished=2019-01-29/part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet -------------------------------------------------------------------------------- /data/partitioned_recipes/DatePublished=2019-01-30/.part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet.crc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Data-Engineering-with-Databricks-Cookbook/fa1657c3808c0d520ea19a49f967df510b4f627d/data/partitioned_recipes/DatePublished=2019-01-30/.part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet.crc 
-------------------------------------------------------------------------------- /data/partitioned_recipes/DatePublished=2019-01-30/part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Data-Engineering-with-Databricks-Cookbook/fa1657c3808c0d520ea19a49f967df510b4f627d/data/partitioned_recipes/DatePublished=2019-01-30/part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet -------------------------------------------------------------------------------- /data/partitioned_recipes/DatePublished=2019-01-31/.part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet.crc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Data-Engineering-with-Databricks-Cookbook/fa1657c3808c0d520ea19a49f967df510b4f627d/data/partitioned_recipes/DatePublished=2019-01-31/.part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet.crc -------------------------------------------------------------------------------- /data/partitioned_recipes/DatePublished=2019-01-31/part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Data-Engineering-with-Databricks-Cookbook/fa1657c3808c0d520ea19a49f967df510b4f627d/data/partitioned_recipes/DatePublished=2019-01-31/part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet -------------------------------------------------------------------------------- /data/partitioned_recipes/DatePublished=2020-01-01/.part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet.crc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Data-Engineering-with-Databricks-Cookbook/fa1657c3808c0d520ea19a49f967df510b4f627d/data/partitioned_recipes/DatePublished=2020-01-01/.part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet.crc -------------------------------------------------------------------------------- /data/partitioned_recipes/DatePublished=2020-01-01/part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Data-Engineering-with-Databricks-Cookbook/fa1657c3808c0d520ea19a49f967df510b4f627d/data/partitioned_recipes/DatePublished=2020-01-01/part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet -------------------------------------------------------------------------------- /data/partitioned_recipes/DatePublished=2020-01-02/.part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet.crc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Data-Engineering-with-Databricks-Cookbook/fa1657c3808c0d520ea19a49f967df510b4f627d/data/partitioned_recipes/DatePublished=2020-01-02/.part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet.crc -------------------------------------------------------------------------------- /data/partitioned_recipes/DatePublished=2020-01-02/part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/PacktPublishing/Data-Engineering-with-Databricks-Cookbook/fa1657c3808c0d520ea19a49f967df510b4f627d/data/partitioned_recipes/DatePublished=2020-01-02/part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet -------------------------------------------------------------------------------- /data/partitioned_recipes/DatePublished=2020-01-03/.part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet.crc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Data-Engineering-with-Databricks-Cookbook/fa1657c3808c0d520ea19a49f967df510b4f627d/data/partitioned_recipes/DatePublished=2020-01-03/.part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet.crc -------------------------------------------------------------------------------- /data/partitioned_recipes/DatePublished=2020-01-03/part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Data-Engineering-with-Databricks-Cookbook/fa1657c3808c0d520ea19a49f967df510b4f627d/data/partitioned_recipes/DatePublished=2020-01-03/part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet -------------------------------------------------------------------------------- /data/partitioned_recipes/DatePublished=2020-01-07/.part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet.crc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Data-Engineering-with-Databricks-Cookbook/fa1657c3808c0d520ea19a49f967df510b4f627d/data/partitioned_recipes/DatePublished=2020-01-07/.part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet.crc -------------------------------------------------------------------------------- /data/partitioned_recipes/DatePublished=2020-01-07/part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Data-Engineering-with-Databricks-Cookbook/fa1657c3808c0d520ea19a49f967df510b4f627d/data/partitioned_recipes/DatePublished=2020-01-07/part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet -------------------------------------------------------------------------------- /data/partitioned_recipes/DatePublished=2020-01-09/.part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet.crc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Data-Engineering-with-Databricks-Cookbook/fa1657c3808c0d520ea19a49f967df510b4f627d/data/partitioned_recipes/DatePublished=2020-01-09/.part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet.crc -------------------------------------------------------------------------------- /data/partitioned_recipes/DatePublished=2020-01-09/part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Data-Engineering-with-Databricks-Cookbook/fa1657c3808c0d520ea19a49f967df510b4f627d/data/partitioned_recipes/DatePublished=2020-01-09/part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet 
-------------------------------------------------------------------------------- /data/partitioned_recipes/DatePublished=2020-01-10/.part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet.crc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Data-Engineering-with-Databricks-Cookbook/fa1657c3808c0d520ea19a49f967df510b4f627d/data/partitioned_recipes/DatePublished=2020-01-10/.part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet.crc -------------------------------------------------------------------------------- /data/partitioned_recipes/DatePublished=2020-01-10/part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Data-Engineering-with-Databricks-Cookbook/fa1657c3808c0d520ea19a49f967df510b4f627d/data/partitioned_recipes/DatePublished=2020-01-10/part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet -------------------------------------------------------------------------------- /data/partitioned_recipes/DatePublished=2020-01-13/.part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet.crc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Data-Engineering-with-Databricks-Cookbook/fa1657c3808c0d520ea19a49f967df510b4f627d/data/partitioned_recipes/DatePublished=2020-01-13/.part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet.crc -------------------------------------------------------------------------------- /data/partitioned_recipes/DatePublished=2020-01-13/part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Data-Engineering-with-Databricks-Cookbook/fa1657c3808c0d520ea19a49f967df510b4f627d/data/partitioned_recipes/DatePublished=2020-01-13/part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet -------------------------------------------------------------------------------- /data/partitioned_recipes/DatePublished=2020-01-14/.part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet.crc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Data-Engineering-with-Databricks-Cookbook/fa1657c3808c0d520ea19a49f967df510b4f627d/data/partitioned_recipes/DatePublished=2020-01-14/.part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet.crc -------------------------------------------------------------------------------- /data/partitioned_recipes/DatePublished=2020-01-14/part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Data-Engineering-with-Databricks-Cookbook/fa1657c3808c0d520ea19a49f967df510b4f627d/data/partitioned_recipes/DatePublished=2020-01-14/part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet -------------------------------------------------------------------------------- /data/partitioned_recipes/DatePublished=2020-01-16/.part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet.crc: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/PacktPublishing/Data-Engineering-with-Databricks-Cookbook/fa1657c3808c0d520ea19a49f967df510b4f627d/data/partitioned_recipes/DatePublished=2020-01-16/.part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet.crc -------------------------------------------------------------------------------- /data/partitioned_recipes/DatePublished=2020-01-16/part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Data-Engineering-with-Databricks-Cookbook/fa1657c3808c0d520ea19a49f967df510b4f627d/data/partitioned_recipes/DatePublished=2020-01-16/part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet -------------------------------------------------------------------------------- /data/partitioned_recipes/DatePublished=2020-01-21/.part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet.crc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Data-Engineering-with-Databricks-Cookbook/fa1657c3808c0d520ea19a49f967df510b4f627d/data/partitioned_recipes/DatePublished=2020-01-21/.part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet.crc -------------------------------------------------------------------------------- /data/partitioned_recipes/DatePublished=2020-01-21/part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Data-Engineering-with-Databricks-Cookbook/fa1657c3808c0d520ea19a49f967df510b4f627d/data/partitioned_recipes/DatePublished=2020-01-21/part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet -------------------------------------------------------------------------------- /data/partitioned_recipes/DatePublished=2020-01-23/.part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet.crc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Data-Engineering-with-Databricks-Cookbook/fa1657c3808c0d520ea19a49f967df510b4f627d/data/partitioned_recipes/DatePublished=2020-01-23/.part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet.crc -------------------------------------------------------------------------------- /data/partitioned_recipes/DatePublished=2020-01-23/part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Data-Engineering-with-Databricks-Cookbook/fa1657c3808c0d520ea19a49f967df510b4f627d/data/partitioned_recipes/DatePublished=2020-01-23/part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet -------------------------------------------------------------------------------- /data/partitioned_recipes/DatePublished=2020-01-28/.part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet.crc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Data-Engineering-with-Databricks-Cookbook/fa1657c3808c0d520ea19a49f967df510b4f627d/data/partitioned_recipes/DatePublished=2020-01-28/.part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet.crc 
-------------------------------------------------------------------------------- /data/partitioned_recipes/DatePublished=2020-01-28/part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Data-Engineering-with-Databricks-Cookbook/fa1657c3808c0d520ea19a49f967df510b4f627d/data/partitioned_recipes/DatePublished=2020-01-28/part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet -------------------------------------------------------------------------------- /data/partitioned_recipes/DatePublished=2020-01-29/.part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet.crc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Data-Engineering-with-Databricks-Cookbook/fa1657c3808c0d520ea19a49f967df510b4f627d/data/partitioned_recipes/DatePublished=2020-01-29/.part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet.crc -------------------------------------------------------------------------------- /data/partitioned_recipes/DatePublished=2020-01-29/part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Data-Engineering-with-Databricks-Cookbook/fa1657c3808c0d520ea19a49f967df510b4f627d/data/partitioned_recipes/DatePublished=2020-01-29/part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet -------------------------------------------------------------------------------- /data/partitioned_recipes/DatePublished=2020-01-30/.part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet.crc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Data-Engineering-with-Databricks-Cookbook/fa1657c3808c0d520ea19a49f967df510b4f627d/data/partitioned_recipes/DatePublished=2020-01-30/.part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet.crc -------------------------------------------------------------------------------- /data/partitioned_recipes/DatePublished=2020-01-30/part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Data-Engineering-with-Databricks-Cookbook/fa1657c3808c0d520ea19a49f967df510b4f627d/data/partitioned_recipes/DatePublished=2020-01-30/part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet -------------------------------------------------------------------------------- /data/partitioned_recipes/DatePublished=2020-01-31/.part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet.crc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Data-Engineering-with-Databricks-Cookbook/fa1657c3808c0d520ea19a49f967df510b4f627d/data/partitioned_recipes/DatePublished=2020-01-31/.part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet.crc -------------------------------------------------------------------------------- /data/partitioned_recipes/DatePublished=2020-01-31/part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/PacktPublishing/Data-Engineering-with-Databricks-Cookbook/fa1657c3808c0d520ea19a49f967df510b4f627d/data/partitioned_recipes/DatePublished=2020-01-31/part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet -------------------------------------------------------------------------------- /data/partitioned_recipes/DatePublished=2020-02-03/.part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet.crc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Data-Engineering-with-Databricks-Cookbook/fa1657c3808c0d520ea19a49f967df510b4f627d/data/partitioned_recipes/DatePublished=2020-02-03/.part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet.crc -------------------------------------------------------------------------------- /data/partitioned_recipes/DatePublished=2020-02-03/part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Data-Engineering-with-Databricks-Cookbook/fa1657c3808c0d520ea19a49f967df510b4f627d/data/partitioned_recipes/DatePublished=2020-02-03/part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet -------------------------------------------------------------------------------- /data/partitioned_recipes/DatePublished=2020-02-04/.part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet.crc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Data-Engineering-with-Databricks-Cookbook/fa1657c3808c0d520ea19a49f967df510b4f627d/data/partitioned_recipes/DatePublished=2020-02-04/.part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet.crc -------------------------------------------------------------------------------- /data/partitioned_recipes/DatePublished=2020-02-04/part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Data-Engineering-with-Databricks-Cookbook/fa1657c3808c0d520ea19a49f967df510b4f627d/data/partitioned_recipes/DatePublished=2020-02-04/part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet -------------------------------------------------------------------------------- /data/partitioned_recipes/DatePublished=2020-02-05/.part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet.crc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Data-Engineering-with-Databricks-Cookbook/fa1657c3808c0d520ea19a49f967df510b4f627d/data/partitioned_recipes/DatePublished=2020-02-05/.part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet.crc -------------------------------------------------------------------------------- /data/partitioned_recipes/DatePublished=2020-02-05/part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Data-Engineering-with-Databricks-Cookbook/fa1657c3808c0d520ea19a49f967df510b4f627d/data/partitioned_recipes/DatePublished=2020-02-05/part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet 
-------------------------------------------------------------------------------- /data/partitioned_recipes/DatePublished=2020-02-06/.part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet.crc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Data-Engineering-with-Databricks-Cookbook/fa1657c3808c0d520ea19a49f967df510b4f627d/data/partitioned_recipes/DatePublished=2020-02-06/.part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet.crc -------------------------------------------------------------------------------- /data/partitioned_recipes/DatePublished=2020-02-06/part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Data-Engineering-with-Databricks-Cookbook/fa1657c3808c0d520ea19a49f967df510b4f627d/data/partitioned_recipes/DatePublished=2020-02-06/part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet -------------------------------------------------------------------------------- /data/partitioned_recipes/DatePublished=2020-02-07/.part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet.crc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Data-Engineering-with-Databricks-Cookbook/fa1657c3808c0d520ea19a49f967df510b4f627d/data/partitioned_recipes/DatePublished=2020-02-07/.part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet.crc -------------------------------------------------------------------------------- /data/partitioned_recipes/DatePublished=2020-02-07/part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Data-Engineering-with-Databricks-Cookbook/fa1657c3808c0d520ea19a49f967df510b4f627d/data/partitioned_recipes/DatePublished=2020-02-07/part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet -------------------------------------------------------------------------------- /data/partitioned_recipes/DatePublished=2020-02-10/.part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet.crc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Data-Engineering-with-Databricks-Cookbook/fa1657c3808c0d520ea19a49f967df510b4f627d/data/partitioned_recipes/DatePublished=2020-02-10/.part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet.crc -------------------------------------------------------------------------------- /data/partitioned_recipes/DatePublished=2020-02-10/part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Data-Engineering-with-Databricks-Cookbook/fa1657c3808c0d520ea19a49f967df510b4f627d/data/partitioned_recipes/DatePublished=2020-02-10/part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet -------------------------------------------------------------------------------- /data/partitioned_recipes/DatePublished=2020-02-11/.part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet.crc: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/PacktPublishing/Data-Engineering-with-Databricks-Cookbook/fa1657c3808c0d520ea19a49f967df510b4f627d/data/partitioned_recipes/DatePublished=2020-02-11/.part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet.crc -------------------------------------------------------------------------------- /data/partitioned_recipes/DatePublished=2020-02-11/part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Data-Engineering-with-Databricks-Cookbook/fa1657c3808c0d520ea19a49f967df510b4f627d/data/partitioned_recipes/DatePublished=2020-02-11/part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet -------------------------------------------------------------------------------- /data/partitioned_recipes/DatePublished=2020-02-13/.part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet.crc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Data-Engineering-with-Databricks-Cookbook/fa1657c3808c0d520ea19a49f967df510b4f627d/data/partitioned_recipes/DatePublished=2020-02-13/.part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet.crc -------------------------------------------------------------------------------- /data/partitioned_recipes/DatePublished=2020-02-13/part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Data-Engineering-with-Databricks-Cookbook/fa1657c3808c0d520ea19a49f967df510b4f627d/data/partitioned_recipes/DatePublished=2020-02-13/part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet -------------------------------------------------------------------------------- /data/partitioned_recipes/DatePublished=2020-02-14/.part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet.crc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Data-Engineering-with-Databricks-Cookbook/fa1657c3808c0d520ea19a49f967df510b4f627d/data/partitioned_recipes/DatePublished=2020-02-14/.part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet.crc -------------------------------------------------------------------------------- /data/partitioned_recipes/DatePublished=2020-02-14/part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Data-Engineering-with-Databricks-Cookbook/fa1657c3808c0d520ea19a49f967df510b4f627d/data/partitioned_recipes/DatePublished=2020-02-14/part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet -------------------------------------------------------------------------------- /data/partitioned_recipes/DatePublished=2020-02-16/.part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet.crc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Data-Engineering-with-Databricks-Cookbook/fa1657c3808c0d520ea19a49f967df510b4f627d/data/partitioned_recipes/DatePublished=2020-02-16/.part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet.crc 
-------------------------------------------------------------------------------- /data/partitioned_recipes/DatePublished=2020-02-16/part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Data-Engineering-with-Databricks-Cookbook/fa1657c3808c0d520ea19a49f967df510b4f627d/data/partitioned_recipes/DatePublished=2020-02-16/part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet -------------------------------------------------------------------------------- /data/partitioned_recipes/DatePublished=2020-02-18/.part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet.crc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Data-Engineering-with-Databricks-Cookbook/fa1657c3808c0d520ea19a49f967df510b4f627d/data/partitioned_recipes/DatePublished=2020-02-18/.part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet.crc -------------------------------------------------------------------------------- /data/partitioned_recipes/DatePublished=2020-02-18/part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Data-Engineering-with-Databricks-Cookbook/fa1657c3808c0d520ea19a49f967df510b4f627d/data/partitioned_recipes/DatePublished=2020-02-18/part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet -------------------------------------------------------------------------------- /data/partitioned_recipes/DatePublished=2020-02-20/.part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet.crc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Data-Engineering-with-Databricks-Cookbook/fa1657c3808c0d520ea19a49f967df510b4f627d/data/partitioned_recipes/DatePublished=2020-02-20/.part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet.crc -------------------------------------------------------------------------------- /data/partitioned_recipes/DatePublished=2020-02-20/part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Data-Engineering-with-Databricks-Cookbook/fa1657c3808c0d520ea19a49f967df510b4f627d/data/partitioned_recipes/DatePublished=2020-02-20/part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet -------------------------------------------------------------------------------- /data/partitioned_recipes/DatePublished=2020-02-23/.part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet.crc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Data-Engineering-with-Databricks-Cookbook/fa1657c3808c0d520ea19a49f967df510b4f627d/data/partitioned_recipes/DatePublished=2020-02-23/.part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet.crc -------------------------------------------------------------------------------- /data/partitioned_recipes/DatePublished=2020-02-23/part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/PacktPublishing/Data-Engineering-with-Databricks-Cookbook/fa1657c3808c0d520ea19a49f967df510b4f627d/data/partitioned_recipes/DatePublished=2020-02-23/part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet -------------------------------------------------------------------------------- /data/partitioned_recipes/DatePublished=2020-02-24/.part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet.crc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Data-Engineering-with-Databricks-Cookbook/fa1657c3808c0d520ea19a49f967df510b4f627d/data/partitioned_recipes/DatePublished=2020-02-24/.part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet.crc -------------------------------------------------------------------------------- /data/partitioned_recipes/DatePublished=2020-02-24/part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Data-Engineering-with-Databricks-Cookbook/fa1657c3808c0d520ea19a49f967df510b4f627d/data/partitioned_recipes/DatePublished=2020-02-24/part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet -------------------------------------------------------------------------------- /data/partitioned_recipes/DatePublished=2020-02-25/.part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet.crc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Data-Engineering-with-Databricks-Cookbook/fa1657c3808c0d520ea19a49f967df510b4f627d/data/partitioned_recipes/DatePublished=2020-02-25/.part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet.crc -------------------------------------------------------------------------------- /data/partitioned_recipes/DatePublished=2020-02-25/part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Data-Engineering-with-Databricks-Cookbook/fa1657c3808c0d520ea19a49f967df510b4f627d/data/partitioned_recipes/DatePublished=2020-02-25/part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet -------------------------------------------------------------------------------- /data/partitioned_recipes/DatePublished=2020-02-27/.part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet.crc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Data-Engineering-with-Databricks-Cookbook/fa1657c3808c0d520ea19a49f967df510b4f627d/data/partitioned_recipes/DatePublished=2020-02-27/.part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet.crc -------------------------------------------------------------------------------- /data/partitioned_recipes/DatePublished=2020-02-27/part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Data-Engineering-with-Databricks-Cookbook/fa1657c3808c0d520ea19a49f967df510b4f627d/data/partitioned_recipes/DatePublished=2020-02-27/part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet 
-------------------------------------------------------------------------------- /data/partitioned_recipes/DatePublished=2020-02-28/.part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet.crc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Data-Engineering-with-Databricks-Cookbook/fa1657c3808c0d520ea19a49f967df510b4f627d/data/partitioned_recipes/DatePublished=2020-02-28/.part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet.crc -------------------------------------------------------------------------------- /data/partitioned_recipes/DatePublished=2020-02-28/part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Data-Engineering-with-Databricks-Cookbook/fa1657c3808c0d520ea19a49f967df510b4f627d/data/partitioned_recipes/DatePublished=2020-02-28/part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet -------------------------------------------------------------------------------- /data/partitioned_recipes/_SUCCESS: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Data-Engineering-with-Databricks-Cookbook/fa1657c3808c0d520ea19a49f967df510b4f627d/data/partitioned_recipes/_SUCCESS -------------------------------------------------------------------------------- /data/recipes.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Data-Engineering-with-Databricks-Cookbook/fa1657c3808c0d520ea19a49f967df510b4f627d/data/recipes.parquet
--------------------------------------------------------------------------------
/diagrams/Chapter10/unity_catalog_hierarchy.md:
--------------------------------------------------------------------------------
```mermaid
flowchart TD
    Metastore --> strCred(Storage\nCredentials)
    Metastore --> extLoc(External\nLocation)
    Metastore --> Catalog
    Metastore --> Share
    Metastore --> Recipient
    Metastore --> Provider
    Metastore --> Connection
    Catalog --> Schema
    Schema --> Table
    Schema --> View
    Schema --> Volume
    Schema --> Model
    Schema --> Functions
```
--------------------------------------------------------------------------------
/diagrams/Chapter10/unity_catalog_hierarchy.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/PacktPublishing/Data-Engineering-with-Databricks-Cookbook/fa1657c3808c0d520ea19a49f967df510b4f627d/diagrams/Chapter10/unity_catalog_hierarchy.png
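The Mermaid diagram above sketches the Unity Catalog object hierarchy (metastore → catalog → schema → tables, views, volumes, models, functions) referenced in Chapter 10. The snippet below is a minimal, hypothetical illustration of the resulting three-level namespace; it assumes a Unity Catalog–enabled Databricks notebook (with its implicit `spark` session) and CREATE CATALOG privileges, and the names `demo_catalog`, `demo_schema`, and `recipes` are placeholders, not objects defined in this repository.

```python
# Hypothetical illustration of the catalog -> schema -> table hierarchy shown
# above. All object names are placeholders; requires CREATE CATALOG privileges.
spark.sql("CREATE CATALOG IF NOT EXISTS demo_catalog")
spark.sql("CREATE SCHEMA IF NOT EXISTS demo_catalog.demo_schema")
spark.sql("""
    CREATE TABLE IF NOT EXISTS demo_catalog.demo_schema.recipes (
        id INT,
        title STRING
    )
""")

# Objects are always addressed by the three-level name: catalog.schema.object.
spark.table("demo_catalog.demo_schema.recipes").printSchema()
```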
--------------------------------------------------------------------------------
/docker-compose.yml:
--------------------------------------------------------------------------------
version: "3.6"
volumes:
  shared-workspace:
    name: "distributed-file-system"
    driver: local
    driver_opts:
      o: bind
      type: none
      device: ./
services:
  zookeeper:
    image: docker.io/bitnami/zookeeper:3.8.2
    container_name: zookeeper
    ports:
      - "2181:2181"
    volumes:
      - shared-workspace:/opt/workspace
    environment:
      - ALLOW_ANONYMOUS_LOGIN=yes
  kafka:
    image: docker.io/bitnami/kafka:3.5.1
    container_name: kafka
    ports:
      - "9092:9092"
    environment:
      - BITNAMI_DEBUG=yes
      - KAFKA_BROKER_ID=1
      - KAFKA_ENABLE_KRAFT=false
      - KAFKA_CFG_LISTENERS=PLAINTEXT://kafka:9092
      - KAFKA_CFG_ADVERTISED_LISTENERS=PLAINTEXT://kafka:9092
      - KAFKA_CFG_ZOOKEEPER_CONNECT=zookeeper:2181
      - KAFKA_CFG_INTER_BROKER_LISTENER_NAME=PLAINTEXT
      - KAFKA_CFG_LISTENER_SECURITY_PROTOCOL_MAP=PLAINTEXT:PLAINTEXT
      - ALLOW_PLAINTEXT_LISTENER=yes
    depends_on:
      - zookeeper
  jupyterlab:
    image: jupyterlab:4.0.2-spark-3.4.1
    container_name: jupyterlab
    ports:
      - 8888:8888
      - 4040:4040
    volumes:
      - shared-workspace:/opt/workspace
  spark-master:
    image: spark-master:3.4.1
    container_name: spark-master
    ports:
      - 8080:8080
      - 7077:7077
    volumes:
      - shared-workspace:/opt/workspace
  spark-worker-1:
    image: spark-worker:3.4.1
    container_name: spark-worker-1
    environment:
      - SPARK_WORKER_CORES=1
      - SPARK_WORKER_MEMORY=512m
    ports:
      - 8081:8081
    volumes:
      - shared-workspace:/opt/workspace
    depends_on:
      - spark-master
  spark-worker-2:
    image: spark-worker:3.4.1
    container_name: spark-worker-2
    environment:
      - SPARK_WORKER_CORES=1
      - SPARK_WORKER_MEMORY=512m
    ports:
      - 8082:8081
    volumes:
      - shared-workspace:/opt/workspace
    depends_on:
      - spark-master
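The compose file above brings up Zookeeper, a single Kafka broker advertised inside the network as kafka:9092, a JupyterLab container, and a two-worker Spark standalone cluster, all sharing the ./ bind mount at /opt/workspace. The following is a hypothetical connectivity check (not part of the repository) that can be run from the jupyterlab container, which installs kafka-python; the topic name smoke_test is a placeholder.

```python
# Hypothetical smoke test for the compose setup: produce and consume one JSON
# message against the broker advertised as kafka:9092. Topic name is a placeholder.
import json

from kafka import KafkaConsumer, KafkaProducer

producer = KafkaProducer(
    bootstrap_servers="kafka:9092",
    value_serializer=lambda v: json.dumps(v).encode("utf-8"),
)
producer.send("smoke_test", {"hello": "kafka"})
producer.flush()

consumer = KafkaConsumer(
    "smoke_test",
    bootstrap_servers="kafka:9092",
    auto_offset_reset="earliest",
    consumer_timeout_ms=5000,
    value_deserializer=lambda v: json.loads(v.decode("utf-8")),
)
for message in consumer:
    print(message.value)  # expect {'hello': 'kafka'}
    break
```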
--------------------------------------------------------------------------------
/docker/base/Dockerfile:
--------------------------------------------------------------------------------
ARG java_image_tag=17-jre
FROM eclipse-temurin:${java_image_tag}

# -- Layer: Image Metadata

ARG build_date
ARG delta_spark_version
ARG deltalake_version
ARG pandas_version

LABEL org.label-schema.build-date=${build_date}
LABEL org.label-schema.description="Data Engineering with Apache Spark and Delta Lake Cookbook - Cluster base image"
LABEL org.label-schema.schema-version="1.0"

# -- Layer: OS + Python + Scala

ARG shared_workspace=/opt/workspace

RUN mkdir -p ${shared_workspace}/data && \
    mkdir -p /usr/share/man/man1 && \
    apt-get update -y && \
    apt-get install -y --no-install-recommends curl python3 r-base netcat && \
    ln -s /usr/bin/python3 /usr/bin/python && \
    apt-get clean && \
    rm -rf /var/lib/apt/lists/*

RUN apt-get update -y && \
    apt-get install -y --no-install-recommends build-essential manpages-dev python3-pip python3-dev && \
    pip3 install --no-cache-dir --upgrade pip && \
    apt-get clean && \
    rm -rf /var/lib/apt/lists/*

# We are explicitly pinning the versions of the various libraries that this Docker image runs on.
RUN pip3 install --quiet --no-cache-dir \
    delta-spark==${delta_spark_version} \
    deltalake==${deltalake_version} \
    pandas==${pandas_version}

ENV SCALA_HOME="/usr/bin/scala"
ENV PATH=${PATH}:${SCALA_HOME}/bin
ENV SHARED_WORKSPACE=${shared_workspace}

# -- Runtime

VOLUME ${shared_workspace}
CMD ["bash"]
--------------------------------------------------------------------------------
/docker/jupyterlab/00-first.py:
--------------------------------------------------------------------------------
from delta import *
from pyspark.sql import SparkSession

builder = (SparkSession.builder
           .appName("data-eng-cookbook")
           .master("spark://spark-master:7077")
           .config("spark.executor.memory", "512m")
           .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension")
           .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog"))

spark = configure_spark_with_delta_pip(builder).getOrCreate()
spark.sparkContext.setLogLevel("ERROR")


get_ipython().run_line_magic('load_ext', 'sparksql_magic')
get_ipython().run_line_magic('config', 'SparkSql.limit=20')
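00-first.py builds a SparkSession against the standalone cluster (spark://spark-master:7077) with the Delta Lake extensions enabled and loads the sparksql magic. The sketch below is a hypothetical check that the resulting `spark` session can write and read a Delta table; the output path under the shared /opt/workspace volume is a placeholder.

```python
# Hypothetical end-to-end check of the Delta-enabled session from 00-first.py.
# The output path under the shared workspace volume is a placeholder.
from pyspark.sql import Row

path = "/opt/workspace/data/delta_smoke_test"

df = spark.createDataFrame([Row(id=1, name="alpha"), Row(id=2, name="beta")])
df.write.format("delta").mode("overwrite").save(path)

spark.read.format("delta").load(path).show()
```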
--------------------------------------------------------------------------------
/docker/jupyterlab/Dockerfile:
--------------------------------------------------------------------------------
FROM base

# -- Layer: Image Metadata

ARG build_date

LABEL org.label-schema.build-date=${build_date}
LABEL org.label-schema.name="Data Engineering with Apache Spark and Delta Lake Cookbook - JupyterLab Image"
LABEL org.label-schema.description="JupyterLab image"


# -- Layer: Notebooks and data

# ADD docker/jupyterlab/kafka-producer.py /

# -- Layer: JupyterLab + Python kernel for PySpark

ARG spark_version
ARG jupyterlab_version
ARG sparksql_magic_version
ARG kafka_python_version

RUN pip3 install --no-cache-dir wget==3.2 \
    pyspark==${spark_version} \
    jupyterlab==${jupyterlab_version} \
    sparksql-magic==${sparksql_magic_version} \
    kafka-python==${kafka_python_version}

EXPOSE 8888

WORKDIR ${SHARED_WORKSPACE}
# COPY docker/jupyterlab/00-first.py /root/.ipython/profile_default/startup/00-first.py
CMD jupyter lab --ip=0.0.0.0 --port=8888 --no-browser --allow-root --NotebookApp.token=;python
--------------------------------------------------------------------------------
/docker/spark-base/Dockerfile:
--------------------------------------------------------------------------------
FROM base

# -- Layer: Image Metadata
ARG build_date
ARG delta_package_version
ARG spark_xml_package_version

LABEL org.label-schema.build-date=${build_date}
LABEL org.label-schema.description="Data Engineering with Apache Spark and Delta Lake Cookbook - Spark base image"
LABEL org.label-schema.schema-version="1.0"

# -- Layer: Apache Spark
ARG spark_version
ARG hadoop_version

RUN curl https://archive.apache.org/dist/spark/spark-${spark_version}/spark-${spark_version}-bin-hadoop${hadoop_version}.tgz -o spark.tgz && \
    tar -xf spark.tgz && \
    mv spark-${spark_version}-bin-hadoop${hadoop_version} /usr/bin/ && \
    echo "alias pyspark=/usr/bin/spark-${spark_version}-bin-hadoop${hadoop_version}/bin/pyspark" >> ~/.bashrc && \
    echo "alias spark-shell=/usr/bin/spark-${spark_version}-bin-hadoop${hadoop_version}/bin/spark-shell" >> ~/.bashrc && \
    mkdir /usr/bin/spark-${spark_version}-bin-hadoop${hadoop_version}/logs && \
    rm spark.tgz

ENV SPARK_HOME /usr/bin/spark-${spark_version}-bin-hadoop${hadoop_version}
ENV SPARK_MASTER_HOST spark-master
ENV SPARK_MASTER_PORT 7077
ENV PYSPARK_PYTHON python3

# -- Runtime
WORKDIR ${SPARK_HOME}

USER root

ARG NBuser=NBuser
ARG GROUP=NBuser

RUN groupadd -r ${GROUP} && useradd -r -m -g ${GROUP} ${NBuser}

RUN chown -R "${NBuser}":"${GROUP}" /home/"${NBuser}"/ \
    && chown -R "${NBuser}":"${GROUP}" "${SPARK_HOME}" \
    && chown -R "${NBuser}":"${GROUP}" "${SHARED_WORKSPACE}"

USER ${NBuser}


RUN ${SPARK_HOME}/bin/spark-shell --packages io.delta:${delta_package_version} \
    --conf "spark.sql.extensions=io.delta.sql.DeltaSparkSessionExtension" \
    --conf "spark.sql.catalog.spark_catalog=org.apache.spark.sql.delta.catalog.DeltaCatalog" \
    && ${SPARK_HOME}/bin/spark-shell --packages com.databricks:${spark_xml_package_version} \
    && ${SPARK_HOME}/bin/spark-shell --packages org.apache.spark:spark-sql-kafka-0-10_2.12:3.4.1
--------------------------------------------------------------------------------
/docker/spark-master/Dockerfile:
--------------------------------------------------------------------------------
ARG spark_version
FROM spark-base:${spark_version}

# -- Layer: Image Metadata

ARG build_date

LABEL org.label-schema.build-date=${build_date}
LABEL org.label-schema.description="Spark master image"
LABEL org.label-schema.schema-version="1.0"

# -- Runtime

EXPOSE 8080 7077

CMD bin/spark-class org.apache.spark.deploy.master.Master >> logs/spark-master.out
--------------------------------------------------------------------------------
/docker/spark-worker/Dockerfile:
--------------------------------------------------------------------------------
ARG spark_version
FROM spark-base:${spark_version}

# -- Layer: Image Metadata

ARG build_date

LABEL org.label-schema.build-date=${build_date}
LABEL org.label-schema.description="Spark worker image"
LABEL org.label-schema.schema-version="1.0"

# -- Runtime

EXPOSE 8081

CMD bin/spark-class org.apache.spark.deploy.worker.Worker spark://${SPARK_MASTER_HOST}:${SPARK_MASTER_PORT} >> logs/spark-worker.out
--------------------------------------------------------------------------------
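The spark-base image above pre-fetches the Delta Lake, spark-xml, and Kafka connector packages so the standalone master and workers have them cached locally. As a hypothetical usage sketch (not part of the repository), reading an XML file with the spark-xml data source could look like the following; the file path and rowTag value are placeholders, and depending on how your session resolves packages, the driver may also need the connector listed in spark.jars.packages.

```python
# Hypothetical sketch of reading XML with the spark-xml data source.
# The file path and rowTag value are placeholders for your own dataset.
df = (
    spark.read.format("xml")
    .option("rowTag", "record")
    .load("/opt/workspace/data/sample.xml")
)

df.printSchema()
df.show(5, truncate=False)
```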