├── .gitignore ├── .vscode └── settings.json ├── Chapter01 ├── 1.1 read-csv-data.ipynb ├── 1.2 read-json-data.ipynb ├── 1.3 read-parquet-data.ipynb ├── 1.4 read-xml-data.ipynb ├── 1.5 work-with-nested-data.ipynb ├── 1.6 process-text-data.ipynb └── 1.7 write-data.ipynb ├── Chapter02 ├── 2.1 basic-transformations.ipynb ├── 2.2 filter-data.ipynb ├── 2.3 perform-joins.ipynb ├── 2.4 perform-aggregations.ipynb ├── 2.5 apply-window-functions.ipynb ├── 2.6 work-with-UDFs.ipynb └── 2.7 handle-nulls.ipynb ├── Chapter03 ├── 3.1 create-delta-tables.ipynb ├── 3.2 read-delta-tables.ipynb ├── 3.3 upsert-delta-tables.ipynb ├── 3.4 merge-delta-tables.ipynb ├── 3.5 change_data_capture_delta-tables.ipynb ├── 3.6 optimizing-delta-tables.ipynb ├── 3.7 time-travel-delta-tables.ipynb └── 3.8 manage-delta-tables.ipynb ├── Chapter04 ├── 4.0 events-gen-kafka.ipynb ├── 4.0 user-gen-kafka.ipynb ├── 4.1 config-streaming.ipynb ├── 4.2 connect-kafka-streaming.ipynb ├── 4.3 transform-filter-streaming.ipynb ├── 4.4 config-checkpoints.ipynb ├── 4.5 config-triggers.ipynb ├── 4.6 apply-window-aggregations.ipynb └── 4.7 handle-late-and-out-of-order-data.ipynb ├── Chapter05 ├── 5.0 events-gen-kafka.ipynb ├── 5.0 orders-gen-kafka.ipynb ├── 5.0 user-gen-kafka.ipynb ├── 5.1 delta-write-streaming.ipynb ├── 5.2 idempotent-stream-write-delta.ipynb ├── 5.3 merge-cdc-streaming.ipynb ├── 5.4 joining-stream-static-data.ipynb ├── 5.5 joining-stream-stream-data.ipynb └── 5.6 monitor-streams.ipynb ├── Chapter06 ├── 6.1 monitor-spark-ui.ipynb ├── 6.2 broadcast-variables.ipynb ├── 6.3 optimize-data-shuffles.ipynb ├── 6.4 avoid-data-skew.ipynb ├── 6.5 cache-and-persist.ipynb ├── 6.6 partitioning-and-repartitioning.ipynb └── 6.7 optimize-join-strategies.ipynb ├── Chapter07 ├── 7.1 optimize-table-partitions-delta.ipynb ├── 7.2 z-order-delta-tables.ipynb ├── 7.3 data-skipping-delta-tables.ipynb └── 7.4 compression-delta-tables.ipynb ├── Chapter08 ├── 8.1 building-databricks-workflow.yml ├── 8.4 conditional-branching.yml ├── Clean Up.sql ├── Data Preparation DLT.sql ├── Download Inventory Data.sql └── Setup.sql ├── Chapter09 ├── 9.1 create-medallion-arch-DLT.sql ├── 9.3 data-quality-and-validation.sql ├── 9.4 quarantine-bad-data-dlt.sql ├── 9.5 monitor-delta-live-table-pipelines.sql ├── 9.6 dlt-dabs-cicd │ ├── 9.6 create-medallion-arch-DLT.sql │ ├── databricks.yml │ └── dlt_dabs_cicd_pipeline.yml └── 9.7 apply-changes_into-dlt.sql ├── Chapter10 ├── 10.2 uc_object_hierarchy.sql ├── 10.4 tags_comments_metadata.sql ├── 10.5 filter_sensitive_data.sql ├── 10.6 lineage_view.sql └── 10.7 system_tables.sql ├── Chapter11 ├── 11.1 connect_to_git_repo.py ├── 11.3 using_databricks_sdk.py ├── 11.4 databricks_vscode_extension.py └── 11.5_databricks_asset_bundles │ └── dabs_cicd_example │ ├── .github │ └── workflows │ │ ├── deploy_to_prod_CD.yml │ │ └── deploy_to_qa_CI.yml │ ├── README.md │ ├── databricks.yml │ ├── fixtures │ └── .gitkeep │ ├── requirements-dev.txt │ ├── resources │ ├── dabs_cicd_example_job.yml │ └── dabs_cicd_example_pipeline.yml │ ├── scratch │ ├── README.md │ └── exploration.ipynb │ └── src │ ├── dlt_pipeline.ipynb │ └── notebook.ipynb ├── LICENSE ├── README.md ├── build.sh ├── data ├── Credit Card │ ├── CardBase.csv │ ├── CustomerBase.csv │ ├── FraudBase.csv │ └── TransactionBase.csv ├── Online_Retail.csv ├── Reviews.csv ├── Stanford Question Answering Dataset.json ├── netflix_titles.csv ├── netflix_titles_batch_2.csv ├── nobel_prizes.json ├── nobel_prizes.xml ├── partitioned_recipes │ ├── ._SUCCESS.crc │ ├── DatePublished=2019-01-01 │ │ 
├── .part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet.crc │ │ └── part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet │ ├── DatePublished=2019-01-02 │ │ ├── .part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet.crc │ │ └── part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet │ ├── DatePublished=2019-01-03 │ │ ├── .part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet.crc │ │ └── part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet │ ├── DatePublished=2019-01-04 │ │ ├── .part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet.crc │ │ └── part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet │ ├── DatePublished=2019-01-05 │ │ ├── .part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet.crc │ │ └── part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet │ ├── DatePublished=2019-01-06 │ │ ├── .part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet.crc │ │ └── part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet │ ├── DatePublished=2019-01-07 │ │ ├── .part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet.crc │ │ └── part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet │ ├── DatePublished=2019-01-08 │ │ ├── .part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet.crc │ │ └── part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet │ ├── DatePublished=2019-01-09 │ │ ├── .part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet.crc │ │ └── part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet │ ├── DatePublished=2019-01-10 │ │ ├── .part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet.crc │ │ └── part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet │ ├── DatePublished=2019-01-11 │ │ ├── .part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet.crc │ │ └── part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet │ ├── DatePublished=2019-01-12 │ │ ├── .part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet.crc │ │ └── part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet │ ├── DatePublished=2019-01-13 │ │ ├── .part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet.crc │ │ └── part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet │ ├── DatePublished=2019-01-14 │ │ ├── .part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet.crc │ │ └── part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet │ ├── DatePublished=2019-01-15 │ │ ├── .part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet.crc │ │ └── part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet │ ├── DatePublished=2019-01-16 │ │ ├── .part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet.crc │ │ └── part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet │ ├── DatePublished=2019-01-17 │ │ ├── .part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet.crc │ │ └── part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet │ ├── DatePublished=2019-01-18 │ │ ├── .part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet.crc │ │ └── part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet │ ├── DatePublished=2019-01-19 │ │ ├── .part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet.crc │ │ └── part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet │ ├── 
DatePublished=2019-01-20 │ │ ├── .part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet.crc │ │ └── part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet │ ├── DatePublished=2019-01-21 │ │ ├── .part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet.crc │ │ └── part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet │ ├── DatePublished=2019-01-22 │ │ ├── .part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet.crc │ │ └── part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet │ ├── DatePublished=2019-01-23 │ │ ├── .part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet.crc │ │ └── part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet │ ├── DatePublished=2019-01-24 │ │ ├── .part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet.crc │ │ └── part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet │ ├── DatePublished=2019-01-25 │ │ ├── .part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet.crc │ │ └── part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet │ ├── DatePublished=2019-01-26 │ │ ├── .part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet.crc │ │ └── part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet │ ├── DatePublished=2019-01-27 │ │ ├── .part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet.crc │ │ └── part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet │ ├── DatePublished=2019-01-28 │ │ ├── .part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet.crc │ │ └── part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet │ ├── DatePublished=2019-01-29 │ │ ├── .part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet.crc │ │ └── part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet │ ├── DatePublished=2019-01-30 │ │ ├── .part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet.crc │ │ └── part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet │ ├── DatePublished=2019-01-31 │ │ ├── .part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet.crc │ │ └── part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet │ ├── DatePublished=2020-01-01 │ │ ├── .part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet.crc │ │ └── part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet │ ├── DatePublished=2020-01-02 │ │ ├── .part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet.crc │ │ └── part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet │ ├── DatePublished=2020-01-03 │ │ ├── .part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet.crc │ │ └── part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet │ ├── DatePublished=2020-01-07 │ │ ├── .part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet.crc │ │ └── part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet │ ├── DatePublished=2020-01-09 │ │ ├── .part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet.crc │ │ └── part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet │ ├── DatePublished=2020-01-10 │ │ ├── .part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet.crc │ │ └── part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet │ ├── DatePublished=2020-01-13 │ │ ├── .part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet.crc │ │ └── 
part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet │ ├── DatePublished=2020-01-14 │ │ ├── .part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet.crc │ │ └── part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet │ ├── DatePublished=2020-01-16 │ │ ├── .part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet.crc │ │ └── part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet │ ├── DatePublished=2020-01-21 │ │ ├── .part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet.crc │ │ └── part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet │ ├── DatePublished=2020-01-23 │ │ ├── .part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet.crc │ │ └── part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet │ ├── DatePublished=2020-01-28 │ │ ├── .part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet.crc │ │ └── part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet │ ├── DatePublished=2020-01-29 │ │ ├── .part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet.crc │ │ └── part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet │ ├── DatePublished=2020-01-30 │ │ ├── .part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet.crc │ │ └── part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet │ ├── DatePublished=2020-01-31 │ │ ├── .part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet.crc │ │ └── part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet │ ├── DatePublished=2020-02-03 │ │ ├── .part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet.crc │ │ └── part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet │ ├── DatePublished=2020-02-04 │ │ ├── .part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet.crc │ │ └── part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet │ ├── DatePublished=2020-02-05 │ │ ├── .part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet.crc │ │ └── part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet │ ├── DatePublished=2020-02-06 │ │ ├── .part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet.crc │ │ └── part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet │ ├── DatePublished=2020-02-07 │ │ ├── .part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet.crc │ │ └── part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet │ ├── DatePublished=2020-02-10 │ │ ├── .part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet.crc │ │ └── part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet │ ├── DatePublished=2020-02-11 │ │ ├── .part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet.crc │ │ └── part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet │ ├── DatePublished=2020-02-13 │ │ ├── .part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet.crc │ │ └── part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet │ ├── DatePublished=2020-02-14 │ │ ├── .part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet.crc │ │ └── part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet │ ├── DatePublished=2020-02-16 │ │ ├── .part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet.crc │ │ └── part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet │ ├── DatePublished=2020-02-18 │ │ ├── 
.part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet.crc │ │ └── part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet │ ├── DatePublished=2020-02-20 │ │ ├── .part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet.crc │ │ └── part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet │ ├── DatePublished=2020-02-23 │ │ ├── .part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet.crc │ │ └── part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet │ ├── DatePublished=2020-02-24 │ │ ├── .part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet.crc │ │ └── part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet │ ├── DatePublished=2020-02-25 │ │ ├── .part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet.crc │ │ └── part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet │ ├── DatePublished=2020-02-27 │ │ ├── .part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet.crc │ │ └── part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet │ ├── DatePublished=2020-02-28 │ │ ├── .part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet.crc │ │ └── part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet │ └── _SUCCESS ├── recipes.parquet └── titles.csv ├── diagrams └── Chapter10 │ ├── unity_catalog_hierarchy.md │ └── unity_catalog_hierarchy.png ├── docker-compose.yml └── docker ├── base └── Dockerfile ├── jupyterlab ├── 00-first.py └── Dockerfile ├── spark-base └── Dockerfile ├── spark-master └── Dockerfile └── spark-worker └── Dockerfile /.gitignore: -------------------------------------------------------------------------------- 1 | 2 | # Created by https://www.toptal.com/developers/gitignore/api/spark,jupyternotebooks,pycharm,pycharm+iml,pycharm+all 3 | # Edit at https://www.toptal.com/developers/gitignore?templates=spark,jupyternotebooks,pycharm,pycharm+iml,pycharm+all 4 | 5 | ### JupyterNotebooks ### 6 | # gitignore template for Jupyter Notebooks 7 | # website: http://jupyter.org/ 8 | 9 | .vscode/* 10 | !.vscode/settings.json 11 | !.vscode/tasks.json 12 | !.vscode/launch.json 13 | !.vscode/extensions.json 14 | !.vscode/*.code-snippets 15 | 16 | # Local History for Visual Studio Code 17 | .history/ 18 | 19 | # Built Visual Studio Code Extensions 20 | *.vsix 21 | 22 | .databricks/ 23 | build/ 24 | dist/ 25 | __pycache__/ 26 | *.egg-info 27 | .venv/ 28 | scratch/** 29 | !scratch/README.md 30 | 31 | .ipynb_checkpoints 32 | */.ipynb_checkpoints/* 33 | 34 | .Trash* 35 | images/* 36 | data/data_lake/* 37 | data/delta_lake/* 38 | data/tmp/* 39 | # IPython 40 | profile_default/ 41 | ipython_config.py 42 | 43 | # Remove previous ipynb_checkpoints 44 | # git rm -r .ipynb_checkpoints/ 45 | 46 | ### PyCharm ### 47 | # Covers JetBrains IDEs: IntelliJ, RubyMine, PhpStorm, AppCode, PyCharm, CLion, Android Studio, WebStorm and Rider 48 | # Reference: https://intellij-support.jetbrains.com/hc/en-us/articles/206544839 49 | 50 | # User-specific stuff 51 | .idea/**/workspace.xml 52 | .idea/**/tasks.xml 53 | .idea/**/usage.statistics.xml 54 | .idea/**/dictionaries 55 | .idea/**/shelf 56 | 57 | # Generated files 58 | .idea/**/contentModel.xml 59 | 60 | # Sensitive or high-churn files 61 | .idea/**/dataSources/ 62 | .idea/**/dataSources.ids 63 | .idea/**/dataSources.local.xml 64 | .idea/**/sqlDataSources.xml 65 | .idea/**/dynamic.xml 66 | .idea/**/uiDesigner.xml 67 | .idea/**/dbnavigator.xml 68 | 69 | # Gradle 70 | .idea/**/gradle.xml 71 | 
.idea/**/libraries 72 | 73 | # Gradle and Maven with auto-import 74 | # When using Gradle or Maven with auto-import, you should exclude module files, 75 | # since they will be recreated, and may cause churn. Uncomment if using 76 | # auto-import. 77 | # .idea/artifacts 78 | # .idea/compiler.xml 79 | # .idea/jarRepositories.xml 80 | # .idea/modules.xml 81 | # .idea/*.iml 82 | # .idea/modules 83 | # *.iml 84 | # *.ipr 85 | 86 | # CMake 87 | cmake-build-*/ 88 | 89 | # Mongo Explorer plugin 90 | .idea/**/mongoSettings.xml 91 | 92 | # File-based project format 93 | *.iws 94 | 95 | # IntelliJ 96 | out/ 97 | 98 | # mpeltonen/sbt-idea plugin 99 | .idea_modules/ 100 | 101 | # JIRA plugin 102 | atlassian-ide-plugin.xml 103 | 104 | # Cursive Clojure plugin 105 | .idea/replstate.xml 106 | 107 | # Crashlytics plugin (for Android Studio and IntelliJ) 108 | com_crashlytics_export_strings.xml 109 | crashlytics.properties 110 | crashlytics-build.properties 111 | fabric.properties 112 | 113 | # Editor-based Rest Client 114 | .idea/httpRequests 115 | 116 | # Android studio 3.1+ serialized cache file 117 | .idea/caches/build_file_checksums.ser 118 | 119 | ### PyCharm Patch ### 120 | # Comment Reason: https://github.com/joeblau/gitignore.io/issues/186#issuecomment-215987721 121 | 122 | # *.iml 123 | # modules.xml 124 | # .idea/misc.xml 125 | # *.ipr 126 | 127 | # Sonarlint plugin 128 | .idea/**/sonarlint/ 129 | 130 | # SonarQube Plugin 131 | .idea/**/sonarIssues.xml 132 | 133 | # Markdown Navigator plugin 134 | .idea/**/markdown-navigator.xml 135 | .idea/**/markdown-navigator-enh.xml 136 | .idea/**/markdown-navigator/ 137 | 138 | # Cache file creation bug 139 | # See https://youtrack.jetbrains.com/issue/JBR-2257 140 | .idea/$CACHE_FILE$ 141 | 142 | ### PyCharm+all ### 143 | # Covers JetBrains IDEs: IntelliJ, RubyMine, PhpStorm, AppCode, PyCharm, CLion, Android Studio, WebStorm and Rider 144 | # Reference: https://intellij-support.jetbrains.com/hc/en-us/articles/206544839 145 | 146 | # User-specific stuff 147 | 148 | # Generated files 149 | 150 | # Sensitive or high-churn files 151 | 152 | # Gradle 153 | 154 | # Gradle and Maven with auto-import 155 | # When using Gradle or Maven with auto-import, you should exclude module files, 156 | # since they will be recreated, and may cause churn. Uncomment if using 157 | # auto-import. 
158 | # .idea/artifacts 159 | # .idea/compiler.xml 160 | # .idea/jarRepositories.xml 161 | # .idea/modules.xml 162 | # .idea/*.iml 163 | # .idea/modules 164 | # *.iml 165 | # *.ipr 166 | 167 | # CMake 168 | 169 | # Mongo Explorer plugin 170 | 171 | # File-based project format 172 | 173 | # IntelliJ 174 | 175 | # mpeltonen/sbt-idea plugin 176 | 177 | # JIRA plugin 178 | 179 | # Cursive Clojure plugin 180 | 181 | # Crashlytics plugin (for Android Studio and IntelliJ) 182 | 183 | # Editor-based Rest Client 184 | 185 | # Android studio 3.1+ serialized cache file 186 | 187 | ### PyCharm+all Patch ### 188 | # Ignores the whole .idea folder and all .iml files 189 | # See https://github.com/joeblau/gitignore.io/issues/186 and https://github.com/joeblau/gitignore.io/issues/360 190 | 191 | .idea/ 192 | 193 | # Reason: https://github.com/joeblau/gitignore.io/issues/186#issuecomment-249601023 194 | 195 | *.iml 196 | modules.xml 197 | .idea/misc.xml 198 | *.ipr 199 | 200 | # Sonarlint plugin 201 | .idea/sonarlint 202 | 203 | ### PyCharm+iml ### 204 | # Covers JetBrains IDEs: IntelliJ, RubyMine, PhpStorm, AppCode, PyCharm, CLion, Android Studio, WebStorm and Rider 205 | # Reference: https://intellij-support.jetbrains.com/hc/en-us/articles/206544839 206 | 207 | # User-specific stuff 208 | 209 | # Generated files 210 | 211 | # Sensitive or high-churn files 212 | 213 | # Gradle 214 | 215 | # Gradle and Maven with auto-import 216 | # When using Gradle or Maven with auto-import, you should exclude module files, 217 | # since they will be recreated, and may cause churn. Uncomment if using 218 | # auto-import. 219 | # .idea/artifacts 220 | # .idea/compiler.xml 221 | # .idea/jarRepositories.xml 222 | # .idea/modules.xml 223 | # .idea/*.iml 224 | # .idea/modules 225 | # *.iml 226 | # *.ipr 227 | 228 | # CMake 229 | 230 | # Mongo Explorer plugin 231 | 232 | # File-based project format 233 | 234 | # IntelliJ 235 | 236 | # mpeltonen/sbt-idea plugin 237 | 238 | # JIRA plugin 239 | 240 | # Cursive Clojure plugin 241 | 242 | # Crashlytics plugin (for Android Studio and IntelliJ) 243 | 244 | # Editor-based Rest Client 245 | 246 | # Android studio 3.1+ serialized cache file 247 | 248 | ### PyCharm+iml Patch ### 249 | # Reason: https://github.com/joeblau/gitignore.io/issues/186#issuecomment-249601023 250 | 251 | 252 | ### Spark ### 253 | *#*# 254 | *.#* 255 | *.pyc 256 | *.pyo 257 | *.swp 258 | *~ 259 | .DS_Store 260 | .cache 261 | .classpath 262 | .ensime 263 | .ensime_cache/ 264 | .ensime_lucene 265 | .generated-mima* 266 | .project 267 | .pydevproject 268 | .scala_dependencies 269 | .settings 270 | /lib/ 271 | R-unit-tests.log 272 | R/unit-tests.out 273 | R/cran-check.out 274 | R/pkg/vignettes/sparkr-vignettes.html 275 | R/pkg/tests/fulltests/Rplots.pdf 276 | build/*.jar 277 | build/apache-maven* 278 | build/scala* 279 | build/zinc* 280 | cache 281 | checkpoint 282 | conf/*.cmd 283 | conf/*.conf 284 | conf/*.properties 285 | conf/*.sh 286 | conf/*.xml 287 | conf/java-opts 288 | conf/slaves 289 | dependency-reduced-pom.xml 290 | derby.log 291 | dev/create-release/*final 292 | dev/create-release/*txt 293 | dev/pr-deps/ 294 | dist/ 295 | docs/_site 296 | docs/api 297 | sql/docs 298 | sql/site 299 | lib_managed/ 300 | lint-r-report.log 301 | log/ 302 | logs/ 303 | project/boot/ 304 | project/build/target/ 305 | project/plugins/lib_managed/ 306 | project/plugins/project/build.properties 307 | project/plugins/src_managed/ 308 | project/plugins/target/ 309 | python/lib/pyspark.zip 310 | python/deps 311 | 
python/test_coverage/coverage_data 312 | python/test_coverage/htmlcov 313 | python/pyspark/python 314 | reports/ 315 | scalastyle-on-compile.generated.xml 316 | scalastyle-output.xml 317 | scalastyle.txt 318 | spark-*-bin-*.tgz 319 | spark-tests.log 320 | src_managed/ 321 | streaming-tests.log 322 | target/ 323 | unit-tests.log 324 | work/ 325 | docs/.jekyll-metadata 326 | 327 | # For Hive 328 | TempStatsStore/ 329 | metastore/ 330 | metastore_db/ 331 | sql/hive-thriftserver/test_warehouses 332 | warehouse/ 333 | spark-warehouse/ 334 | 335 | # For R session data 336 | .RData 337 | .RHistory 338 | .Rhistory 339 | *.Rproj 340 | *.Rproj.* 341 | 342 | .Rproj.user 343 | 344 | # For SBT 345 | .jvmopts 346 | 347 | 348 | # End of https://www.toptal.com/developers/gitignore/api/spark,jupyternotebooks,pycharm,pycharm+iml,pycharm+all -------------------------------------------------------------------------------- /.vscode/settings.json: -------------------------------------------------------------------------------- 1 | { 2 | "python.envFile": "${workspaceFolder}/.databricks/.databricks.env", 3 | "databricks.python.envFile": "${workspaceFolder}/.env", 4 | "jupyter.interactiveWindow.cellMarker.codeRegex": "^# COMMAND ----------|^# Databricks notebook source|^(#\\s*%%|#\\s*\\|#\\s*In\\[\\d*?\\]|#\\s*In\\[ \\])", 5 | "jupyter.interactiveWindow.cellMarker.default": "# COMMAND ----------" 6 | } 7 | -------------------------------------------------------------------------------- /Chapter01/1.7 write-data.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "id": "f6e19347", 7 | "metadata": { 8 | "tags": [] 9 | }, 10 | "outputs": [ 11 | { 12 | "name": "stderr", 13 | "output_type": "stream", 14 | "text": [ 15 | "Setting default log level to \"WARN\".\n", 16 | "To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).\n" 17 | ] 18 | }, 19 | { 20 | "name": "stdout", 21 | "output_type": "stream", 22 | "text": [ 23 | "23/05/19 12:19:40 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... 
using builtin-java classes where applicable\n" 24 | ] 25 | } 26 | ], 27 | "source": [ 28 | "from pyspark.sql import SparkSession\n", 29 | "\n", 30 | "spark = (SparkSession.builder\n", 31 | " .appName(\"write-data\")\n", 32 | " .master(\"spark://spark-master:7077\")\n", 33 | " .config(\"spark.executor.memory\", \"512m\")\n", 34 | " .getOrCreate())\n", 35 | "\n", 36 | "spark.sparkContext.setLogLevel(\"ERROR\")" 37 | ] 38 | }, 39 | { 40 | "cell_type": "code", 41 | "execution_count": 3, 42 | "id": "b20b7da2-b027-478b-aa5a-377277fd12f9", 43 | "metadata": { 44 | "tags": [] 45 | }, 46 | "outputs": [ 47 | { 48 | "name": "stderr", 49 | "output_type": "stream", 50 | "text": [ 51 | " \r" 52 | ] 53 | } 54 | ], 55 | "source": [ 56 | "from pyspark.sql.types import StructType, StructField, StringType, IntegerType, DateType\n", 57 | "\n", 58 | "df = (spark.read.format(\"csv\")\n", 59 | " .option(\"header\", \"true\")\n", 60 | " .option(\"nullValue\", \"null\")\n", 61 | " .option(\"dateFormat\", \"LLLL d, y\")\n", 62 | " .load(\"../data/netflix_titles.csv\"))" 63 | ] 64 | }, 65 | { 66 | "cell_type": "code", 67 | "execution_count": 7, 68 | "id": "e653265c-a151-4917-82ab-ccf8df808fcd", 69 | "metadata": { 70 | "tags": [] 71 | }, 72 | "outputs": [ 73 | { 74 | "name": "stderr", 75 | "output_type": "stream", 76 | "text": [ 77 | " \r" 78 | ] 79 | } 80 | ], 81 | "source": [ 82 | "(df.write.format(\"csv\")\n", 83 | " .option(\"header\", \"true\") \n", 84 | " .mode(\"overwrite\")\n", 85 | " .option(\"delimiter\", \",\")\n", 86 | " .save(\"../data/data_lake/netflix_csv_data\"))" 87 | ] 88 | }, 89 | { 90 | "cell_type": "code", 91 | "execution_count": 8, 92 | "id": "353d95f3-0f85-4802-b8b1-7ae6ef4250f3", 93 | "metadata": { 94 | "tags": [] 95 | }, 96 | "outputs": [ 97 | { 98 | "name": "stderr", 99 | "output_type": "stream", 100 | "text": [ 101 | " \r" 102 | ] 103 | } 104 | ], 105 | "source": [ 106 | "(df.write.format(\"json\") \n", 107 | " .mode(\"overwrite\") \n", 108 | " .save(\"../data/data_lake/netflix_json_data\"))" 109 | ] 110 | }, 111 | { 112 | "cell_type": "code", 113 | "execution_count": 9, 114 | "id": "2d409bc4-41ec-4c60-a5e9-36cec7bc7354", 115 | "metadata": { 116 | "tags": [] 117 | }, 118 | "outputs": [ 119 | { 120 | "name": "stderr", 121 | "output_type": "stream", 122 | "text": [ 123 | " \r" 124 | ] 125 | } 126 | ], 127 | "source": [ 128 | "(df.write.format(\"parquet\") \n", 129 | " .mode(\"overwrite\") \n", 130 | " .save(\"../data/data_lake/netflix_parquet_data\"))" 131 | ] 132 | }, 133 | { 134 | "cell_type": "markdown", 135 | "id": "5ab23f75-9f3d-4f61-ae7c-88894fc1c89d", 136 | "metadata": {}, 137 | "source": [ 138 | "### Write Compressed Data" 139 | ] 140 | }, 141 | { 142 | "cell_type": "code", 143 | "execution_count": 12, 144 | "id": "8c15d6d9-da07-4f51-a26e-d1e37c935505", 145 | "metadata": { 146 | "tags": [] 147 | }, 148 | "outputs": [ 149 | { 150 | "name": "stderr", 151 | "output_type": "stream", 152 | "text": [ 153 | " \r" 154 | ] 155 | } 156 | ], 157 | "source": [ 158 | "(df.write\n", 159 | " .format(\"csv\")\n", 160 | " .mode(\"overwrite\") \n", 161 | " .option(\"header\", \"true\")\n", 162 | " .option(\"delimiter\", \",\")\n", 163 | " .option(\"codec\", \"org.apache.hadoop.io.compress.GzipCodec\")\n", 164 | " .save(\"../data/data_lake/netflix_csv_data.gz\"))" 165 | ] 166 | }, 167 | { 168 | "cell_type": "markdown", 169 | "id": "3fda5914-2979-4064-a0ad-58e8e63515de", 170 | "metadata": {}, 171 | "source": [ 172 | "### Specify the Number of Partitions" 173 | ] 174 | }, 175 | { 176 | "cell_type": "code", 
177 | "execution_count": 14, 178 | "id": "1d842078-d65b-4337-829a-d282a9705ebe", 179 | "metadata": { 180 | "tags": [] 181 | }, 182 | "outputs": [ 183 | { 184 | "name": "stderr", 185 | "output_type": "stream", 186 | "text": [ 187 | " \r" 188 | ] 189 | } 190 | ], 191 | "source": [ 192 | "(df.repartition(4) \n", 193 | " .write.format(\"csv\") \n", 194 | " .mode(\"overwrite\") \n", 195 | " .option(\"header\", \"true\") \n", 196 | " .option(\"delimiter\", \",\") \n", 197 | " .save(\"../data/data_lake/netflix_csv_data_4_part\")) " 198 | ] 199 | }, 200 | { 201 | "cell_type": "markdown", 202 | "id": "a81d1655-d1f2-433b-be5c-21d793b92cb4", 203 | "metadata": {}, 204 | "source": [ 205 | "### Use `coalesce()` to Reduce number fo Partitions" 206 | ] 207 | }, 208 | { 209 | "cell_type": "code", 210 | "execution_count": 15, 211 | "id": "6f3b93ee-33cc-48df-9869-d825c85e9c4f", 212 | "metadata": { 213 | "tags": [] 214 | }, 215 | "outputs": [ 216 | { 217 | "name": "stderr", 218 | "output_type": "stream", 219 | "text": [ 220 | " \r" 221 | ] 222 | } 223 | ], 224 | "source": [ 225 | "(df.coalesce(1)\n", 226 | " .write.format(\"csv\")\n", 227 | " .mode(\"overwrite\") \n", 228 | " .option(\"header\", \"true\")\n", 229 | " .option(\"delimiter\", \",\")\n", 230 | " .save(\"../data/data_lake/netflix_csv_data_whole\"))" 231 | ] 232 | }, 233 | { 234 | "cell_type": "markdown", 235 | "id": "5c472365-efe7-4e0c-83fe-5cd6cf0fd16c", 236 | "metadata": {}, 237 | "source": [ 238 | "### Use `partitionBy()` to write partitions based on a column" 239 | ] 240 | }, 241 | { 242 | "cell_type": "code", 243 | "execution_count": 16, 244 | "id": "dc9cc49d-4f32-47ee-80f9-3a723aabcc90", 245 | "metadata": { 246 | "tags": [] 247 | }, 248 | "outputs": [ 249 | { 250 | "name": "stderr", 251 | "output_type": "stream", 252 | "text": [ 253 | " \r" 254 | ] 255 | } 256 | ], 257 | "source": [ 258 | "# partition the CSV data by the 'release_year' column\n", 259 | "(df.write.format('csv')\n", 260 | " .option('header', 'true')\n", 261 | " .option('delimiter', ',')\n", 262 | " .mode('overwrite')\n", 263 | " .partitionBy('release_year')\n", 264 | " .save(\"../data/data_lake/netflix_csv_data_partitioned\"))" 265 | ] 266 | }, 267 | { 268 | "cell_type": "code", 269 | "execution_count": 17, 270 | "id": "a4e7e70c-23e8-468e-81c9-7f57640aa511", 271 | "metadata": { 272 | "tags": [] 273 | }, 274 | "outputs": [], 275 | "source": [ 276 | "spark.stop()" 277 | ] 278 | } 279 | ], 280 | "metadata": { 281 | "kernelspec": { 282 | "display_name": "Python 3 (ipykernel)", 283 | "language": "python", 284 | "name": "python3" 285 | }, 286 | "language_info": { 287 | "codemirror_mode": { 288 | "name": "ipython", 289 | "version": 3 290 | }, 291 | "file_extension": ".py", 292 | "mimetype": "text/x-python", 293 | "name": "python", 294 | "nbconvert_exporter": "python", 295 | "pygments_lexer": "ipython3", 296 | "version": "3.10.6" 297 | }, 298 | "vscode": { 299 | "interpreter": { 300 | "hash": "916dbcbb3f70747c44a77c7bcd40155683ae19c65e1c03b4aa3499c5328201f1" 301 | } 302 | } 303 | }, 304 | "nbformat": 4, 305 | "nbformat_minor": 5 306 | } 307 | -------------------------------------------------------------------------------- /Chapter03/3.1 create-delta-tables.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 9, 6 | "id": "54f45c54-0b95-4fd9-a180-fe3be96ab99d", 7 | "metadata": {}, 8 | "outputs": [], 9 | "source": [ 10 | "from delta import configure_spark_with_delta_pip, 
DeltaTable\n", 11 | "from pyspark.sql import SparkSession" 12 | ] 13 | }, 14 | { 15 | "cell_type": "code", 16 | "execution_count": 10, 17 | "id": "e7850eab-3759-491d-a70a-7a02977db101", 18 | "metadata": {}, 19 | "outputs": [], 20 | "source": [ 21 | "builder = (SparkSession.builder\n", 22 | " .appName(\"create-delta-table\")\n", 23 | " .master(\"spark://spark-master:7077\")\n", 24 | " .config(\"spark.executor.memory\", \"512m\") \n", 25 | " .config(\"spark.sql.extensions\", \"io.delta.sql.DeltaSparkSessionExtension\")\n", 26 | " .config(\"spark.sql.catalog.spark_catalog\", \"org.apache.spark.sql.delta.catalog.DeltaCatalog\"))\n", 27 | "\n", 28 | "spark = configure_spark_with_delta_pip(builder).getOrCreate()\n", 29 | "spark.sparkContext.setLogLevel(\"ERROR\")" 30 | ] 31 | }, 32 | { 33 | "cell_type": "code", 34 | "execution_count": 11, 35 | "id": "5786a610-0463-45af-a229-1cfb8181cad8", 36 | "metadata": {}, 37 | "outputs": [ 38 | { 39 | "name": "stdout", 40 | "output_type": "stream", 41 | "text": [ 42 | "The sparksql_magic extension is already loaded. To reload it, use:\n", 43 | " %reload_ext sparksql_magic\n" 44 | ] 45 | } 46 | ], 47 | "source": [ 48 | "get_ipython().run_line_magic('load_ext', 'sparksql_magic')\n", 49 | "get_ipython().run_line_magic('config', 'SparkSql.limit=20')" 50 | ] 51 | }, 52 | { 53 | "cell_type": "code", 54 | "execution_count": 12, 55 | "id": "228e8f56-4fd9-497a-947f-8ce968889dc7", 56 | "metadata": { 57 | "tags": [] 58 | }, 59 | "outputs": [ 60 | { 61 | "name": "stderr", 62 | "output_type": "stream", 63 | "text": [ 64 | " \r" 65 | ] 66 | }, 67 | { 68 | "data": { 69 | "text/html": [ 70 | "
" 71 | ], 72 | "text/plain": [ 73 | "" 74 | ] 75 | }, 76 | "execution_count": 12, 77 | "metadata": {}, 78 | "output_type": "execute_result" 79 | } 80 | ], 81 | "source": [ 82 | "%%sparksql\n", 83 | "CREATE OR REPLACE TABLE default.netflix_titles (\n", 84 | " show_id STRING,\n", 85 | " type STRING,\n", 86 | " title STRING,\n", 87 | " director STRING,\n", 88 | " cast STRING,\n", 89 | " country STRING,\n", 90 | " date_added STRING,\n", 91 | " release_year STRING,\n", 92 | " rating STRING,\n", 93 | " duration STRING,\n", 94 | " listed_in STRING,\n", 95 | " description STRING \n", 96 | ") USING DELTA LOCATION '/opt/workspace/data/delta_lake/netflix_titles';" 97 | ] 98 | }, 99 | { 100 | "cell_type": "code", 101 | "execution_count": 13, 102 | "id": "a0e26449-e04a-4cf5-a42f-5594943bd3fa", 103 | "metadata": { 104 | "tags": [] 105 | }, 106 | "outputs": [], 107 | "source": [ 108 | "# Read CSV file into a DataFrame\n", 109 | "df = (spark.read\n", 110 | " .format(\"csv\")\n", 111 | " .option(\"header\", \"true\")\n", 112 | " .load(\"../data/netflix_titles.csv\"))" 113 | ] 114 | }, 115 | { 116 | "cell_type": "code", 117 | "execution_count": 14, 118 | "id": "f8d3d68c-c3b0-4a8a-91b0-32366f2199fc", 119 | "metadata": { 120 | "tags": [] 121 | }, 122 | "outputs": [ 123 | { 124 | "name": "stdout", 125 | "output_type": "stream", 126 | "text": [ 127 | "root\n", 128 | " |-- show_id: string (nullable = true)\n", 129 | " |-- type: string (nullable = true)\n", 130 | " |-- title: string (nullable = true)\n", 131 | " |-- director: string (nullable = true)\n", 132 | " |-- cast: string (nullable = true)\n", 133 | " |-- country: string (nullable = true)\n", 134 | " |-- date_added: string (nullable = true)\n", 135 | " |-- release_year: string (nullable = true)\n", 136 | " |-- rating: string (nullable = true)\n", 137 | " |-- duration: string (nullable = true)\n", 138 | " |-- listed_in: string (nullable = true)\n", 139 | " |-- description: string (nullable = true)\n", 140 | "\n" 141 | ] 142 | } 143 | ], 144 | "source": [ 145 | "df.printSchema()" 146 | ] 147 | }, 148 | { 149 | "cell_type": "code", 150 | "execution_count": 15, 151 | "id": "5c447d82-f3f3-4b78-98b9-cb8e1e79512c", 152 | "metadata": { 153 | "tags": [] 154 | }, 155 | "outputs": [ 156 | { 157 | "name": "stderr", 158 | "output_type": "stream", 159 | "text": [ 160 | " \r" 161 | ] 162 | } 163 | ], 164 | "source": [ 165 | "df.write.format(\"delta\").mode(\"overwrite\").saveAsTable(\"default.netflix_titles\")" 166 | ] 167 | }, 168 | { 169 | "cell_type": "code", 170 | "execution_count": 16, 171 | "id": "23e92c59-b032-49be-8081-471c3dfa79e3", 172 | "metadata": {}, 173 | "outputs": [ 174 | { 175 | "name": "stderr", 176 | "output_type": "stream", 177 | "text": [ 178 | " \r" 179 | ] 180 | }, 181 | { 182 | "data": { 183 | "text/html": [ 184 | "
show_id | type | title | director | cast | country | date_added | release_year | rating | duration | listed_in | description
s1 | Movie | Dick Johnson Is Dead | Kirsten Johnson | null | United States | September 25, 2021 | 2020 | PG-13 | 90 min | Documentaries | As her father nears the end of his life, filmmaker Kirsten Johnson stages his death in inventive and comical ways to help them both face the inevitable.
s2 | TV Show | Blood & Water | null | Ama Qamata, Khosi Ngema, Gail Mabalane, Thabang Molaba, Dillon Windvogel, Natasha Thahane, Arno Greeff, Xolile Tshabalala, Getmore Sithole, Cindy Mahlangu, Ryle De Morny, Greteli Fincham, Sello Maake Ka-Ncube, Odwa Gwanya, Mekaila Mathys, Sandi Schultz, Duane Williams, Shamilla Miller, Patrick Mofokeng | South Africa | September 24, 2021 | 2021 | TV-MA | 2 Seasons | International TV Shows, TV Dramas, TV Mysteries | After crossing paths at a party, a Cape Town teen sets out to prove whether a private-school swimming star is her sister who was abducted at birth.
s3 | TV Show | Ganglands | Julien Leclercq | Sami Bouajila, Tracy Gotoas, Samuel Jouy, Nabiha Akkari, Sofia Lesaffre, Salim Kechiouche, Noureddine Farihi, Geert Van Rampelberg, Bakary Diombera | null | September 24, 2021 | 2021 | TV-MA | 1 Season | Crime TV Shows, International TV Shows, TV Action & Adventure | To protect his family from a powerful drug lord, skilled thief Mehdi and his expert team of robbers are pulled into a violent and deadly turf war.
" 185 | ], 186 | "text/plain": [ 187 | "" 188 | ] 189 | }, 190 | "execution_count": 16, 191 | "metadata": {}, 192 | "output_type": "execute_result" 193 | } 194 | ], 195 | "source": [ 196 | "%%sparksql \n", 197 | "SELECT * FROM default.netflix_titles LIMIT 3;" 198 | ] 199 | }, 200 | { 201 | "cell_type": "code", 202 | "execution_count": 17, 203 | "id": "bbdd7547-1607-4aa8-9e44-9e82f7356ef4", 204 | "metadata": {}, 205 | "outputs": [], 206 | "source": [ 207 | "spark.stop()" 208 | ] 209 | }, 210 | { 211 | "cell_type": "code", 212 | "execution_count": null, 213 | "id": "bcf3399a-fd4b-4f7e-98cc-658629165132", 214 | "metadata": {}, 215 | "outputs": [], 216 | "source": [] 217 | } 218 | ], 219 | "metadata": { 220 | "kernelspec": { 221 | "display_name": "Python 3 (ipykernel)", 222 | "language": "python", 223 | "name": "python3" 224 | }, 225 | "language_info": { 226 | "codemirror_mode": { 227 | "name": "ipython", 228 | "version": 3 229 | }, 230 | "file_extension": ".py", 231 | "mimetype": "text/x-python", 232 | "name": "python", 233 | "nbconvert_exporter": "python", 234 | "pygments_lexer": "ipython3", 235 | "version": "3.10.12" 236 | } 237 | }, 238 | "nbformat": 4, 239 | "nbformat_minor": 5 240 | } 241 | -------------------------------------------------------------------------------- /Chapter04/4.0 events-gen-kafka.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 2, 6 | "id": "8393e835-14fb-4aa3-833f-d60aa5464018", 7 | "metadata": {}, 8 | "outputs": [ 9 | { 10 | "name": "stdout", 11 | "output_type": "stream", 12 | "text": [ 13 | "{'user_id': 94, 'event_type': 'click', 'event_time': '02/04/2024, 18:14:31', 'processing_time': '02/04/2024, 18:14:47'}\n", 14 | "{'user_id': 71, 'event_type': 'like', 'event_time': '02/04/2024, 18:14:55', 'processing_time': '02/04/2024, 18:14:57'}\n", 15 | "{'user_id': 75, 'event_type': 'share', 'event_time': '02/04/2024, 18:15:04', 'processing_time': '02/04/2024, 18:15:07'}\n", 16 | "{'user_id': 85, 'event_type': 'purchase', 'event_time': '02/04/2024, 18:14:55', 'processing_time': '02/04/2024, 18:15:17'}\n", 17 | "{'user_id': 87, 'event_type': 'share', 'event_time': '02/04/2024, 18:15:07', 'processing_time': '02/04/2024, 18:15:27'}\n", 18 | "{'user_id': 16, 'event_type': 'like', 'event_time': '02/04/2024, 18:15:24', 'processing_time': '02/04/2024, 18:15:37'}\n", 19 | "{'user_id': 2, 'event_type': 'purchase', 'event_time': '02/04/2024, 18:15:44', 'processing_time': '02/04/2024, 18:15:47'}\n", 20 | "{'user_id': 19, 'event_type': 'share', 'event_time': '02/04/2024, 18:15:51', 'processing_time': '02/04/2024, 18:15:57'}\n", 21 | "{'user_id': 82, 'event_type': 'view', 'event_time': '02/04/2024, 18:16:01', 'processing_time': '02/04/2024, 18:16:07'}\n", 22 | "{'user_id': 1, 'event_type': 'like', 'event_time': '02/04/2024, 18:16:06', 'processing_time': '02/04/2024, 18:16:17'}\n", 23 | "{'user_id': 45, 'event_type': 'view', 'event_time': '02/04/2024, 18:16:17', 'processing_time': '02/04/2024, 18:16:27'}\n", 24 | "{'user_id': 57, 'event_type': 'click', 'event_time': '02/04/2024, 18:16:24', 'processing_time': '02/04/2024, 18:16:37'}\n", 25 | "{'user_id': 13, 'event_type': 'click', 'event_time': '02/04/2024, 18:16:37', 'processing_time': '02/04/2024, 18:16:47'}\n", 26 | "{'user_id': 96, 'event_type': 'click', 'event_time': '02/04/2024, 18:16:56', 'processing_time': '02/04/2024, 18:16:57'}\n" 27 | ] 28 | }, 29 | { 30 | "ename": "KeyboardInterrupt", 31 | "evalue": "", 32 | 
"output_type": "error", 33 | "traceback": [ 34 | "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", 35 | "\u001b[0;31mKeyboardInterrupt\u001b[0m Traceback (most recent call last)", 36 | "Cell \u001b[0;32mIn[2], line 40\u001b[0m\n\u001b[1;32m 37\u001b[0m \u001b[38;5;66;03m# Send the event to the Kafka topic\u001b[39;00m\n\u001b[1;32m 39\u001b[0m producer\u001b[38;5;241m.\u001b[39msend(topic, value\u001b[38;5;241m=\u001b[39mjson\u001b[38;5;241m.\u001b[39mdumps(event)\u001b[38;5;241m.\u001b[39mencode(\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mutf-8\u001b[39m\u001b[38;5;124m'\u001b[39m))\n\u001b[0;32m---> 40\u001b[0m \u001b[43mtime\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43msleep\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m10\u001b[39;49m\u001b[43m)\u001b[49m\n", 37 | "\u001b[0;31mKeyboardInterrupt\u001b[0m: " 38 | ] 39 | } 40 | ], 41 | "source": [ 42 | "# Import the required modules\n", 43 | "import random\n", 44 | "import json\n", 45 | "from kafka import KafkaProducer\n", 46 | "import time\n", 47 | "\n", 48 | "# using datetime module\n", 49 | "import datetime;\n", 50 | "\n", 51 | "# Define the bootstrap servers and the topic name\n", 52 | "bootstrap_servers = \"kafka:9092\"\n", 53 | "topic = \"events\"\n", 54 | "\n", 55 | "# Create a Kafka producer with JSON value serializer\n", 56 | "producer = KafkaProducer(bootstrap_servers=bootstrap_servers)\n", 57 | "\n", 58 | "# Define a function to generate random event data\n", 59 | "def generate_event():\n", 60 | " # Generate a random user id from 1 to 100\n", 61 | " current_time = time.time()\n", 62 | " user_id = random.randint(1, 100)\n", 63 | " # Generate a random event type from a list of options\n", 64 | " event_type = random.choice([\"click\", \"view\", \"purchase\", \"like\", \"share\"])\n", 65 | " # Generate a random event time from 0 to 9999\n", 66 | " event_time = datetime.datetime.fromtimestamp(current_time- abs(random.normalvariate(0, 10))).strftime(\"%m/%d/%Y, %H:%M:%S\")\n", 67 | " # Generate a random event time from 0 to 9999\n", 68 | " processing_time =datetime.datetime.fromtimestamp(current_time).strftime(\"%m/%d/%Y, %H:%M:%S\")\n", 69 | " # Return a dictionary with the event data\n", 70 | " return {\"user_id\": user_id, \"event_type\": event_type, \"event_time\": event_time, \"processing_time\": processing_time}\n", 71 | "\n", 72 | "# Loop to generate and send events\n", 73 | "while True:\n", 74 | " # Generate a random event\n", 75 | " event = generate_event()\n", 76 | " # Print the event to the console\n", 77 | " print(event)\n", 78 | " # Send the event to the Kafka topic\n", 79 | " \n", 80 | " producer.send(topic, value=json.dumps(event).encode('utf-8'))\n", 81 | " time.sleep(10)\n" 82 | ] 83 | }, 84 | { 85 | "cell_type": "markdown", 86 | "id": "87081f46-4053-4df2-b262-aec86df50970", 87 | "metadata": { 88 | "jp-MarkdownHeadingCollapsed": true 89 | }, 90 | "source": [ 91 | "### " 92 | ] 93 | } 94 | ], 95 | "metadata": { 96 | "kernelspec": { 97 | "display_name": "Python 3 (ipykernel)", 98 | "language": "python", 99 | "name": "python3" 100 | }, 101 | "language_info": { 102 | "codemirror_mode": { 103 | "name": "ipython", 104 | "version": 3 105 | }, 106 | "file_extension": ".py", 107 | "mimetype": "text/x-python", 108 | "name": "python", 109 | "nbconvert_exporter": "python", 110 | "pygments_lexer": "ipython3", 111 | "version": "3.10.12" 112 | } 113 | }, 114 | "nbformat": 4, 115 | "nbformat_minor": 5 116 | } 117 | 
-------------------------------------------------------------------------------- /Chapter04/4.0 user-gen-kafka.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "id": "8393e835-14fb-4aa3-833f-d60aa5464018", 7 | "metadata": {}, 8 | "outputs": [ 9 | { 10 | "name": "stdout", 11 | "output_type": "stream", 12 | "text": [ 13 | "{'id': 46, 'name': 'user87', 'age': 56, 'gender': 'F', 'country': 'India', 'timestamp': '02/04/2024, 17:57:37'}\n", 14 | "{'id': 43, 'name': 'user54', 'age': 45, 'gender': 'F', 'country': 'Brazil', 'timestamp': '02/04/2024, 17:57:47'}\n", 15 | "{'id': 64, 'name': 'user43', 'age': 49, 'gender': 'M', 'country': 'Brazil', 'timestamp': '02/04/2024, 17:57:57'}\n", 16 | "{'id': 60, 'name': 'user8', 'age': 48, 'gender': 'F', 'country': 'USA', 'timestamp': '02/04/2024, 17:58:07'}\n", 17 | "{'id': 12, 'name': 'user41', 'age': 29, 'gender': 'F', 'country': 'Canada', 'timestamp': '02/04/2024, 17:58:17'}\n", 18 | "{'id': 12, 'name': 'user78', 'age': 28, 'gender': 'F', 'country': 'China', 'timestamp': '02/04/2024, 17:58:27'}\n", 19 | "{'id': 63, 'name': 'user39', 'age': 57, 'gender': 'M', 'country': 'Australia', 'timestamp': '02/04/2024, 17:58:37'}\n", 20 | "{'id': 32, 'name': 'user23', 'age': 54, 'gender': 'M', 'country': 'China', 'timestamp': '02/04/2024, 17:58:47'}\n", 21 | "{'id': 32, 'name': 'user53', 'age': 19, 'gender': 'M', 'country': 'USA', 'timestamp': '02/04/2024, 17:58:57'}\n", 22 | "{'id': 58, 'name': 'user52', 'age': 48, 'gender': 'F', 'country': 'India', 'timestamp': '02/04/2024, 17:59:07'}\n", 23 | "{'id': 65, 'name': 'user65', 'age': 44, 'gender': 'F', 'country': 'Australia', 'timestamp': '02/04/2024, 17:59:17'}\n", 24 | "{'id': 60, 'name': 'user20', 'age': 33, 'gender': 'F', 'country': 'USA', 'timestamp': '02/04/2024, 17:59:27'}\n", 25 | "{'id': 71, 'name': 'user61', 'age': 26, 'gender': 'F', 'country': 'Canada', 'timestamp': '02/04/2024, 17:59:37'}\n", 26 | "{'id': 69, 'name': 'user74', 'age': 28, 'gender': 'F', 'country': 'India', 'timestamp': '02/04/2024, 17:59:47'}\n", 27 | "{'id': 85, 'name': 'user29', 'age': 56, 'gender': 'F', 'country': 'Brazil', 'timestamp': '02/04/2024, 17:59:57'}\n", 28 | "{'id': 90, 'name': 'user18', 'age': 29, 'gender': 'F', 'country': 'UK', 'timestamp': '02/04/2024, 18:00:07'}\n", 29 | "{'id': 52, 'name': 'user15', 'age': 36, 'gender': 'M', 'country': 'UK', 'timestamp': '02/04/2024, 18:00:17'}\n", 30 | "{'id': 60, 'name': 'user59', 'age': 41, 'gender': 'F', 'country': 'Australia', 'timestamp': '02/04/2024, 18:00:27'}\n", 31 | "{'id': 99, 'name': 'user19', 'age': 18, 'gender': 'M', 'country': 'Canada', 'timestamp': '02/04/2024, 18:00:37'}\n", 32 | "{'id': 95, 'name': 'user62', 'age': 63, 'gender': 'M', 'country': 'USA', 'timestamp': '02/04/2024, 18:00:47'}\n", 33 | "{'id': 9, 'name': 'user14', 'age': 46, 'gender': 'F', 'country': 'Canada', 'timestamp': '02/04/2024, 18:00:57'}\n", 34 | "{'id': 79, 'name': 'user90', 'age': 22, 'gender': 'M', 'country': 'India', 'timestamp': '02/04/2024, 18:01:07'}\n", 35 | "{'id': 8, 'name': 'user54', 'age': 25, 'gender': 'M', 'country': 'India', 'timestamp': '02/04/2024, 18:01:17'}\n", 36 | "{'id': 18, 'name': 'user44', 'age': 25, 'gender': 'F', 'country': 'China', 'timestamp': '02/04/2024, 18:01:27'}\n", 37 | "{'id': 64, 'name': 'user92', 'age': 63, 'gender': 'F', 'country': 'Australia', 'timestamp': '02/04/2024, 18:01:37'}\n", 38 | "{'id': 13, 'name': 'user9', 'age': 60, 'gender': 
'M', 'country': 'Canada', 'timestamp': '02/04/2024, 18:01:47'}\n", 39 | "{'id': 63, 'name': 'user91', 'age': 45, 'gender': 'M', 'country': 'China', 'timestamp': '02/04/2024, 18:01:57'}\n", 40 | "{'id': 29, 'name': 'user34', 'age': 61, 'gender': 'F', 'country': 'UK', 'timestamp': '02/04/2024, 18:02:07'}\n", 41 | "{'id': 48, 'name': 'user93', 'age': 58, 'gender': 'M', 'country': 'Canada', 'timestamp': '02/04/2024, 18:02:17'}\n", 42 | "{'id': 26, 'name': 'user9', 'age': 57, 'gender': 'M', 'country': 'India', 'timestamp': '02/04/2024, 18:02:27'}\n", 43 | "{'id': 97, 'name': 'user94', 'age': 44, 'gender': 'F', 'country': 'Brazil', 'timestamp': '02/04/2024, 18:02:37'}\n", 44 | "{'id': 86, 'name': 'user68', 'age': 44, 'gender': 'F', 'country': 'UK', 'timestamp': '02/04/2024, 18:02:47'}\n", 45 | "{'id': 47, 'name': 'user45', 'age': 20, 'gender': 'M', 'country': 'Canada', 'timestamp': '02/04/2024, 18:02:57'}\n", 46 | "{'id': 41, 'name': 'user59', 'age': 47, 'gender': 'M', 'country': 'Canada', 'timestamp': '02/04/2024, 18:03:07'}\n", 47 | "{'id': 59, 'name': 'user79', 'age': 65, 'gender': 'M', 'country': 'Australia', 'timestamp': '02/04/2024, 18:03:17'}\n", 48 | "{'id': 76, 'name': 'user41', 'age': 50, 'gender': 'M', 'country': 'USA', 'timestamp': '02/04/2024, 18:03:27'}\n", 49 | "{'id': 18, 'name': 'user11', 'age': 57, 'gender': 'F', 'country': 'Australia', 'timestamp': '02/04/2024, 18:03:37'}\n", 50 | "{'id': 51, 'name': 'user70', 'age': 56, 'gender': 'F', 'country': 'India', 'timestamp': '02/04/2024, 18:03:47'}\n", 51 | "{'id': 69, 'name': 'user15', 'age': 22, 'gender': 'F', 'country': 'Canada', 'timestamp': '02/04/2024, 18:03:57'}\n", 52 | "{'id': 13, 'name': 'user83', 'age': 51, 'gender': 'F', 'country': 'Brazil', 'timestamp': '02/04/2024, 18:04:07'}\n", 53 | "{'id': 84, 'name': 'user75', 'age': 53, 'gender': 'M', 'country': 'USA', 'timestamp': '02/04/2024, 18:04:17'}\n", 54 | "{'id': 34, 'name': 'user79', 'age': 30, 'gender': 'F', 'country': 'Canada', 'timestamp': '02/04/2024, 18:04:27'}\n", 55 | "{'id': 45, 'name': 'user25', 'age': 20, 'gender': 'M', 'country': 'Brazil', 'timestamp': '02/04/2024, 18:04:37'}\n", 56 | "{'id': 34, 'name': 'user95', 'age': 50, 'gender': 'M', 'country': 'China', 'timestamp': '02/04/2024, 18:04:47'}\n", 57 | "{'id': 46, 'name': 'user52', 'age': 54, 'gender': 'M', 'country': 'UK', 'timestamp': '02/04/2024, 18:04:57'}\n", 58 | "{'id': 46, 'name': 'user68', 'age': 51, 'gender': 'M', 'country': 'India', 'timestamp': '02/04/2024, 18:05:07'}\n" 59 | ] 60 | } 61 | ], 62 | "source": [ 63 | "import json\n", 64 | "import random\n", 65 | "import time\n", 66 | "from kafka import KafkaProducer\n", 67 | "# using datetime module\n", 68 | "import datetime;\n", 69 | "\n", 70 | "producer = KafkaProducer(bootstrap_servers='kafka:9092')\n", 71 | "countries = ['USA', 'UK', 'India', 'China', 'Brazil', 'Canada', 'Australia']\n", 72 | "genders = ['M', 'F']\n", 73 | "\n", 74 | "while True:\n", 75 | " current_time = time.time()\n", 76 | " message = {\n", 77 | " 'id': random.randint(1, 100),\n", 78 | " 'name': f'user{random.randint(1, 100)}',\n", 79 | " 'age': random.randint(18, 65),\n", 80 | " 'gender': random.choice(genders),\n", 81 | " 'country': random.choice(countries),\n", 82 | " 'timestamp':datetime.datetime.fromtimestamp(current_time).strftime(\"%m/%d/%Y, %H:%M:%S\")\n", 83 | " }\n", 84 | " producer.send('users', value=json.dumps(message).encode('utf-8'))\n", 85 | " print(message)\n", 86 | " time.sleep(10)" 87 | ] 88 | }, 89 | { 90 | "cell_type": "markdown", 91 | "id": 
"87081f46-4053-4df2-b262-aec86df50970", 92 | "metadata": { 93 | "jp-MarkdownHeadingCollapsed": true 94 | }, 95 | "source": [ 96 | "### " 97 | ] 98 | } 99 | ], 100 | "metadata": { 101 | "kernelspec": { 102 | "display_name": "Python 3 (ipykernel)", 103 | "language": "python", 104 | "name": "python3" 105 | }, 106 | "language_info": { 107 | "codemirror_mode": { 108 | "name": "ipython", 109 | "version": 3 110 | }, 111 | "file_extension": ".py", 112 | "mimetype": "text/x-python", 113 | "name": "python", 114 | "nbconvert_exporter": "python", 115 | "pygments_lexer": "ipython3", 116 | "version": "3.10.12" 117 | } 118 | }, 119 | "nbformat": 4, 120 | "nbformat_minor": 5 121 | } 122 | -------------------------------------------------------------------------------- /Chapter04/4.1 config-streaming.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "id": "2cb9c58f-e466-48b4-81ab-07872b03c918", 6 | "metadata": {}, 7 | "source": [ 8 | "For running this recipe, we first need to set up incoming streaming data. We will feed data by opening a terminal window in Jupyter labs UI and run the following command that uses the nc (netcat) utility to create a socket connection on port 9999 and listen for incoming data: \n", 9 | "\n", 10 | "`nc -lk 9999 `\n", 11 | "\n", 12 | "Once the previous command is running, you can start typing any text on the command line. \n", 13 | "\n", 14 | "For example, you can enter the following text: \n", 15 | "\n", 16 | "Fundamentals of Data Engineering: Plan and Build Robust Data Systems by Joe Reis and Matt Housley. This book provides a concise overview of the data engineering landscape and a framework of best practices to assess and solve data engineering problems. It also helps you choose the best technologies and architectures for your data needs. \n", 17 | " \n", 18 | "Designing Data-Intensive Applications: The Big Ideas Behind Reliable, Scalable, and Maintainable Systems** by Martin Kleppmann. This book explains the fundamental principles and trade-offs behind the design of distributed data systems. It covers topics such as replication, partitioning, consistency, fault tolerance, batch and stream processing, and data model" 19 | ] 20 | }, 21 | { 22 | "cell_type": "code", 23 | "execution_count": 1, 24 | "id": "54f45c54-0b95-4fd9-a180-fe3be96ab99d", 25 | "metadata": {}, 26 | "outputs": [], 27 | "source": [ 28 | "from pyspark.sql import SparkSession\n", 29 | "from pyspark.sql.functions import explode, split" 30 | ] 31 | }, 32 | { 33 | "cell_type": "code", 34 | "execution_count": 2, 35 | "id": "e7850eab-3759-491d-a70a-7a02977db101", 36 | "metadata": {}, 37 | "outputs": [ 38 | { 39 | "name": "stderr", 40 | "output_type": "stream", 41 | "text": [ 42 | "Setting default log level to \"WARN\".\n", 43 | "To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).\n", 44 | "24/02/04 17:37:37 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... 
using builtin-java classes where applicable\n" 45 | ] 46 | } 47 | ], 48 | "source": [ 49 | "spark = (SparkSession.builder\n", 50 | " .appName(\"config-streaming\")\n", 51 | " .master(\"spark://spark-master:7077\")\n", 52 | " .config(\"spark.executor.memory\", \"512m\")\n", 53 | " .getOrCreate())\n", 54 | "spark.sparkContext.setLogLevel(\"ERROR\")" 55 | ] 56 | }, 57 | { 58 | "cell_type": "code", 59 | "execution_count": 3, 60 | "id": "228e8f56-4fd9-497a-947f-8ce968889dc7", 61 | "metadata": { 62 | "tags": [] 63 | }, 64 | "outputs": [], 65 | "source": [ 66 | "# Create DataFrame representing the stream of input lines from connection to localhost:9999\n", 67 | "lines = (spark.readStream\n", 68 | " .format(\"socket\")\n", 69 | " .option(\"host\", \"localhost\")\n", 70 | " .option(\"port\", 9999)\n", 71 | " .load())" 72 | ] 73 | }, 74 | { 75 | "cell_type": "code", 76 | "execution_count": 4, 77 | "id": "a0e26449-e04a-4cf5-a42f-5594943bd3fa", 78 | "metadata": { 79 | "tags": [] 80 | }, 81 | "outputs": [], 82 | "source": [ 83 | "# Split the lines into words\n", 84 | "words = lines.select(\n", 85 | " explode(split(lines.value, \" \")).alias(\"word\"))" 86 | ] 87 | }, 88 | { 89 | "cell_type": "code", 90 | "execution_count": 5, 91 | "id": "f8d3d68c-c3b0-4a8a-91b0-32366f2199fc", 92 | "metadata": { 93 | "tags": [] 94 | }, 95 | "outputs": [], 96 | "source": [ 97 | "# Generate running word count\n", 98 | "wordCounts = words.groupBy(\"word\").count()" 99 | ] 100 | }, 101 | { 102 | "cell_type": "code", 103 | "execution_count": 6, 104 | "id": "5c447d82-f3f3-4b78-98b9-cb8e1e79512c", 105 | "metadata": { 106 | "tags": [] 107 | }, 108 | "outputs": [ 109 | { 110 | "name": "stderr", 111 | "output_type": "stream", 112 | "text": [ 113 | " \r" 114 | ] 115 | }, 116 | { 117 | "name": "stdout", 118 | "output_type": "stream", 119 | "text": [ 120 | "-------------------------------------------\n", 121 | "Batch: 0\n", 122 | "-------------------------------------------\n", 123 | "+----+-----+\n", 124 | "|word|count|\n", 125 | "+----+-----+\n", 126 | "+----+-----+\n", 127 | "\n" 128 | ] 129 | }, 130 | { 131 | "name": "stderr", 132 | "output_type": "stream", 133 | "text": [ 134 | " \r" 135 | ] 136 | }, 137 | { 138 | "name": "stdout", 139 | "output_type": "stream", 140 | "text": [ 141 | "-------------------------------------------\n", 142 | "Batch: 1\n", 143 | "-------------------------------------------\n", 144 | "+------------+-----+\n", 145 | "| word|count|\n", 146 | "+------------+-----+\n", 147 | "| Data| 2|\n", 148 | "| overview| 1|\n", 149 | "|Fundamentals| 1|\n", 150 | "| stream| 1|\n", 151 | "| by| 2|\n", 152 | "| solve| 1|\n", 153 | "| you| 1|\n", 154 | "| landscape| 1|\n", 155 | "| systems.| 1|\n", 156 | "|replication,| 1|\n", 157 | "| for| 1|\n", 158 | "| Joe| 1|\n", 159 | "| tolerance,| 1|\n", 160 | "| provides| 1|\n", 161 | "| Reis| 1|\n", 162 | "| topics| 1|\n", 163 | "| practices| 1|\n", 164 | "| model| 1|\n", 165 | "| concise| 1|\n", 166 | "| distributed| 1|\n", 167 | "+------------+-----+\n", 168 | "only showing top 20 rows\n", 169 | "\n" 170 | ] 171 | }, 172 | { 173 | "name": "stderr", 174 | "output_type": "stream", 175 | "text": [ 176 | " \r" 177 | ] 178 | }, 179 | { 180 | "name": "stdout", 181 | "output_type": "stream", 182 | "text": [ 183 | "-------------------------------------------\n", 184 | "Batch: 2\n", 185 | "-------------------------------------------\n", 186 | "+------------+-----+\n", 187 | "| word|count|\n", 188 | "+------------+-----+\n", 189 | "| Dynamical| 1|\n", 190 | "| Data| 2|\n", 191 | "| 
complex| 1|\n", 192 | "| overview| 1|\n", 193 | "| Science| 1|\n", 194 | "|Fundamentals| 1|\n", 195 | "| stream| 1|\n", 196 | "| Nathan| 1|\n", 197 | "| by| 3|\n", 198 | "| solve| 2|\n", 199 | "| you| 2|\n", 200 | "| landscape| 1|\n", 201 | "| L.| 1|\n", 202 | "| systems.| 1|\n", 203 | "| apply| 1|\n", 204 | "|replication,| 1|\n", 205 | "| for| 1|\n", 206 | "| Joe| 1|\n", 207 | "| how| 1|\n", 208 | "| reduction,| 1|\n", 209 | "+------------+-----+\n", 210 | "only showing top 20 rows\n", 211 | "\n" 212 | ] 213 | } 214 | ], 215 | "source": [ 216 | " # Start running the query that prints the running counts to the console\n", 217 | "query = (wordCounts.writeStream\n", 218 | " .outputMode(\"complete\")\n", 219 | " .format(\"console\")\n", 220 | " .start())" 221 | ] 222 | }, 223 | { 224 | "cell_type": "markdown", 225 | "id": "b1330621-ab2a-4e08-b5c2-41312d472fd5", 226 | "metadata": {}, 227 | "source": [ 228 | "Open the terminal and add more data to the netcat listener. See the following example text: \n", 229 | "\n", 230 | "__Data-Driven Science and Engineering: Machine Learning, Dynamical Systems, and Control by Steven L. Brunton and J. Nathan Kutz13. This book teaches you how to apply machine learning and data analytics techniques to solve complex engineering and scientific problems. It covers topics such as dimensionality reduction, sparse sensing, system identification, and control design.__\n", 231 | "\n", 232 | "A new batch for the stream query is triggered and the output is updated as shown: " 233 | ] 234 | }, 235 | { 236 | "cell_type": "code", 237 | "execution_count": 7, 238 | "id": "bbdd7547-1607-4aa8-9e44-9e82f7356ef4", 239 | "metadata": {}, 240 | "outputs": [], 241 | "source": [ 242 | "query.stop()" 243 | ] 244 | }, 245 | { 246 | "cell_type": "code", 247 | "execution_count": 8, 248 | "id": "3fb39cc3-e38d-4bcb-96d3-7493e83f8c42", 249 | "metadata": {}, 250 | "outputs": [], 251 | "source": [ 252 | "spark.stop()" 253 | ] 254 | }, 255 | { 256 | "cell_type": "code", 257 | "execution_count": null, 258 | "id": "6410a564-9653-4ece-aeb4-e56df9bd881d", 259 | "metadata": {}, 260 | "outputs": [], 261 | "source": [] 262 | } 263 | ], 264 | "metadata": { 265 | "kernelspec": { 266 | "display_name": "Python 3 (ipykernel)", 267 | "language": "python", 268 | "name": "python3" 269 | }, 270 | "language_info": { 271 | "codemirror_mode": { 272 | "name": "ipython", 273 | "version": 3 274 | }, 275 | "file_extension": ".py", 276 | "mimetype": "text/x-python", 277 | "name": "python", 278 | "nbconvert_exporter": "python", 279 | "pygments_lexer": "ipython3", 280 | "version": "3.10.12" 281 | } 282 | }, 283 | "nbformat": 4, 284 | "nbformat_minor": 5 285 | } 286 | -------------------------------------------------------------------------------- /Chapter05/5.0 events-gen-kafka.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "id": "8393e835-14fb-4aa3-833f-d60aa5464018", 7 | "metadata": {}, 8 | "outputs": [ 9 | { 10 | "name": "stdout", 11 | "output_type": "stream", 12 | "text": [ 13 | "{'user_id': 47, 'event_type': 'purchase', 'event_time': '08/26/2023, 12:15:57', 'processing_time': '08/26/2023, 12:16:13'}\n", 14 | "{'user_id': 44, 'event_type': 'click', 'event_time': '08/26/2023, 12:16:14', 'processing_time': '08/26/2023, 12:16:24'}\n", 15 | "{'user_id': 63, 'event_type': 'click', 'event_time': '08/26/2023, 12:16:21', 'processing_time': '08/26/2023, 12:16:34'}\n", 16 | "{'user_id': 68, 'event_type': 
'purchase', 'event_time': '08/26/2023, 12:16:32', 'processing_time': '08/26/2023, 12:16:44'}\n", 17 | "{'user_id': 89, 'event_type': 'click', 'event_time': '08/26/2023, 12:16:32', 'processing_time': '08/26/2023, 12:16:54'}\n", 18 | "{'user_id': 15, 'event_type': 'view', 'event_time': '08/26/2023, 12:16:55', 'processing_time': '08/26/2023, 12:17:04'}\n", 19 | "{'user_id': 94, 'event_type': 'view', 'event_time': '08/26/2023, 12:16:54', 'processing_time': '08/26/2023, 12:17:14'}\n", 20 | "{'user_id': 45, 'event_type': 'view', 'event_time': '08/26/2023, 12:17:23', 'processing_time': '08/26/2023, 12:17:24'}\n", 21 | "{'user_id': 17, 'event_type': 'purchase', 'event_time': '08/26/2023, 12:17:23', 'processing_time': '08/26/2023, 12:17:34'}\n", 22 | "{'user_id': 48, 'event_type': 'share', 'event_time': '08/26/2023, 12:17:36', 'processing_time': '08/26/2023, 12:17:44'}\n", 23 | "{'user_id': 86, 'event_type': 'purchase', 'event_time': '08/26/2023, 12:17:48', 'processing_time': '08/26/2023, 12:17:54'}\n", 24 | "{'user_id': 92, 'event_type': 'like', 'event_time': '08/26/2023, 12:18:01', 'processing_time': '08/26/2023, 12:18:04'}\n", 25 | "{'user_id': 50, 'event_type': 'purchase', 'event_time': '08/26/2023, 12:18:12', 'processing_time': '08/26/2023, 12:18:14'}\n", 26 | "{'user_id': 94, 'event_type': 'share', 'event_time': '08/26/2023, 12:18:22', 'processing_time': '08/26/2023, 12:18:24'}\n", 27 | "{'user_id': 84, 'event_type': 'purchase', 'event_time': '08/26/2023, 12:18:30', 'processing_time': '08/26/2023, 12:18:34'}\n", 28 | "{'user_id': 54, 'event_type': 'click', 'event_time': '08/26/2023, 12:18:34', 'processing_time': '08/26/2023, 12:18:44'}\n", 29 | "{'user_id': 24, 'event_type': 'share', 'event_time': '08/26/2023, 12:18:41', 'processing_time': '08/26/2023, 12:18:54'}\n", 30 | "{'user_id': 92, 'event_type': 'share', 'event_time': '08/26/2023, 12:19:00', 'processing_time': '08/26/2023, 12:19:04'}\n", 31 | "{'user_id': 66, 'event_type': 'click', 'event_time': '08/26/2023, 12:18:55', 'processing_time': '08/26/2023, 12:19:14'}\n", 32 | "{'user_id': 72, 'event_type': 'view', 'event_time': '08/26/2023, 12:19:17', 'processing_time': '08/26/2023, 12:19:24'}\n", 33 | "{'user_id': 67, 'event_type': 'click', 'event_time': '08/26/2023, 12:19:23', 'processing_time': '08/26/2023, 12:19:34'}\n", 34 | "{'user_id': 80, 'event_type': 'purchase', 'event_time': '08/26/2023, 12:19:38', 'processing_time': '08/26/2023, 12:19:44'}\n", 35 | "{'user_id': 61, 'event_type': 'share', 'event_time': '08/26/2023, 12:19:39', 'processing_time': '08/26/2023, 12:19:54'}\n", 36 | "{'user_id': 54, 'event_type': 'purchase', 'event_time': '08/26/2023, 12:20:00', 'processing_time': '08/26/2023, 12:20:04'}\n", 37 | "{'user_id': 57, 'event_type': 'view', 'event_time': '08/26/2023, 12:20:13', 'processing_time': '08/26/2023, 12:20:14'}\n", 38 | "{'user_id': 21, 'event_type': 'like', 'event_time': '08/26/2023, 12:20:20', 'processing_time': '08/26/2023, 12:20:24'}\n", 39 | "{'user_id': 55, 'event_type': 'like', 'event_time': '08/26/2023, 12:20:21', 'processing_time': '08/26/2023, 12:20:34'}\n", 40 | "{'user_id': 55, 'event_type': 'purchase', 'event_time': '08/26/2023, 12:20:37', 'processing_time': '08/26/2023, 12:20:44'}\n", 41 | "{'user_id': 83, 'event_type': 'purchase', 'event_time': '08/26/2023, 12:20:51', 'processing_time': '08/26/2023, 12:20:54'}\n", 42 | "{'user_id': 46, 'event_type': 'share', 'event_time': '08/26/2023, 12:20:44', 'processing_time': '08/26/2023, 12:21:04'}\n", 43 | "{'user_id': 38, 'event_type': 'click', 'event_time': 
'08/26/2023, 12:21:09', 'processing_time': '08/26/2023, 12:21:14'}\n", 44 | "{'user_id': 84, 'event_type': 'like', 'event_time': '08/26/2023, 12:21:17', 'processing_time': '08/26/2023, 12:21:24'}\n", 45 | "{'user_id': 63, 'event_type': 'purchase', 'event_time': '08/26/2023, 12:21:10', 'processing_time': '08/26/2023, 12:21:34'}\n" 46 | ] 47 | }, 48 | { 49 | "ename": "KeyboardInterrupt", 50 | "evalue": "", 51 | "output_type": "error", 52 | "traceback": [ 53 | "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", 54 | "\u001b[0;31mKeyboardInterrupt\u001b[0m Traceback (most recent call last)", 55 | "Cell \u001b[0;32mIn[1], line 40\u001b[0m\n\u001b[1;32m 37\u001b[0m \u001b[38;5;66;03m# Send the event to the Kafka topic\u001b[39;00m\n\u001b[1;32m 39\u001b[0m producer\u001b[38;5;241m.\u001b[39msend(topic, value\u001b[38;5;241m=\u001b[39mjson\u001b[38;5;241m.\u001b[39mdumps(event)\u001b[38;5;241m.\u001b[39mencode(\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mutf-8\u001b[39m\u001b[38;5;124m'\u001b[39m))\n\u001b[0;32m---> 40\u001b[0m \u001b[43mtime\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43msleep\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m10\u001b[39;49m\u001b[43m)\u001b[49m\n", 56 | "\u001b[0;31mKeyboardInterrupt\u001b[0m: " 57 | ] 58 | } 59 | ], 60 | "source": [ 61 | "# Import the required modules\n", 62 | "import random\n", 63 | "import json\n", 64 | "from kafka import KafkaProducer\n", 65 | "import time\n", 66 | "\n", 67 | "# using datetime module\n", 68 | "import datetime;\n", 69 | "\n", 70 | "# Define the bootstrap servers and the topic name\n", 71 | "bootstrap_servers = \"kafka:9092\"\n", 72 | "topic = \"events\"\n", 73 | "\n", 74 | "# Create a Kafka producer with JSON value serializer\n", 75 | "producer = KafkaProducer(bootstrap_servers=bootstrap_servers)\n", 76 | "\n", 77 | "# Define a function to generate random event data\n", 78 | "def generate_event():\n", 79 | " # Generate a random user id from 1 to 100\n", 80 | " current_time = time.time()\n", 81 | " user_id = random.randint(1, 100)\n", 82 | " # Generate a random event type from a list of options\n", 83 | " event_type = random.choice([\"click\", \"view\", \"purchase\", \"like\", \"share\"])\n", 84 | " # Generate a random event time from 0 to 9999\n", 85 | " event_time = datetime.datetime.fromtimestamp(current_time- abs(random.normalvariate(0, 10))).strftime(\"%m/%d/%Y, %H:%M:%S\")\n", 86 | " # Generate a random event time from 0 to 9999\n", 87 | " processing_time =datetime.datetime.fromtimestamp(current_time).strftime(\"%m/%d/%Y, %H:%M:%S\")\n", 88 | " # Return a dictionary with the event data\n", 89 | " return {\"user_id\": user_id, \"event_type\": event_type, \"event_time\": event_time, \"processing_time\": processing_time}\n", 90 | "\n", 91 | "# Loop to generate and send events\n", 92 | "while True:\n", 93 | " # Generate a random event\n", 94 | " event = generate_event()\n", 95 | " # Print the event to the console\n", 96 | " print(event)\n", 97 | " # Send the event to the Kafka topic\n", 98 | " \n", 99 | " producer.send(topic, value=json.dumps(event).encode('utf-8'))\n", 100 | " time.sleep(10)\n" 101 | ] 102 | }, 103 | { 104 | "cell_type": "markdown", 105 | "id": "87081f46-4053-4df2-b262-aec86df50970", 106 | "metadata": { 107 | "jp-MarkdownHeadingCollapsed": true 108 | }, 109 | "source": [ 110 | "### " 111 | ] 112 | } 113 | ], 114 | "metadata": { 115 | "kernelspec": { 116 | "display_name": "Python 3 (ipykernel)", 117 | "language": "python", 118 | "name": "python3" 119 | }, 
120 | "language_info": { 121 | "codemirror_mode": { 122 | "name": "ipython", 123 | "version": 3 124 | }, 125 | "file_extension": ".py", 126 | "mimetype": "text/x-python", 127 | "name": "python", 128 | "nbconvert_exporter": "python", 129 | "pygments_lexer": "ipython3", 130 | "version": "3.10.12" 131 | } 132 | }, 133 | "nbformat": 4, 134 | "nbformat_minor": 5 135 | } 136 | -------------------------------------------------------------------------------- /Chapter06/6.1 monitor-spark-ui.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "id": "6bfbe105-2fd1-43f1-95e9-525d85226a13", 7 | "metadata": {}, 8 | "outputs": [ 9 | { 10 | "name": "stderr", 11 | "output_type": "stream", 12 | "text": [ 13 | "Setting default log level to \"WARN\".\n", 14 | "To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).\n", 15 | "23/09/03 11:00:25 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable\n" 16 | ] 17 | } 18 | ], 19 | "source": [ 20 | "from pyspark.sql import SparkSession\n", 21 | "\n", 22 | "# Create a new SparkSession\n", 23 | "spark = (SparkSession\n", 24 | " .builder\n", 25 | " .appName(\"monitor-spark-ui\")\n", 26 | " .master(\"spark://spark-master:7077\")\n", 27 | " .config(\"spark.executor.memory\", \"512m\")\n", 28 | " .getOrCreate())\n", 29 | "\n", 30 | "# Set log level to ERROR\n", 31 | "spark.sparkContext.setLogLevel(\"ERROR\")" 32 | ] 33 | }, 34 | { 35 | "cell_type": "code", 36 | "execution_count": 2, 37 | "id": "d348a886-776d-42db-936c-0d7339969642", 38 | "metadata": {}, 39 | "outputs": [], 40 | "source": [ 41 | "from pyspark.sql.types import StructType, StructField, StringType, IntegerType, DateType\n", 42 | "\n", 43 | "# Define a Schema\n", 44 | "schema = StructType([\n", 45 | " StructField(\"show_id\", StringType(), True),\n", 46 | " StructField(\"type\", StringType(), True),\n", 47 | " StructField(\"title\", StringType(), True),\n", 48 | " StructField(\"director\", StringType(), True),\n", 49 | " StructField(\"cast\", StringType(), True),\n", 50 | " StructField(\"country\", StringType(), True),\n", 51 | " StructField(\"date_added\", DateType(), True),\n", 52 | " StructField(\"release_year\", IntegerType(), True),\n", 53 | " StructField(\"rating\", StringType(), True),\n", 54 | " StructField(\"duration\", StringType(), True),\n", 55 | " StructField(\"listed_in\", StringType(), True),\n", 56 | " StructField(\"description\", StringType(), True)])\n" 57 | ] 58 | }, 59 | { 60 | "cell_type": "code", 61 | "execution_count": 3, 62 | "id": "8670a1c7-4876-4870-8a8c-aedb67ee703e", 63 | "metadata": {}, 64 | "outputs": [], 65 | "source": [ 66 | "# Read CSV file into a DataFrame\n", 67 | "df = (spark.read.format(\"csv\")\n", 68 | " .option(\"header\", \"true\")\n", 69 | " .schema(schema)\n", 70 | " .load(\"../data/netflix_titles.csv\"))" 71 | ] 72 | }, 73 | { 74 | "cell_type": "code", 75 | "execution_count": 4, 76 | "id": "0d196aa1-1af0-4100-9fee-17ad1edff93c", 77 | "metadata": {}, 78 | "outputs": [], 79 | "source": [ 80 | "# Filter rows where release_year ge is greater than 2020\n", 81 | "df = df.filter(df.release_year > 2020)" 82 | ] 83 | }, 84 | { 85 | "cell_type": "code", 86 | "execution_count": 5, 87 | "id": "cf6e81ec-736f-4955-b953-a9210cf55112", 88 | "metadata": {}, 89 | "outputs": [], 90 | "source": [ 91 | "# Group by country and count\n", 92 | "df = 
df.groupBy(\"country\").count()" 93 | ] 94 | }, 95 | { 96 | "cell_type": "code", 97 | "execution_count": 6, 98 | "id": "831d1ff5-cd30-47ba-91ee-d4e664c67917", 99 | "metadata": {}, 100 | "outputs": [ 101 | { 102 | "name": "stderr", 103 | "output_type": "stream", 104 | "text": [ 105 | "[Stage 0:> (0 + 1) / 1]\r" 106 | ] 107 | }, 108 | { 109 | "name": "stdout", 110 | "output_type": "stream", 111 | "text": [ 112 | "+--------------------+-----+\n", 113 | "| country|count|\n", 114 | "+--------------------+-----+\n", 115 | "|India, United Kin...| 1|\n", 116 | "|France, United St...| 3|\n", 117 | "| Sweden| 3|\n", 118 | "| Turkey| 5|\n", 119 | "|China, United Sta...| 1|\n", 120 | "| Germany| 5|\n", 121 | "| Jordan| 1|\n", 122 | "| France| 7|\n", 123 | "| Uruguay, Germany| 1|\n", 124 | "|United States, India| 1|\n", 125 | "|Belgium, United K...| 1|\n", 126 | "| null| 208|\n", 127 | "| Argentina| 2|\n", 128 | "|Mexico, United St...| 1|\n", 129 | "| Belgium| 2|\n", 130 | "| India| 31|\n", 131 | "| United States| 137|\n", 132 | "| China| 4|\n", 133 | "|United States, Cz...| 1|\n", 134 | "|United States, Japan| 2|\n", 135 | "+--------------------+-----+\n", 136 | "only showing top 20 rows\n", 137 | "\n" 138 | ] 139 | }, 140 | { 141 | "name": "stderr", 142 | "output_type": "stream", 143 | "text": [ 144 | " \r" 145 | ] 146 | } 147 | ], 148 | "source": [ 149 | "# Show the result\n", 150 | "df.show()" 151 | ] 152 | }, 153 | { 154 | "cell_type": "code", 155 | "execution_count": null, 156 | "id": "030c5f42-391f-4b5f-93ff-efa79ae4bf36", 157 | "metadata": {}, 158 | "outputs": [], 159 | "source": [] 160 | } 161 | ], 162 | "metadata": { 163 | "kernelspec": { 164 | "display_name": "Python 3 (ipykernel)", 165 | "language": "python", 166 | "name": "python3" 167 | }, 168 | "language_info": { 169 | "codemirror_mode": { 170 | "name": "ipython", 171 | "version": 3 172 | }, 173 | "file_extension": ".py", 174 | "mimetype": "text/x-python", 175 | "name": "python", 176 | "nbconvert_exporter": "python", 177 | "pygments_lexer": "ipython3", 178 | "version": "3.10.12" 179 | } 180 | }, 181 | "nbformat": 4, 182 | "nbformat_minor": 5 183 | } 184 | -------------------------------------------------------------------------------- /Chapter06/6.2 broadcast-variables.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 12, 6 | "id": "3d05d7a1-fe97-491a-a177-c1886a5f8baf", 7 | "metadata": {}, 8 | "outputs": [], 9 | "source": [ 10 | "from pyspark.sql import SparkSession \n", 11 | "from pyspark.sql.functions import rand, when, pandas_udf, PandasUDFType\n", 12 | "from pyspark.sql.types import BooleanType\n", 13 | "import pandas as pd" 14 | ] 15 | }, 16 | { 17 | "cell_type": "code", 18 | "execution_count": 2, 19 | "id": "6bfbe105-2fd1-43f1-95e9-525d85226a13", 20 | "metadata": {}, 21 | "outputs": [ 22 | { 23 | "name": "stderr", 24 | "output_type": "stream", 25 | "text": [ 26 | "Setting default log level to \"WARN\".\n", 27 | "To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).\n", 28 | "24/02/21 12:33:51 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... 
using builtin-java classes where applicable\n" 29 | ] 30 | } 31 | ], 32 | "source": [ 33 | "# Create a new SparkSession\n", 34 | "spark = (SparkSession\n", 35 | " .builder\n", 36 | " .appName(\"broadcast-variables\")\n", 37 | " .master(\"spark://spark-master:7077\")\n", 38 | " .config(\"spark.executor.memory\", \"512m\")\n", 39 | " .getOrCreate())\n", 40 | "\n", 41 | "# Set log level to ERROR\n", 42 | "spark.sparkContext.setLogLevel(\"ERROR\")" 43 | ] 44 | }, 45 | { 46 | "cell_type": "code", 47 | "execution_count": 7, 48 | "id": "2ae2e1f7-45c3-486a-970e-0727eb303197", 49 | "metadata": {}, 50 | "outputs": [ 51 | { 52 | "name": "stderr", 53 | "output_type": "stream", 54 | "text": [ 55 | "[Stage 0:> (0 + 1) / 1]\r" 56 | ] 57 | }, 58 | { 59 | "name": "stdout", 60 | "output_type": "stream", 61 | "text": [ 62 | "+---+------+------+------------+\n", 63 | "| id|salary|gender|country_code|\n", 64 | "+---+------+------+------------+\n", 65 | "| 0| 8000| M| US|\n", 66 | "| 1| 3500| F| null|\n", 67 | "| 2| 9700| F| null|\n", 68 | "| 3| 4800| F| null|\n", 69 | "| 4| 9100| F| null|\n", 70 | "+---+------+------+------------+\n", 71 | "only showing top 5 rows\n", 72 | "\n" 73 | ] 74 | }, 75 | { 76 | "name": "stderr", 77 | "output_type": "stream", 78 | "text": [ 79 | " \r" 80 | ] 81 | } 82 | ], 83 | "source": [ 84 | "# Create some sample data frames\n", 85 | "# A large data frame with 1 million rows\n", 86 | "large_df = (spark.range(0, 1000000)\n", 87 | " .withColumn(\"salary\", 100*(rand() * 100).cast(\"int\"))\n", 88 | " .withColumn(\"gender\", when((rand() * 2).cast(\"int\") == 0, \"M\").otherwise(\"F\"))\n", 89 | " .withColumn(\"country_code\", \n", 90 | " when((rand() * 4).cast(\"int\") == 0, \"US\")\n", 91 | " .when((rand() * 4).cast(\"int\") == 1, \"CN\")\n", 92 | " .when((rand() * 4).cast(\"int\") == 2, \"IN\")\n", 93 | " .when((rand() * 4).cast(\"int\") == 3, \"BR\")))\n", 94 | "large_df.show(5)" 95 | ] 96 | }, 97 | { 98 | "cell_type": "code", 99 | "execution_count": 8, 100 | "id": "5e1b2f38-375a-4c39-94ab-45597162caf2", 101 | "metadata": {}, 102 | "outputs": [], 103 | "source": [ 104 | "# Define lookup table\n", 105 | "lookup = {\"US\": \"United States\", \"CN\": \"China\", \"IN\": \"India\", \"BR\": \"Brazil\", \"RU\": \"Russia\"}\n", 106 | "\n", 107 | "# Create broadcast variable\n", 108 | "broadcast_lookup = spark.sparkContext.broadcast(lookup)" 109 | ] 110 | }, 111 | { 112 | "cell_type": "code", 113 | "execution_count": 9, 114 | "id": "11da2278-39a4-43a3-a650-413edce1ba0c", 115 | "metadata": {}, 116 | "outputs": [ 117 | { 118 | "name": "stderr", 119 | "output_type": "stream", 120 | "text": [ 121 | "/usr/local/lib/python3.10/dist-packages/pyspark/sql/pandas/functions.py:399: UserWarning: In Python 3.6+ and Spark 3.0+, it is preferred to specify type hints for pandas UDF instead of specifying pandas UDF type which will be deprecated in the future releases. 
See SPARK-28264 for more details.\n", 122 | " warnings.warn(\n" 123 | ] 124 | } 125 | ], 126 | "source": [ 127 | "@pandas_udf('string', PandasUDFType.SCALAR)\n", 128 | "def country_convert(s):\n", 129 | " return s.map(broadcast_lookup.value)" 130 | ] 131 | }, 132 | { 133 | "cell_type": "code", 134 | "execution_count": 10, 135 | "id": "70902427-3e46-48af-9f3b-f1b3a3619cb7", 136 | "metadata": {}, 137 | "outputs": [ 138 | { 139 | "name": "stderr", 140 | "output_type": "stream", 141 | "text": [ 142 | "[Stage 1:> (0 + 1) / 1]\r" 143 | ] 144 | }, 145 | { 146 | "name": "stdout", 147 | "output_type": "stream", 148 | "text": [ 149 | "+---+------+------+------------+-------------+\n", 150 | "| id|salary|gender|country_code| country_name|\n", 151 | "+---+------+------+------------+-------------+\n", 152 | "| 0| 8000| M| US|United States|\n", 153 | "| 1| 3500| F| null| null|\n", 154 | "| 2| 9700| F| null| null|\n", 155 | "| 3| 4800| F| null| null|\n", 156 | "| 4| 9100| F| null| null|\n", 157 | "+---+------+------+------------+-------------+\n", 158 | "only showing top 5 rows\n", 159 | "\n" 160 | ] 161 | }, 162 | { 163 | "name": "stderr", 164 | "output_type": "stream", 165 | "text": [ 166 | " \r" 167 | ] 168 | } 169 | ], 170 | "source": [ 171 | "large_df.withColumn(\"country_name\", country_convert(large_df.country_code)).show(5)" 172 | ] 173 | }, 174 | { 175 | "cell_type": "code", 176 | "execution_count": 13, 177 | "id": "d2492f49-744d-44ea-a253-1b73c1d04710", 178 | "metadata": {}, 179 | "outputs": [], 180 | "source": [ 181 | "@pandas_udf(BooleanType(), PandasUDFType.SCALAR)\n", 182 | "def filter_unknown_country(s):\n", 183 | " return s.isin(broadcast_lookup.value)" 184 | ] 185 | }, 186 | { 187 | "cell_type": "code", 188 | "execution_count": 14, 189 | "id": "480aba43-0e1d-4f84-82e0-c7cdaa9debb5", 190 | "metadata": {}, 191 | "outputs": [ 192 | { 193 | "name": "stderr", 194 | "output_type": "stream", 195 | "text": [ 196 | "[Stage 2:> (0 + 1) / 1]\r" 197 | ] 198 | }, 199 | { 200 | "name": "stdout", 201 | "output_type": "stream", 202 | "text": [ 203 | "+---+------+------+------------+\n", 204 | "| id|salary|gender|country_code|\n", 205 | "+---+------+------+------------+\n", 206 | "| 0| 8000| M| US|\n", 207 | "| 6| 3400| F| US|\n", 208 | "| 7| 8400| M| CN|\n", 209 | "| 8| 1100| F| US|\n", 210 | "| 9| 2900| M| CN|\n", 211 | "+---+------+------+------------+\n", 212 | "only showing top 5 rows\n", 213 | "\n" 214 | ] 215 | }, 216 | { 217 | "name": "stderr", 218 | "output_type": "stream", 219 | "text": [ 220 | " \r" 221 | ] 222 | } 223 | ], 224 | "source": [ 225 | "large_df.filter(filter_unknown_country(large_df.country_code)).show(5)" 226 | ] 227 | }, 228 | { 229 | "cell_type": "code", 230 | "execution_count": 15, 231 | "id": "526788c3-4a1d-4314-b2cc-b8f3c13683c4", 232 | "metadata": {}, 233 | "outputs": [], 234 | "source": [ 235 | "spark.stop()" 236 | ] 237 | }, 238 | { 239 | "cell_type": "code", 240 | "execution_count": null, 241 | "id": "875d3099-a7e6-48d2-ac28-09985403102d", 242 | "metadata": {}, 243 | "outputs": [], 244 | "source": [] 245 | } 246 | ], 247 | "metadata": { 248 | "kernelspec": { 249 | "display_name": "Python 3 (ipykernel)", 250 | "language": "python", 251 | "name": "python3" 252 | }, 253 | "language_info": { 254 | "codemirror_mode": { 255 | "name": "ipython", 256 | "version": 3 257 | }, 258 | "file_extension": ".py", 259 | "mimetype": "text/x-python", 260 | "name": "python", 261 | "nbconvert_exporter": "python", 262 | "pygments_lexer": "ipython3", 263 | "version": "3.10.12" 264 | } 265 | }, 
266 | "nbformat": 4, 267 | "nbformat_minor": 5 268 | } 269 | -------------------------------------------------------------------------------- /Chapter06/6.5 cache-and-persist.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 6, 6 | "id": "6020138a-0dc8-4b7e-bad8-9cf2ef7133aa", 7 | "metadata": {}, 8 | "outputs": [], 9 | "source": [ 10 | "from pyspark.sql import SparkSession \n", 11 | "from pyspark import StorageLevel \n", 12 | "from pyspark.sql.functions import rand, current_date, date_sub" 13 | ] 14 | }, 15 | { 16 | "cell_type": "code", 17 | "execution_count": 2, 18 | "id": "2b24dc6e-9c01-4a8b-9b6e-0d71b953bd82", 19 | "metadata": {}, 20 | "outputs": [ 21 | { 22 | "name": "stderr", 23 | "output_type": "stream", 24 | "text": [ 25 | "Setting default log level to \"WARN\".\n", 26 | "To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).\n", 27 | "24/02/21 13:44:30 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable\n" 28 | ] 29 | } 30 | ], 31 | "source": [ 32 | "spark = (SparkSession.builder\n", 33 | " .appName(\"cache-and-persist\")\n", 34 | " .master(\"spark://spark-master:7077\")\n", 35 | " .config(\"spark.executor.memory\", \"512m\")\n", 36 | " .getOrCreate())\n", 37 | "\n", 38 | "spark.sparkContext.setLogLevel(\"ERROR\")" 39 | ] 40 | }, 41 | { 42 | "cell_type": "code", 43 | "execution_count": 9, 44 | "id": "8e6169b6-ba60-4544-bf4f-b58cbb2e3d22", 45 | "metadata": {}, 46 | "outputs": [], 47 | "source": [ 48 | "# Define a function to measure the execution time of a query\n", 49 | "import time\n", 50 | "\n", 51 | "def measure_time(query):\n", 52 | " start = time.time()\n", 53 | " query.collect() # Force the query execution by calling an action\n", 54 | " end = time.time()\n", 55 | " print(f\"Execution time: {end - start} seconds\")" 56 | ] 57 | }, 58 | { 59 | "cell_type": "code", 60 | "execution_count": 3, 61 | "id": "1d243df0-aa88-4ea7-a363-76e00eafc3cc", 62 | "metadata": {}, 63 | "outputs": [ 64 | { 65 | "name": "stderr", 66 | "output_type": "stream", 67 | "text": [ 68 | "[Stage 0:> (0 + 1) / 1]\r" 69 | ] 70 | }, 71 | { 72 | "name": "stdout", 73 | "output_type": "stream", 74 | "text": [ 75 | "+---+----------+---------+\n", 76 | "| id| date|ProductId|\n", 77 | "+---+----------+---------+\n", 78 | "| 0|2024-02-10| 67|\n", 79 | "| 1|2023-07-12| 39|\n", 80 | "| 2|2023-08-10| 8|\n", 81 | "| 3|2023-05-22| 29|\n", 82 | "| 4|2023-06-22| 63|\n", 83 | "+---+----------+---------+\n", 84 | "only showing top 5 rows\n", 85 | "\n" 86 | ] 87 | }, 88 | { 89 | "name": "stderr", 90 | "output_type": "stream", 91 | "text": [ 92 | " \r" 93 | ] 94 | } 95 | ], 96 | "source": [ 97 | "# Create some sample data frames\n", 98 | "# A large data frame with 10 million rows and two columns: id and value\n", 99 | "large_df = (spark.range(0, 10000000)\n", 100 | " .withColumn(\"date\", date_sub(current_date(), (rand() * 365).cast(\"int\")))\n", 101 | " .withColumn(\"ProductId\", (rand() * 100).cast(\"int\")))\n", 102 | "large_df.show(5)" 103 | ] 104 | }, 105 | { 106 | "cell_type": "code", 107 | "execution_count": 4, 108 | "id": "1bb8e6b1-94fa-4d45-9b95-ef6887b8e4c1", 109 | "metadata": {}, 110 | "outputs": [ 111 | { 112 | "name": "stdout", 113 | "output_type": "stream", 114 | "text": [ 115 | "Disk Memory Deserialized 1x Replicated\n" 116 | ] 117 | } 118 | ], 119 | "source": [ 120 | "# Cache the DataFrame 
using cache() method\n", 121 | "large_df.cache()\n", 122 | "# Check the storage level of the cached DataFrame\n", 123 | "print(large_df.storageLevel)" 124 | ] 125 | }, 126 | { 127 | "cell_type": "code", 128 | "execution_count": 7, 129 | "id": "f7d80802-2095-47fa-83be-672a56f8c71c", 130 | "metadata": {}, 131 | "outputs": [ 132 | { 133 | "name": "stdout", 134 | "output_type": "stream", 135 | "text": [ 136 | "Disk Memory Deserialized 1x Replicated\n" 137 | ] 138 | } 139 | ], 140 | "source": [ 141 | "# Persist the DataFrame using persist() method with a different storage level\n", 142 | "large_df.persist(StorageLevel.MEMORY_AND_DISK_DESER)\n", 143 | "# Check the storage level of the persisted DataFrame\n", 144 | "print(large_df.storageLevel)" 145 | ] 146 | }, 147 | { 148 | "cell_type": "code", 149 | "execution_count": 10, 150 | "id": "760553dd-2781-4b03-9e68-7c7442e6dc4b", 151 | "metadata": {}, 152 | "outputs": [ 153 | { 154 | "name": "stderr", 155 | "output_type": "stream", 156 | "text": [ 157 | " \r" 158 | ] 159 | }, 160 | { 161 | "name": "stdout", 162 | "output_type": "stream", 163 | "text": [ 164 | "Execution time: 8.600075006484985 seconds\n", 165 | "+---------+---------+\n", 166 | "|ProductId|count(Id)|\n", 167 | "+---------+---------+\n", 168 | "| 31| 99961|\n", 169 | "| 85| 99746|\n", 170 | "| 65| 100023|\n", 171 | "| 53| 100615|\n", 172 | "| 78| 99985|\n", 173 | "+---------+---------+\n", 174 | "only showing top 5 rows\n", 175 | "\n" 176 | ] 177 | } 178 | ], 179 | "source": [ 180 | "results_df = large_df.groupBy(\"ProductId\").agg({\"Id\": \"count\"}) \n", 181 | "measure_time(results_df)\n", 182 | "# Show the result\n", 183 | "results_df.show(5)" 184 | ] 185 | }, 186 | { 187 | "cell_type": "code", 188 | "execution_count": 11, 189 | "id": "58fe1655-0f67-481c-8fc0-061ea5360e7c", 190 | "metadata": {}, 191 | "outputs": [ 192 | { 193 | "name": "stdout", 194 | "output_type": "stream", 195 | "text": [ 196 | "Execution time: 0.984121561050415 seconds\n" 197 | ] 198 | }, 199 | { 200 | "name": "stderr", 201 | "output_type": "stream", 202 | "text": [ 203 | "[Stage 10:=============================> (1 + 1) / 2]\r" 204 | ] 205 | }, 206 | { 207 | "name": "stdout", 208 | "output_type": "stream", 209 | "text": [ 210 | "+---------+---------+\n", 211 | "|ProductId|count(Id)|\n", 212 | "+---------+---------+\n", 213 | "| 31| 99961|\n", 214 | "| 85| 99746|\n", 215 | "| 65| 100023|\n", 216 | "| 53| 100615|\n", 217 | "| 78| 99985|\n", 218 | "+---------+---------+\n", 219 | "only showing top 5 rows\n", 220 | "\n" 221 | ] 222 | }, 223 | { 224 | "name": "stderr", 225 | "output_type": "stream", 226 | "text": [ 227 | " \r" 228 | ] 229 | } 230 | ], 231 | "source": [ 232 | "results_df = large_df.groupBy(\"ProductId\").agg({\"Id\": \"count\"}) \n", 233 | "measure_time(results_df)\n", 234 | "# Show the result\n", 235 | "results_df.show(5)" 236 | ] 237 | }, 238 | { 239 | "cell_type": "code", 240 | "execution_count": 12, 241 | "id": "e6b5a908-2260-467c-96a3-6245ebba3198", 242 | "metadata": {}, 243 | "outputs": [ 244 | { 245 | "name": "stdout", 246 | "output_type": "stream", 247 | "text": [ 248 | "Serialized 1x Replicated\n" 249 | ] 250 | } 251 | ], 252 | "source": [ 253 | "# Unpersist the DataFrame using unpersist() method\n", 254 | "large_df.unpersist()\n", 255 | "# Check the storage level of the unpersisted DataFrame\n", 256 | "print(large_df.storageLevel)" 257 | ] 258 | }, 259 | { 260 | "cell_type": "code", 261 | "execution_count": 13, 262 | "id": "e360de96-05cf-4575-92a4-567200d91f06", 263 | "metadata": {}, 264 | 
"outputs": [], 265 | "source": [ 266 | "spark.stop()" 267 | ] 268 | }, 269 | { 270 | "cell_type": "code", 271 | "execution_count": null, 272 | "id": "3e55fe9f-b1e7-4885-92f2-d48e758928c6", 273 | "metadata": {}, 274 | "outputs": [], 275 | "source": [] 276 | } 277 | ], 278 | "metadata": { 279 | "kernelspec": { 280 | "display_name": "Python 3 (ipykernel)", 281 | "language": "python", 282 | "name": "python3" 283 | }, 284 | "language_info": { 285 | "codemirror_mode": { 286 | "name": "ipython", 287 | "version": 3 288 | }, 289 | "file_extension": ".py", 290 | "mimetype": "text/x-python", 291 | "name": "python", 292 | "nbconvert_exporter": "python", 293 | "pygments_lexer": "ipython3", 294 | "version": "3.10.12" 295 | } 296 | }, 297 | "nbformat": 4, 298 | "nbformat_minor": 5 299 | } 300 | -------------------------------------------------------------------------------- /Chapter08/8.1 building-databricks-workflow.yml: -------------------------------------------------------------------------------- 1 | resources: 2 | jobs: 3 | On_Shelf_Availability_Workflow: 4 | name: On-Shelf-Availability Workflow 5 | schedule: 6 | quartz_cron_expression: 36 0 0 * * ? 7 | timezone_id: UTC 8 | pause_status: PAUSED 9 | tasks: 10 | - task_key: Setup 11 | notebook_task: 12 | notebook_path: /Repos/pulkit.chadha.packt@gmail.com/Data-Engineering-with-Apache-Spark-and-Delta-Lake-Cookbook/Chapter08/Setup 13 | base_parameters: 14 | catalog: main 15 | schema: on_shelf_availability 16 | source: WORKSPACE 17 | job_cluster_key: Job_cluster 18 | - task_key: Download_Inventory_Data 19 | depends_on: 20 | - task_key: Setup 21 | notebook_task: 22 | notebook_path: /Repos/pulkit.chadha.packt@gmail.com/Data-Engineering-with-Apache-Spark-and-Delta-Lake-Cookbook/Chapter08/Download 23 | Inventory Data 24 | base_parameters: 25 | catalog: main 26 | schema: on_shelf_availability 27 | source: WORKSPACE 28 | job_cluster_key: Job_cluster 29 | job_clusters: 30 | - job_cluster_key: Job_cluster 31 | new_cluster: 32 | cluster_name: "" 33 | spark_version: 14.2.x-scala2.12 34 | aws_attributes: 35 | first_on_demand: 1 36 | availability: SPOT_WITH_FALLBACK 37 | zone_id: us-west-2a 38 | spot_bid_price_percent: 100 39 | ebs_volume_count: 0 40 | node_type_id: i4i.large 41 | enable_elastic_disk: false 42 | data_security_mode: SINGLE_USER 43 | runtime_engine: STANDARD 44 | num_workers: 2 -------------------------------------------------------------------------------- /Chapter08/8.4 conditional-branching.yml: -------------------------------------------------------------------------------- 1 | resources: 2 | jobs: 3 | On_Shelf_Availability_Workflow: 4 | name: On-Shelf-Availability Workflow 5 | schedule: 6 | quartz_cron_expression: 36 0 0 * * ? 
7 | timezone_id: UTC 8 | pause_status: PAUSED 9 | tasks: 10 | - task_key: Setup 11 | notebook_task: 12 | notebook_path: /Repos/pulkit.chadha.packt@gmail.com/Data-Engineering-with-Apache-Spark-and-Delta-Lake-Cookbook/Chapter08/Setup 13 | base_parameters: 14 | catalog: main 15 | schema: on_shelf_availability 16 | source: WORKSPACE 17 | job_cluster_key: Job_cluster 18 | - task_key: Download_Inventory_Data 19 | depends_on: 20 | - task_key: Setup 21 | notebook_task: 22 | notebook_path: /Repos/pulkit.chadha.packt@gmail.com/Data-Engineering-with-Apache-Spark-and-Delta-Lake-Cookbook/Chapter08/Download 23 | Inventory Data 24 | base_parameters: 25 | catalog: main 26 | schema: on_shelf_availability 27 | source: WORKSPACE 28 | job_cluster_key: Job_cluster 29 | - task_key: Check_File_Size 30 | depends_on: 31 | - task_key: Download_Inventory_Data 32 | condition_task: 33 | op: GREATER_THAN 34 | left: "{{tasks.Download_Inventory_Data.values.file_size}}" 35 | right: "0" 36 | - task_key: OSA_Data_Preparation_DLT 37 | depends_on: 38 | - task_key: Check_File_Size 39 | outcome: "true" 40 | pipeline_task: 41 | pipeline_id: 970bb4d4-35f0-4169-91b9-bf522f95076c 42 | full_refresh: true 43 | - task_key: Cleanup 44 | depends_on: 45 | - task_key: Download_Inventory_Data 46 | - task_key: Setup 47 | - task_key: Check_File_Size 48 | outcome: "false" 49 | run_if: AT_LEAST_ONE_FAILED 50 | notebook_task: 51 | notebook_path: /Repos/pulkit.chadha.packt@gmail.com/Data-Engineering-with-Apache-Spark-and-Delta-Lake-Cookbook/Chapter08/Clean 52 | Up 53 | base_parameters: 54 | catalog: main 55 | schema: on_shelf_availability 56 | source: WORKSPACE 57 | job_cluster_key: Job_cluster 58 | job_clusters: 59 | - job_cluster_key: Job_cluster 60 | new_cluster: 61 | cluster_name: "" 62 | spark_version: 14.2.x-scala2.12 63 | aws_attributes: 64 | first_on_demand: 1 65 | availability: SPOT_WITH_FALLBACK 66 | zone_id: us-west-2a 67 | spot_bid_price_percent: 100 68 | ebs_volume_count: 0 69 | node_type_id: i4i.large 70 | enable_elastic_disk: false 71 | data_security_mode: SINGLE_USER 72 | runtime_engine: STANDARD 73 | num_workers: 2 74 | -------------------------------------------------------------------------------- /Chapter08/Clean Up.sql: -------------------------------------------------------------------------------- 1 | -- Databricks notebook source 2 | CREATE WIDGET TEXT catalog DEFAULT "main"; 3 | CREATE WIDGET TEXT schema DEFAULT "on_shelf_availability"; 4 | 5 | -- COMMAND ---------- 6 | 7 | USE CATALOG ${catalog} 8 | USE SCHEMA ${schema}; 9 | 10 | -- COMMAND ---------- 11 | 12 | DROP VOLUME IF NOT EXISTS data; 13 | DROP SCHEMA IF NOT EXISTS ${schema} CASCADE; 14 | -------------------------------------------------------------------------------- /Chapter08/Data Preparation DLT.sql: -------------------------------------------------------------------------------- 1 | -- Databricks notebook source 2 | CREATE 3 | OR REFRESH STREAMING LIVE TABLE inventory_raw AS 4 | SELECT 5 | * 6 | FROM 7 | cloud_files( 8 | "/Volumes/dbdemos/on-shelf-availability/data/osa_raw_data*.csv", 9 | "csv", 10 | map( 11 | "cloudFiles.inferColumnTypes", 12 | "true", 13 | "dateFormat", 14 | "yyyyMMdd", 15 | "cloudFiles.schemaHints", 16 | "date DATE" 17 | ) 18 | ) 19 | 20 | -- COMMAND ---------- 21 | 22 | CREATE 23 | OR REFRESH STREAMING LIVE TABLE vendor AS 24 | SELECT 25 | * 26 | FROM 27 | cloud_files( 28 | "/Volumes/dbdemos/on-shelf-availability/data/vendor_leadtime_info*.csv", 29 | "csv", 30 | map("cloudFiles.inferColumnTypes", "true") 31 | ) 32 | 33 | -- COMMAND 
---------- 34 | 35 | 36 | 37 | -- COMMAND ---------- 38 | 39 | CREATE 40 | OR REFRESH LIVE TABLE inventory AS 41 | SELECT 42 | cross_view.date, 43 | cross_view.store_id, 44 | cross_view.sku, 45 | int_data.product_category, 46 | int_data.total_sales_units, 47 | int_data.on_hand_inventory_units, 48 | int_data.replenishment_units, 49 | int_data.inventory_pipeline, 50 | int_data.units_in_transit, 51 | int_data.units_in_dc, 52 | int_data.units_on_order, 53 | int_data.units_under_promotion, 54 | int_data.shelf_capacity, 55 | CASE WHEN int_data.units_under_promotion > 0 THEN 1 ELSE 0 END as promotion_flag, 56 | CASE WHEN int_data.replenishment_units > 0 THEN 1 ELSE 0 END as replenishment_flag 57 | FROM 58 | ( 59 | SELECT 60 | to_date( 61 | date_add('2019-01-01', cast(abs(t.id) as int)), 62 | 'yy-MM-dd' 63 | ) as date, 64 | store_id, 65 | sku 66 | FROM 67 | range(datediff('2019-01-01', '2021-05-03'), 1) AS t 68 | CROSS JOIN ( 69 | SELECT 70 | store_id, 71 | sku 72 | FROM 73 | live.inventory_raw 74 | GROUP BY 75 | ALL 76 | ) 77 | ) cross_view 78 | LEFT OUTER JOIN live.inventory_raw int_data ON cross_view.date = int_data.date 79 | AND cross_view.store_id = int_data.store_id 80 | AND cross_view.sku = int_data.sku 81 | 82 | -- COMMAND ---------- 83 | 84 | 85 | -------------------------------------------------------------------------------- /Chapter08/Download Inventory Data.sql: -------------------------------------------------------------------------------- 1 | -- Databricks notebook source 2 | CREATE WIDGET TEXT catalog DEFAULT "main"; 3 | CREATE WIDGET TEXT schema DEFAULT "on_shelf_availability"; 4 | 5 | 6 | -- COMMAND ---------- 7 | 8 | -- MAGIC %python 9 | -- MAGIC import os 10 | -- MAGIC os.environ['catalog']=dbutils.widgets.get("catalog") 11 | -- MAGIC os.environ['schema']=dbutils.widgets.get("schema") 12 | -- MAGIC os.environ['volumeName']=dbutils.jobs.taskValues.get(taskKey = "Setup", key = "volumeName", default = "data", debugValue = "data") 13 | 14 | -- COMMAND ---------- 15 | 16 | -- MAGIC %sh 17 | -- MAGIC cd /Volumes/$catalog/$schema/${volumeName} 18 | -- MAGIC wget https://raw.githubusercontent.com/tredenceofficial/OSA-Data/main/osa_raw_data.csv 19 | -- MAGIC wget https://raw.githubusercontent.com/tredenceofficial/OSA-Data/main/vendor_leadtime_info.csv 20 | 21 | -- COMMAND ---------- 22 | 23 | -- MAGIC %python 24 | -- MAGIC import os 25 | -- MAGIC file_path = f'/Volumes/{dbutils.widgets.get("catalog")}/{dbutils.widgets.get("schema")}/{os.environ["volumeName"]}/osa_raw_data.csv' 26 | -- MAGIC if os.path.exists(file_path): 27 | -- MAGIC file_size = os.path.getsize(file_path) 28 | -- MAGIC dbutils.jobs.taskValues.set(key='file_size', value=file_size) 29 | -- MAGIC print(file_size) 30 | -- MAGIC else: 31 | -- MAGIC print(f"{file_path} does not exist") 32 | -- MAGIC dbutils.jobs.taskValues.set(key='file_size', value=0) 33 | 34 | -- COMMAND ---------- 35 | 36 | 37 | -------------------------------------------------------------------------------- /Chapter08/Setup.sql: -------------------------------------------------------------------------------- 1 | -- Databricks notebook source 2 | CREATE WIDGET TEXT catalog DEFAULT "main"; 3 | CREATE WIDGET TEXT schema DEFAULT "on_shelf_availability"; 4 | 5 | -- COMMAND ---------- 6 | 7 | USE CATALOG ${catalog} 8 | 9 | -- COMMAND ---------- 10 | 11 | CREATE SCHEMA IF NOT EXISTS ${schema}; 12 | 13 | -- COMMAND ---------- 14 | 15 | USE SCHEMA ${schema}; 16 | 17 | -- COMMAND ---------- 18 | 19 | CREATE VOLUME IF NOT EXISTS data; 20 | 21 | -- COMMAND 
---------- 22 | 23 | -- MAGIC %python 24 | -- MAGIC dbutils.jobs.taskValues.set(key = 'volumeName', value = 'data') 25 | -------------------------------------------------------------------------------- /Chapter09/9.1 create-medallion-arch-DLT.sql: -------------------------------------------------------------------------------- 1 | -- Databricks notebook source 2 | CREATE 3 | OR REFRESH STREAMING TABLE device_data AS 4 | SELECT 5 | * 6 | FROM 7 | cloud_files( 8 | "/databricks-datasets/iot-stream/data-device", 9 | "json", 10 | map("cloudFiles.inferColumnTypes", "true") 11 | ) 12 | 13 | -- COMMAND ---------- 14 | 15 | CREATE 16 | OR REFRESH STREAMING TABLE user_data AS 17 | SELECT 18 | * 19 | FROM 20 | cloud_files( 21 | "/databricks-datasets/iot-stream/data-user", 22 | "csv", 23 | map("cloudFiles.inferColumnTypes", "true") 24 | ) 25 | 26 | -- COMMAND ---------- 27 | 28 | CREATE 29 | OR REFRESH STREAMING TABLE user_data_prepared ( 30 | CONSTRAINT valid_user EXPECT (user_id IS NOT NULL) ON VIOLATION DROP ROW 31 | ) -- COMMENT "" 32 | AS 33 | SELECT 34 | users.userid as user_id, 35 | CASE 36 | WHEN users.gender = 'F' THEN 'Female' 37 | WHEN users.gender = 'M' THEN 'Male' 38 | END AS gender, 39 | users.age, 40 | users.height, 41 | users.weight, 42 | CAST(users.smoker as BOOLEAN) AS isSmoker, 43 | CAST(users.familyhistory as BOOLEAN) AS hasFamilyHistory, 44 | users.cholestlevs AS cholestrolLevels, 45 | users.bp AS bloodPressure, 46 | users.risk 47 | FROM 48 | STREAM(live.user_data) users; 49 | 50 | -- COMMAND ---------- 51 | 52 | CREATE 53 | OR REFRESH STREAMING TABLE device_data_prepared ( 54 | CONSTRAINT valid_timestamp EXPECT (timestamp IS NOT NULL) 55 | ) -- COMMENT "" 56 | AS 57 | SELECT 58 | device.id, 59 | device.device_id, 60 | device.user_id, 61 | device.calories_burnt, 62 | device.miles_walked, 63 | device.num_steps, 64 | CAST(device.timestamp as TIMESTAMP) AS timestamp 65 | FROM 66 | STREAM(live.device_data) device 67 | 68 | -- COMMAND ---------- 69 | 70 | CREATE 71 | OR REFRESH LIVE TABLE user_metrics AS 72 | SELECT 73 | users.user_id, 74 | users.gender, 75 | users.age, 76 | users.height, 77 | users.weight, 78 | users.isSmoker, 79 | users.hasFamilyHistory, 80 | users.cholestrolLevels, 81 | users.bloodPressure, 82 | users.risk, 83 | SUM(devices.calories_burnt) AS totalCaloriesBurnt, 84 | SUM(devices.miles_walked) AS totalMilesWalked, 85 | SUM(devices.num_steps) AS totalNumberOfSteps 86 | FROM 87 | live.user_data_prepared users 88 | LEFT OUTER JOIN LIVE.device_data_prepared devices on devices.user_id = users.user_id 89 | GROUP BY 90 | ALL; 91 | -------------------------------------------------------------------------------- /Chapter09/9.3 data-quality-and-validation.sql: -------------------------------------------------------------------------------- 1 | -- Databricks notebook source 2 | CREATE 3 | OR REFRESH LIVE TABLE customers ( 4 | CONSTRAINT valid_customer_key EXPECT (c_custkey IS NOT NULL) ON VIOLATION DROP ROW 5 | ) AS 6 | SELECT 7 | * 8 | FROM 9 | samples.tpch.customer 10 | 11 | -- COMMAND ---------- 12 | 13 | CREATE TEMPORARY LIVE TABLE duplicate_customers_test ( 14 | CONSTRAINT unique_customer_key EXPECT (cnt = 1) ON VIOLATION DROP ROW 15 | ) AS 16 | SELECT 17 | c_custkey, count(*) as cnt 18 | FROM 19 | live.customers 20 | GROUP BY ALL; 21 | 22 | -- COMMAND ---------- 23 | 24 | CREATE 25 | OR REFRESH LIVE TABLE orders ( 26 | CONSTRAINT valid_order_key EXPECT (o_orderkey IS NOT NULL) ON VIOLATION DROP ROW, 27 | CONSTRAINT valid_customer_key EXPECT (o_custkey IS NOT NULL) ON 
VIOLATION DROP ROW, 28 | CONSTRAINT valid_reference_customer EXPECT (cust.c_custkey IS NOT NULL) ON VIOLATION DROP ROW 29 | ) AS 30 | SELECT 31 | ord.*, 32 | cust.c_custkey 33 | FROM 34 | samples.tpch.orders ord 35 | LEFT OUTER JOIN live.customers cust on cust.c_custkey = ord.o_custkey 36 | 37 | -- COMMAND ---------- 38 | 39 | CREATE TEMPORARY LIVE TABLE duplicate_orders_test ( 40 | CONSTRAINT unique_order_key EXPECT (cnt = 1) ON VIOLATION DROP ROW 41 | ) AS 42 | SELECT 43 | o_orderkey, count(*) as cnt 44 | FROM 45 | live.orders 46 | GROUP BY ALL; 47 | -------------------------------------------------------------------------------- /Chapter09/9.4 quarantine-bad-data-dlt.sql: -------------------------------------------------------------------------------- 1 | -- Databricks notebook source 2 | CREATE 3 | OR REFRESH STREAMING LIVE TABLE raw_farmers_market AS 4 | SELECT 5 | * 6 | FROM 7 | cloud_files( 8 | "/databricks-datasets/data.gov/farmers_markets_geographic_data/data-001/", 9 | "csv", 10 | map( 11 | "cloudFiles.inferColumnTypes", 12 | "true" 13 | ) 14 | ) 15 | 16 | -- COMMAND ---------- 17 | 18 | CREATE 19 | OR REFRESH STREAMING LIVE TABLE farmers_market_clean ( 20 | CONSTRAINT valid_website EXPECT (Website IS NOT NULL) ON VIOLATION DROP ROW, 21 | CONSTRAINT valid_location EXPECT (Location IS NOT NULL) ON VIOLATION DROP ROW 22 | ) AS 23 | SELECT 24 | * 25 | FROM 26 | STREAM(live.raw_farmers_market) 27 | 28 | -- COMMAND ---------- 29 | 30 | CREATE 31 | OR REFRESH STREAMING LIVE TABLE farmers_market_quarantine ( 32 | CONSTRAINT valid_website EXPECT (NOT(Website IS NOT NULL)) ON VIOLATION DROP ROW, 33 | CONSTRAINT valid_location EXPECT (NOT(Location IS NOT NULL)) ON VIOLATION DROP ROW 34 | ) AS 35 | SELECT 36 | * 37 | FROM 38 | STREAM(live.raw_farmers_market) 39 | -------------------------------------------------------------------------------- /Chapter09/9.5 monitor-delta-live-table-pipelines.sql: -------------------------------------------------------------------------------- 1 | -- Databricks notebook source 2 | CREATE OR REPLACE VIEW main.pkc_farmers_market.event_log_raw AS SELECT * FROM event_log("5f5e0278-f9c8-49dc-bfd2-ade8c07b4453"); 3 | 4 | -- COMMAND ---------- 5 | 6 | -- DBTITLE 1,Query data quality from the event log 7 | SELECT 8 | update_id, 9 | row_expectations.dataset as dataset, 10 | row_expectations.name as expectation, 11 | SUM(row_expectations.passed_records) as passing_records, 12 | SUM(row_expectations.failed_records) as failing_records 13 | FROM 14 | ( 15 | SELECT 16 | origin.update_id,explode( 17 | from_json( 18 | details :flow_progress :data_quality :expectations, 19 | "array>" 20 | ) 21 | ) row_expectations 22 | FROM 23 | main.pkc_farmers_market.event_log_raw 24 | WHERE 25 | event_type = 'flow_progress' 26 | ) 27 | GROUP BY 28 | update_id, 29 | row_expectations.dataset, 30 | row_expectations.name 31 | 32 | -- COMMAND ---------- 33 | 34 | -- DBTITLE 1,Monitor compute resource utilization 35 | SELECT 36 | origin.update_id, 37 | timestamp, 38 | Double(details :cluster_resources.avg_num_queued_tasks) as queue_size, 39 | Double( 40 | details :cluster_resources.avg_task_slot_utilization 41 | ) as utilization, 42 | Double(details :cluster_resources.num_executors) as current_executors, 43 | Double( 44 | details :cluster_resources.latest_requested_num_executors 45 | ) as latest_requested_num_executors, 46 | Double(details :cluster_resources.optimal_num_executors) as optimal_num_executors, 47 | details :cluster_resources.state as autoscaling_state 48 | FROM 49 | 
main.pkc_farmers_market.event_log_raw 50 | WHERE 51 | event_type = 'cluster_resources' 52 | ORDER BY 53 | origin.update_id, 54 | timestamp 55 | 56 | -- COMMAND ---------- 57 | 58 | -- DBTITLE 1,Query user actions in the event log 59 | SELECT 60 | timestamp, 61 | details :user_action :action, 62 | details :user_action :user_name 63 | FROM 64 | main.pkc_farmers_market.event_log_raw 65 | WHERE 66 | event_type = 'user_action' 67 | -------------------------------------------------------------------------------- /Chapter09/9.6 dlt-dabs-cicd/9.6 create-medallion-arch-DLT.sql: -------------------------------------------------------------------------------- 1 | -- Databricks notebook source 2 | CREATE 3 | OR REFRESH STREAMING TABLE device_data AS 4 | SELECT 5 | * 6 | FROM 7 | cloud_files( 8 | "/databricks-datasets/iot-stream/data-device", 9 | "json", 10 | map("cloudFiles.inferColumnTypes", "true") 11 | ) 12 | 13 | -- COMMAND ---------- 14 | 15 | CREATE 16 | OR REFRESH STREAMING TABLE user_data AS 17 | SELECT 18 | * 19 | FROM 20 | cloud_files( 21 | "/databricks-datasets/iot-stream/data-user", 22 | "csv", 23 | map("cloudFiles.inferColumnTypes", "true") 24 | ) 25 | 26 | -- COMMAND ---------- 27 | 28 | CREATE 29 | OR REFRESH STREAMING TABLE user_data_prepared ( 30 | CONSTRAINT valid_user EXPECT (user_id IS NOT NULL) ON VIOLATION DROP ROW 31 | ) -- COMMENT "" 32 | AS 33 | SELECT 34 | users.userid as user_id, 35 | CASE 36 | WHEN users.gender = 'F' THEN 'Female' 37 | WHEN users.gender = 'M' THEN 'Male' 38 | END AS gender, 39 | users.age, 40 | users.height, 41 | users.weight, 42 | CAST(users.smoker as BOOLEAN) AS isSmoker, 43 | CAST(users.familyhistory as BOOLEAN) AS hasFamilyHistory, 44 | users.cholestlevs AS cholestrolLevels, 45 | users.bp AS bloodPressure, 46 | users.risk 47 | FROM 48 | STREAM(live.user_data) users; 49 | 50 | -- COMMAND ---------- 51 | 52 | CREATE 53 | OR REFRESH STREAMING TABLE device_data_prepared ( 54 | CONSTRAINT valid_timestamp EXPECT (timestamp IS NOT NULL) 55 | ) -- COMMENT "" 56 | AS 57 | SELECT 58 | device.id, 59 | device.device_id, 60 | device.user_id, 61 | device.calories_burnt, 62 | device.miles_walked, 63 | device.num_steps, 64 | CAST(device.timestamp as TIMESTAMP) AS timestamp 65 | FROM 66 | STREAM(live.device_data) device 67 | 68 | -- COMMAND ---------- 69 | 70 | CREATE 71 | OR REFRESH LIVE TABLE user_metrics AS 72 | SELECT 73 | users.user_id, 74 | users.gender, 75 | users.age, 76 | users.height, 77 | users.weight, 78 | users.isSmoker, 79 | users.hasFamilyHistory, 80 | users.cholestrolLevels, 81 | users.bloodPressure, 82 | users.risk, 83 | SUM(devices.calories_burnt) AS totalCaloriesBurnt, 84 | SUM(devices.miles_walked) AS totalMilesWalked, 85 | SUM(devices.num_steps) AS totalNumberOfSteps 86 | FROM 87 | live.user_data_prepared users 88 | LEFT OUTER JOIN LIVE.device_data_prepared devices on devices.user_id = users.user_id 89 | GROUP BY 90 | ALL; 91 | -------------------------------------------------------------------------------- /Chapter09/9.6 dlt-dabs-cicd/databricks.yml: -------------------------------------------------------------------------------- 1 | # This is a Databricks asset bundle definition for dlt_dabs_cicd. 2 | # See https://docs.databricks.com/dev-tools/bundles/index.html for documentation. 3 | bundle: 4 | name: dlt_dabs_cicd 5 | 6 | include: 7 | - dlt_dabs_cicd_pipeline.yml 8 | 9 | targets: 10 | # The 'dev' target, used for development purposes. 11 | # Whenever a developer deploys using 'dev', they get their own copy. 
12 | dev: 13 | # We use 'mode: development' to make sure everything deployed to this target gets a prefix 14 | # like '[dev my_user_name]'. Setting this mode also disables any schedules and 15 | # automatic triggers for jobs and enables the 'development' mode for Delta Live Tables pipelines. 16 | mode: development 17 | default: true 18 | workspace: 19 | host: https://adb-7934447987890817.7.azuredatabricks.net/ 20 | 21 | # The 'prod' target, used for production deployment. 22 | prod: 23 | # For production deployments, we only have a single copy, so we override the 24 | # workspace.root_path default of /Users/${workspace.current_user.userName}/.bundle/${bundle.target}/${bundle.name} 25 | # to a path that is not specific to the current user. 26 | mode: production 27 | workspace: 28 | host: https://adb-7934447987890817.7.azuredatabricks.net/ 29 | root_path: /Shared/.bundle/prod/${bundle.name} 30 | run_as: 31 | # Use service principal could be used here using service_principal_name 32 | # (see Databricks documentation). 33 | user_name: pulkit.chadha.packt@gmail.com 34 | -------------------------------------------------------------------------------- /Chapter09/9.6 dlt-dabs-cicd/dlt_dabs_cicd_pipeline.yml: -------------------------------------------------------------------------------- 1 | # The main pipeline for dlt_dabs_cicd 2 | resources: 3 | pipelines: 4 | dlt_dabs_cicd_pipeline: 5 | name: dlt_dabs_cicd_pipeline 6 | target: dlt_dabs_cicd_${bundle.environment} 7 | continuous: false 8 | channel: CURRENT 9 | photon: false 10 | libraries: 11 | - notebook: 12 | path: 9.6 create-medallion-arch-DLT.sql 13 | clusters: 14 | - label: default 15 | autoscale: 16 | min_workers: 1 17 | max_workers: 1 18 | mode: ENHANCED 19 | -------------------------------------------------------------------------------- /Chapter09/9.7 apply-changes_into-dlt.sql: -------------------------------------------------------------------------------- 1 | -- Databricks notebook source 2 | CREATE TEMPORARY STREAMING LIVE VIEW movie_and_show_titles AS 3 | SELECT 4 | *, 5 | now() as ts 6 | FROM 7 | cloud_files( 8 | "/Volumes/main/netflix/data", 9 | "csv", 10 | map("header", "true") 11 | ); 12 | 13 | -- COMMAND ---------- 14 | 15 | CREATE 16 | OR REFRESH STREAMING TABLE movie_and_show_titles_scd_1; 17 | 18 | APPLY CHANGES INTO 19 | live.movie_and_show_titles_scd_1 20 | FROM 21 | STREAM(LIVE.movie_and_show_titles) 22 | KEYS 23 | (type, title, director) 24 | SEQUENCE BY 25 | ts 26 | COLUMNS * EXCEPT (ts) 27 | STORED AS 28 | SCD TYPE 1; 29 | 30 | -- COMMAND ---------- 31 | 32 | CREATE 33 | OR REFRESH STREAMING TABLE movie_and_show_titles_scd_2; 34 | 35 | APPLY CHANGES INTO 36 | live.movie_and_show_titles_scd_2 37 | FROM 38 | STREAM(LIVE.movie_and_show_titles) 39 | KEYS 40 | (type, title, director) 41 | SEQUENCE BY 42 | ts 43 | COLUMNS * EXCEPT (ts) 44 | STORED AS 45 | SCD TYPE 2; 46 | -------------------------------------------------------------------------------- /Chapter10/10.2 uc_object_hierarchy.sql: -------------------------------------------------------------------------------- 1 | -- Databricks notebook source 2 | CREATE CATALOG de_book MANAGED LOCATION 's3://data-lake/de-book-ext-data'; 3 | 4 | -- COMMAND ---------- 5 | 6 | USE CATALOG de_book; 7 | CREATE SCHEMA credit_card; 8 | 9 | -- COMMAND ---------- 10 | 11 | USE CATALOG de_book; 12 | USE SCHEMA credit_card; 13 | CREATE TABLE IF NOT EXISTS transactions_table ( 14 | Transaction_ID STRING, 15 | Transaction_Date STRING, 16 | Credit_Card_ID STRING, 17 | Transaction_Value FLOAT, 
18 | Transaction_Segment STRING 19 | ); 20 | INSERT INTO 21 | transactions_table 22 | VALUES 23 | ( 'CTID28830551', '24-Apr-16', '1629-9566-3285-2123', 23649, 'SEG25' ), 24 | ( 'CTID45504917', '11-Feb-16', '3697-6001-4909-5350', 26726, 'SEG16' ), 25 | ( 'CTID47312290', '1-Nov-16', '5864-4475-3659-1440', 22012, 'SEG14' ), 26 | ( 'CTID25637718', '28-Jan-16', '5991-4421-8476-3804', 37637, 'SEG17' ); 27 | 28 | -- COMMAND ---------- 29 | 30 | USE CATALOG de_book; 31 | USE SCHEMA credit_card; 32 | CREATE OR REPLACE VIEW transactions_view (Credit_Card_ID, total_Transaction_Value) 33 | COMMENT 'A view that shows the total transaction value by credit card' 34 | AS SELECT Credit_Card_ID, SUM(Transaction_Value) AS total_Transaction_Value FROM de_book.credit_card.transactions_table GROUP BY Credit_Card_ID; 35 | 36 | -- COMMAND ---------- 37 | 38 | SELECT * FROM transactions_table WHERE Transaction_Value > 25000; 39 | 40 | -- COMMAND ---------- 41 | 42 | SELECT * FROM transactions_view; 43 | 44 | -- COMMAND ---------- 45 | 46 | CREATE EXTERNAL VOLUME de_book.credit_card.files 47 | LOCATION 's3://data-lake/de-book-ext-data/files'; 48 | 49 | -- COMMAND ---------- 50 | 51 | 52 | -------------------------------------------------------------------------------- /Chapter10/10.4 tags_comments_metadata.sql: -------------------------------------------------------------------------------- 1 | -- Databricks notebook source 2 | COMMENT ON TABLE de_book.credit_card.transactions_table IS 'This table contains transaction information from the credit_card database'; 3 | 4 | -- COMMAND ---------- 5 | 6 | ALTER TABLE 7 | de_book.credit_card.transactions_table 8 | SET 9 | TAGS ( 10 | 'business_unit' = 'finance', 11 | 'data_sensitivity' = 'medium', 12 | 'data_quality' = 'high' 13 | ); 14 | 15 | -- COMMAND ---------- 16 | 17 | ALTER TABLE 18 | de_book.credit_card.transactions_table 19 | ALTER COLUMN 20 | Transaction_ID COMMENT 'A unique identifier for the transaction.'; 21 | 22 | -- COMMAND ---------- 23 | 24 | ALTER TABLE 25 | de_book.credit_card.transactions_table 26 | ALTER COLUMN 27 | Transaction_ID 28 | SET 29 | TAGS ( 30 | 'data_protection' = 'non-PII', 31 | 'isIdentifier' = 'true' 32 | ); 33 | 34 | -- COMMAND ---------- 35 | 36 | DESCRIBE DETAIL de_book.credit_card.transactions_table; 37 | 38 | -- COMMAND ---------- 39 | 40 | SELECT 41 | catalog_name, 42 | schema_name, 43 | table_name, 44 | tag_name, 45 | tag_value 46 | FROM 47 | de_book.information_schema.table_tags 48 | WHERE 49 | catalog_name = 'de_book' 50 | and schema_name = 'credit_card' 51 | and table_name = 'transactions_table'; 52 | 53 | -- COMMAND ---------- 54 | 55 | ALTER TABLE 56 | de_book.credit_card.transactions_table UNSET TAGS ('business_unit', 'data_sensitivity'); 57 | 58 | -- COMMAND ---------- 59 | 60 | ALTER TABLE 61 | de_book.credit_card.transactions_table 62 | ALTER COLUMN 63 | Transaction_ID UNSET TAGS ('data_type'); 64 | 65 | -- COMMAND ---------- 66 | 67 | 68 | -------------------------------------------------------------------------------- /Chapter10/10.5 filter_sensitive_data.sql: -------------------------------------------------------------------------------- 1 | -- Databricks notebook source 2 | -- Create a sample table with customer information 3 | USE CATALOG de_book; 4 | USE SCHEMA credit_card; 5 | 6 | CREATE TABLE customer ( 7 | id INT, 8 | name STRING, 9 | email STRING, 10 | phone STRING, 11 | ssn STRING, 12 | country STRING 13 | ); 14 | 15 | -- Insert some sample data into the table 16 | INSERT INTO customer VALUES 17 | (1, 'Alice', 
'alice@example.com', '+1-111-1111', '111--111-1111','USA'), 18 | (2, 'Bob', 'bob@example.com', '+1-222-2222', '222-222-2222','USA'), 19 | (3, 'Charlie', 'charlie@example.com', '+1-333-3333', '333-333-3333','USA'), 20 | (4, 'David', 'david@example.com', '+44-444-4444','444-444-4444', 'UK'), 21 | (5, 'Eve', 'eve@example.com', '+44-555-5555', '+555-555-5555','UK'); 22 | 23 | 24 | -- COMMAND ---------- 25 | 26 | CREATE FUNCTION country_filter(country STRING) 27 | RETURN IF(IS_ACCOUNT_GROUP_MEMBER('admin'), true, country='USA'); 28 | 29 | -- COMMAND ---------- 30 | 31 | ALTER TABLE customer SET ROW FILTER country_filter ON (country); 32 | 33 | -- COMMAND ---------- 34 | 35 | CREATE 36 | OR REPLACE FUNCTION country_filter(country STRING) RETURN IF( 37 | IS_ACCOUNT_GROUP_MEMBER('admin'), 38 | true, 39 | IF( 40 | IS_ACCOUNT_GROUP_MEMBER('usteam') 41 | AND country = 'USA', 42 | true, 43 | IF( 44 | IS_ACCOUNT_GROUP_MEMBER('ukteam') 45 | AND country = 'UK', 46 | true, 47 | false 48 | ) 49 | ) 50 | ); 51 | 52 | -- COMMAND ---------- 53 | 54 | ALTER TABLE customer DROP ROW FILTER; 55 | 56 | -- COMMAND ---------- 57 | 58 | DROP FUNCTION country_filter; 59 | 60 | -- COMMAND ---------- 61 | 62 | -- Create a UDF that masks the email column by replacing the domain part with '***' 63 | CREATE FUNCTION mask_email (email STRING) RETURN CASE 64 | WHEN is_account_group_member('hr_dept') THEN email 65 | ELSE CONCAT(SPLIT(email, '@')[0], '@***') 66 | END; 67 | 68 | -- COMMAND ---------- 69 | 70 | ALTER TABLE customer ALTER COLUMN email SET MASK mask_email; 71 | 72 | -- COMMAND ---------- 73 | 74 | ALTER TABLE customer ALTER COLUMN email DROP MASK; 75 | 76 | -- COMMAND ---------- 77 | 78 | CREATE 79 | OR REPLACE FUNCTION mask_email (email STRING) RETURN CASE 80 | WHEN is_account_group_member('hr_dept') 81 | OR is_account_group_member('finance_dept') THEN email 82 | ELSE CONCAT(SPLIT(email, '@') [0], '@***') 83 | END; 84 | 85 | -- COMMAND ---------- 86 | 87 | ALTER TABLE customer ALTER COLUMN email DROP MASK; 88 | DROP FUNCTION mask_email; 89 | 90 | -- COMMAND ---------- 91 | 92 | 93 | -------------------------------------------------------------------------------- /Chapter10/10.6 lineage_view.sql: -------------------------------------------------------------------------------- 1 | -- Databricks notebook source 2 | USE CATALOG de_book; 3 | USE SCHEMA credit_card; 4 | 5 | -- COMMAND ---------- 6 | 7 | CREATE 8 | OR REPLACE TABLE usa_customers AS 9 | SELECT 10 | * 11 | FROM 12 | customer 13 | WHERE 14 | country = 'USA'; 15 | 16 | -- COMMAND ---------- 17 | 18 | CREATE 19 | OR REPLACE TABLE uk_customers AS 20 | SELECT 21 | * 22 | FROM 23 | customer 24 | WHERE 25 | country = 'UK'; 26 | 27 | -- COMMAND ---------- 28 | 29 | 30 | -------------------------------------------------------------------------------- /Chapter10/10.7 system_tables.sql: -------------------------------------------------------------------------------- 1 | -- Databricks notebook source 2 | -- MAGIC %sh 3 | -- MAGIC curl -v -X GET -H "Authorization: Bearer " "https://.cloud.databricks.com/api/2.0/unity-catalog/metastores//systemschemas" 4 | 5 | -- COMMAND ---------- 6 | 7 | -- MAGIC %sh 8 | -- MAGIC curl -v -X POST -H "Authorization: Bearer " "https://.cloud.databricks.com/api/2.0/unity-catalog/metastores//systemschemas//enable" 9 | 10 | -- COMMAND ---------- 11 | 12 | -- MAGIC %sh 13 | -- MAGIC curl -v -X POST -H "Authorization: Bearer " "https://.cloud.databricks.com/api/2.0/unity-catalog/metastores//systemschemas/system.access/enable" 14 | 15 | -- 
COMMAND ---------- 16 | 17 | GRANT SELECT ON TABLE system.access.audit TO analysts 18 | 19 | -- COMMAND ---------- 20 | 21 | REVOKE SELECT ON TABLE system.billing.usage FROM developers 22 | 23 | -- COMMAND ---------- 24 | 25 | SELECT 26 | user_identity.email as user_id, 27 | COUNT(*) AS event_count 28 | FROM 29 | system.access.audit 30 | WHERE 31 | event_time >= current_date - interval 30 days 32 | GROUP BY 33 | user_id 34 | ORDER BY 35 | event_count DESC 36 | LIMIT 37 | 10 38 | 39 | -- COMMAND ---------- 40 | 41 | 42 | 43 | -- COMMAND ---------- 44 | 45 | SELECT 46 | b.sku_name, 47 | SUM(b.usage_quantity) AS usage_hours, 48 | SUM(b.usage_quantity * p.pricing.default) AS cost 49 | FROM 50 | system.billing.usage AS b 51 | JOIN system.billing.list_prices AS p ON b.sku_name = p.sku_name 52 | AND b.usage_date BETWEEN p.price_start_time 53 | AND coalesce(p.price_end_time, current_timestamp()) 54 | WHERE 55 | b.usage_start_time >= date_trunc('month', current_date) - interval 1 month 56 | AND b.usage_start_time < date_trunc('month', current_date) 57 | GROUP BY 58 | b.sku_name 59 | ORDER BY 60 | cost DESC 61 | 62 | -- COMMAND ---------- 63 | 64 | SELECT 65 | source_table_full_name, 66 | target_table_full_name, 67 | event_time, 68 | created_by, 69 | entity_type, 70 | entity_id, 71 | entity_run_id 72 | FROM 73 | system.access.table_lineage 74 | WHERE 75 | target_table_full_name = 'de_book.credit_card.usa_customers' 76 | ORDER BY 77 | event_time DESC 78 | -------------------------------------------------------------------------------- /Chapter11/11.1 connect_to_git_repo.py: -------------------------------------------------------------------------------- 1 | # Databricks notebook source 2 | 3 | -------------------------------------------------------------------------------- /Chapter11/11.3 using_databricks_sdk.py: -------------------------------------------------------------------------------- 1 | from databricks.sdk import WorkspaceClient 2 | 3 | w = WorkspaceClient() 4 | 5 | # List the existing clusters 6 | clusters = w.clusters.list() 7 | 8 | # Loop through the clusters and print their names and states 9 | for cluster in clusters: 10 | print(f"Cluster name: {cluster.cluster_name} - {cluster.state}") -------------------------------------------------------------------------------- /Chapter11/11.4 databricks_vscode_extension.py: -------------------------------------------------------------------------------- 1 | from pyspark.sql import SparkSession 2 | from pyspark.sql.functions import * 3 | from pyspark.sql.types import * 4 | # Create a new SparkSession 5 | spark = (SparkSession 6 | .builder 7 | .appName("optimize-data-shuffles") 8 | .master("spark://spark-master:7077") 9 | .config("spark.executor.memory", "512m") 10 | .getOrCreate()) 11 | 12 | # Create some sample data frames 13 | # A large data frame with 1 million rows and two columns: id and value 14 | large_df = (spark.range(0, 1000000) 15 | .withColumn("date", date_sub(current_date(), (rand() * 365).cast("int"))) 16 | .withColumn("age", (rand() * 100).cast("int")) 17 | .withColumn("salary", 100*(rand() * 100).cast("int")) 18 | .withColumn("gender", when((rand() * 2).cast("int") == 0, "M").otherwise("F")) 19 | .withColumn("grade", 20 | when((rand() * 5).cast("int") == 0, "IC") 21 | .when((rand() * 5).cast("int") == 1, "IC-2") 22 | .when((rand() * 5).cast("int") == 2, "M1") 23 | .when((rand() * 5).cast("int") == 3, "M2") 24 | .when((rand() * 5).cast("int") == 4, "IC-3") 25 | .otherwise("M3"))) 26 | large_df.show(5) 
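# A minimal follow-up sketch: a small aggregation (which triggers a shuffle) on the
# sample DataFrame built above, useful for confirming end to end that the SparkSession
# created above works. It assumes only the columns defined above (grade, salary) and
# standard PySpark functions; it is not part of the original recipe code.
from pyspark.sql.functions import avg, count

summary_df = (large_df
              .groupBy("grade")
              .agg(count("*").alias("rows"),
                   avg("salary").alias("avg_salary"))
              .orderBy("grade"))
summary_df.show()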
-------------------------------------------------------------------------------- /Chapter11/11.5_databricks_asset_bundles/dabs_cicd_example/.github/workflows/deploy_to_prod_CD.yml: -------------------------------------------------------------------------------- 1 | # This workflow validates, deploys, and runs the specified bundle 2 | # within a production target named "prod". 3 | name: "Production deployment" 4 | 5 | # Ensure that only a single job or workflow using the same concurrency group 6 | # runs at a time. 7 | concurrency: 1 8 | 9 | # Trigger this workflow whenever a commit is pushed to the repo's 10 | # main branch (for example, when a pull request is merged). 11 | on: 12 | push: 13 | branches: 14 | - main 15 | 16 | jobs: 17 | deploy: 18 | name: "Deploy bundle" 19 | runs-on: ubuntu-latest 20 | 21 | steps: 22 | # Check out this repo, so that this workflow can access it. 23 | - uses: actions/checkout@v3 24 | 25 | # Download the Databricks CLI. 26 | # See https://github.com/databricks/setup-cli 27 | - uses: databricks/setup-cli@main 28 | 29 | # Deploy the bundle to the "prod" target as defined 30 | # in the bundle's settings file. 31 | - run: databricks bundle deploy 32 | working-directory: . 33 | env: 34 | DATABRICKS_TOKEN: ${{secrets.DATABRICKS_TOKEN}} 35 | DATABRICKS_BUNDLE_ENV: prod 36 | 37 | # Validate, deploy, and then run the bundle. 38 | pipeline_update: 39 | name: "Run Job" 40 | runs-on: ubuntu-latest 41 | 42 | # Run the "deploy" job first. 43 | needs: 44 | - deploy 45 | 46 | steps: 47 | # Check out this repo, so that this workflow can access it. 48 | - uses: actions/checkout@v3 49 | 50 | # Use the downloaded Databricks CLI. 51 | - uses: databricks/setup-cli@main 52 | 53 | # Run the Databricks workflow named "dabs_cicd_example_job" as defined in the 54 | # bundle that was just deployed. 55 | - run: databricks bundle run dabs_cicd_example_job --refresh-all 56 | working-directory: . 57 | env: 58 | DATABRICKS_TOKEN: ${{secrets.DATABRICKS_TOKEN}} 59 | DATABRICKS_BUNDLE_ENV: prod 60 | -------------------------------------------------------------------------------- /Chapter11/11.5_databricks_asset_bundles/dabs_cicd_example/.github/workflows/deploy_to_qa_CI.yml: -------------------------------------------------------------------------------- 1 | # This workflow validates, deploys, and runs the specified bundle 2 | # within a pre-production target named "qa". 3 | name: "QA deployment" 4 | 5 | # Ensure that only a single job or workflow using the same concurrency group 6 | # runs at a time. 7 | concurrency: 1 8 | 9 | # Trigger this workflow whenever a pull request is opened against the repo's 10 | # main branch or an existing pull request's head branch is updated. 11 | on: 12 | pull_request: 13 | types: 14 | - opened 15 | - synchronize 16 | 17 | jobs: 18 | # Used by the "pipeline_update" job to deploy the bundle. 19 | # Bundle validation is automatically performed as part of this deployment. 20 | # If validation fails, this workflow fails. 21 | deploy: 22 | name: "Deploy bundle" 23 | runs-on: ubuntu-latest 24 | 25 | steps: 26 | # Check out this repo, so that this workflow can access it. 27 | - uses: actions/checkout@v3 28 | 29 | # Download the Databricks CLI. 30 | # See https://github.com/databricks/setup-cli 31 | - uses: databricks/setup-cli@main 32 | 33 | # Deploy the bundle to the "qa" target as defined 34 | # in the bundle's settings file. 35 | - run: databricks bundle deploy 36 | working-directory: . 
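        # NOTE: DATABRICKS_TOKEN is expected to be provided as a GitHub Actions repository
        # secret (for example, a workspace personal access token or a service principal
        # token); without it, the Databricks CLI cannot authenticate in this step.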
37 | env: 38 | DATABRICKS_TOKEN: ${{secrets.DATABRICKS_TOKEN}} 39 | DATABRICKS_BUNDLE_ENV: dev 40 | 41 | # Validate, deploy, and then run the bundle. 42 | pipeline_update: 43 | name: "Run Job" 44 | runs-on: ubuntu-latest 45 | 46 | # Run the "deploy" job first. 47 | needs: 48 | - deploy 49 | 50 | steps: 51 | # Check out this repo, so that this workflow can access it. 52 | - uses: actions/checkout@v3 53 | 54 | # Use the downloaded Databricks CLI. 55 | - uses: databricks/setup-cli@main 56 | 57 | # Run the Databricks workflow named "dabs_cicd_example_job" as defined in the 58 | # bundle that was just deployed. 59 | - run: databricks bundle run dabs_cicd_example_job --refresh-all 60 | working-directory: . 61 | env: 62 | DATABRICKS_TOKEN: ${{secrets.DATABRICKS_TOKEN}} 63 | DATABRICKS_BUNDLE_ENV: dev 64 | -------------------------------------------------------------------------------- /Chapter11/11.5_databricks_asset_bundles/dabs_cicd_example/README.md: -------------------------------------------------------------------------------- 1 | # dabs_cicd_example 2 | 3 | The 'dabs_cicd_example' project was generated by using the default-python template. 4 | 5 | ## Getting started 6 | 7 | 1. Install the Databricks CLI from https://docs.databricks.com/dev-tools/cli/databricks-cli.html 8 | 9 | 2. Authenticate to your Databricks workspace: 10 | ``` 11 | $ databricks configure 12 | ``` 13 | 14 | 3. To deploy a development copy of this project, type: 15 | ``` 16 | $ databricks bundle deploy --target dev 17 | ``` 18 | (Note that "dev" is the default target, so the `--target` parameter 19 | is optional here.) 20 | 21 | This deploys everything that's defined for this project. 22 | For example, the default template would deploy a job called 23 | `[dev yourname] dabs_cicd_example_job` to your workspace. 24 | You can find that job by opening your workspace and clicking on **Workflows**. 25 | 26 | 4. Similarly, to deploy a production copy, type: 27 | ``` 28 | $ databricks bundle deploy --target prod 29 | ``` 30 | 31 | 5. To run a job or pipeline, use the "run" command: 32 | ``` 33 | $ databricks bundle run 34 | ``` 35 | 36 | 6. Optionally, install developer tools such as the Databricks extension for Visual Studio Code from 37 | https://docs.databricks.com/dev-tools/vscode-ext.html. 38 | 39 | 7. For documentation on the Databricks asset bundles format used 40 | for this project, and for CI/CD configuration, see 41 | https://docs.databricks.com/dev-tools/bundles/index.html. 42 | -------------------------------------------------------------------------------- /Chapter11/11.5_databricks_asset_bundles/dabs_cicd_example/databricks.yml: -------------------------------------------------------------------------------- 1 | # This is a Databricks asset bundle definition for dabs_cicd_example. 2 | # See https://docs.databricks.com/dev-tools/bundles/index.html for documentation. 3 | bundle: 4 | name: dabs_cicd_example 5 | 6 | include: 7 | - resources/*.yml 8 | 9 | targets: 10 | # The 'dev' target, used for development purposes. 11 | # Whenever a developer deploys using 'dev', they get their own copy. 12 | dev: 13 | # We use 'mode: development' to make sure everything deployed to this target gets a prefix 14 | # like '[dev my_user_name]'. Setting this mode also disables any schedules and 15 | # automatic triggers for jobs and enables the 'development' mode for Delta Live Tables pipelines. 
16 | mode: development 17 | default: true 18 | workspace: 19 | host: https://adb-7934447987890817.7.azuredatabricks.net/ 20 | 21 | # Optionally, there could be a 'staging' target here. 22 | # (See Databricks docs on CI/CD at https://docs.databricks.com/dev-tools/bundles/index.html.) 23 | # 24 | # staging: 25 | # workspace: 26 | # host: https://adb-7934447987890817.7.azuredatabricks.net/ 27 | 28 | # The 'prod' target, used for production deployment. 29 | prod: 30 | # For production deployments, we only have a single copy, so we override the 31 | # workspace.root_path default of 32 | # /Users/${workspace.current_user.userName}/.bundle/${bundle.target}/${bundle.name} 33 | # to a path that is not specific to the current user. 34 | mode: production 35 | workspace: 36 | host: https://adb-7934447987890817.7.azuredatabricks.net/ 37 | root_path: /Shared/.bundle/prod/${bundle.name} 38 | run_as: 39 | # This runs as pulkit.chadha.packt@gmail.com in production. Alternatively, 40 | # a service principal could be used here using service_principal_name 41 | # (see Databricks documentation). 42 | user_name: pulkit.chadha.packt@gmail.com 43 | -------------------------------------------------------------------------------- /Chapter11/11.5_databricks_asset_bundles/dabs_cicd_example/fixtures/.gitkeep: -------------------------------------------------------------------------------- 1 | # Fixtures 2 | 3 | This folder is reserved for fixtures, such as CSV files. 4 | 5 | Below is an example of how to load fixtures as a data frame: 6 | 7 | ``` 8 | import pandas as pd 9 | import os 10 | 11 | def get_absolute_path(*relative_parts): 12 | if 'dbutils' in globals(): 13 | base_dir = os.path.dirname(dbutils.notebook.entry_point.getDbutils().notebook().getContext().notebookPath().get()) # type: ignore 14 | path = os.path.normpath(os.path.join(base_dir, *relative_parts)) 15 | return path if path.startswith("/Workspace") else "/Workspace" + path 16 | else: 17 | return os.path.join(*relative_parts) 18 | 19 | csv_file = get_absolute_path("..", "fixtures", "mycsv.csv") 20 | df = pd.read_csv(csv_file) 21 | display(df) 22 | ``` 23 | -------------------------------------------------------------------------------- /Chapter11/11.5_databricks_asset_bundles/dabs_cicd_example/requirements-dev.txt: -------------------------------------------------------------------------------- 1 | ## requirements-dev.txt: dependencies for local development. 2 | ## 3 | ## For defining dependencies used by jobs in Databricks Workflows, see 4 | ## https://docs.databricks.com/dev-tools/bundles/library-dependencies.html 5 | 6 | ## pytest is the default package used for testing 7 | pytest 8 | 9 | ## databricks-connect can be used to run parts of this project locally. 10 | ## See https://docs.databricks.com/dev-tools/databricks-connect.html. 11 | ## 12 | ## databricks-connect is automatically installed if you're using Databricks 13 | ## extension for Visual Studio Code 14 | ## (https://docs.databricks.com/dev-tools/vscode-ext/dev-tasks/databricks-connect.html). 15 | ## 16 | ## To manually install databricks-connect, either follow the instructions 17 | ## at https://docs.databricks.com/dev-tools/databricks-connect.html 18 | ## to install the package system-wide. Or uncomment the line below to install a 19 | ## version of db-connect that corresponds to the Databricks Runtime version used 20 | ## for this project. 
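## A minimal usage sketch (assuming databricks-connect 13.x or later is installed and a
## Databricks authentication profile is configured); from a local script or test, a remote
## Spark session can be created roughly like this:
##
##   from databricks.connect import DatabricksSession
##   spark = DatabricksSession.builder.getOrCreate()
##   spark.range(5).show()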
21 | # 22 | # databricks-connect>=13.3,<13.4 23 | -------------------------------------------------------------------------------- /Chapter11/11.5_databricks_asset_bundles/dabs_cicd_example/resources/dabs_cicd_example_job.yml: -------------------------------------------------------------------------------- 1 | # The main job for dabs_cicd_example 2 | resources: 3 | jobs: 4 | dabs_cicd_example_job: 5 | name: dabs_cicd_example_job 6 | 7 | schedule: 8 | quartz_cron_expression: '44 37 8 * * ?' 9 | timezone_id: Europe/Amsterdam 10 | 11 | email_notifications: 12 | on_failure: 13 | - pulkit.chadha.packt@gmail.com 14 | 15 | tasks: 16 | - task_key: notebook_task 17 | job_cluster_key: job_cluster 18 | notebook_task: 19 | notebook_path: ../src/notebook.ipynb 20 | 21 | - task_key: refresh_pipeline 22 | depends_on: 23 | - task_key: notebook_task 24 | pipeline_task: 25 | pipeline_id: ${resources.pipelines.dabs_cicd_example_pipeline.id} 26 | 27 | job_clusters: 28 | - job_cluster_key: job_cluster 29 | new_cluster: 30 | spark_version: 13.3.x-scala2.12 31 | node_type_id: i3.xlarge 32 | autoscale: 33 | min_workers: 1 34 | max_workers: 4 35 | -------------------------------------------------------------------------------- /Chapter11/11.5_databricks_asset_bundles/dabs_cicd_example/resources/dabs_cicd_example_pipeline.yml: -------------------------------------------------------------------------------- 1 | # The main pipeline for dabs_cicd_example 2 | resources: 3 | pipelines: 4 | dabs_cicd_example_pipeline: 5 | name: dabs_cicd_example_pipeline 6 | target: dabs_cicd_example_${bundle.environment} 7 | libraries: 8 | - notebook: 9 | path: ../src/dlt_pipeline.ipynb 10 | 11 | configuration: 12 | bundle.sourcePath: /Workspace/${workspace.file_path}/src 13 | -------------------------------------------------------------------------------- /Chapter11/11.5_databricks_asset_bundles/dabs_cicd_example/scratch/README.md: -------------------------------------------------------------------------------- 1 | # scratch 2 | 3 | This folder is reserved for personal, exploratory notebooks. 4 | By default these are not committed to Git, as 'scratch' is listed in .gitignore. 
5 | -------------------------------------------------------------------------------- /Chapter11/11.5_databricks_asset_bundles/dabs_cicd_example/scratch/exploration.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "metadata": { 7 | "application/vnd.databricks.v1+cell": { 8 | "cellMetadata": { 9 | "byteLimit": 2048000, 10 | "rowLimit": 10000 11 | }, 12 | "inputWidgets": {}, 13 | "nuid": "6bca260b-13d1-448f-8082-30b60a85c9ae", 14 | "showTitle": false, 15 | "title": "" 16 | } 17 | }, 18 | "outputs": [], 19 | "source": [ 20 | "spark.range(10)" 21 | ] 22 | } 23 | ], 24 | "metadata": { 25 | "application/vnd.databricks.v1+notebook": { 26 | "dashboards": [], 27 | "language": "python", 28 | "notebookMetadata": { 29 | "pythonIndentUnit": 2 30 | }, 31 | "notebookName": "ipynb-notebook", 32 | "widgets": {} 33 | }, 34 | "kernelspec": { 35 | "display_name": "Python 3", 36 | "language": "python", 37 | "name": "python3" 38 | }, 39 | "language_info": { 40 | "name": "python", 41 | "version": "3.11.4" 42 | } 43 | }, 44 | "nbformat": 4, 45 | "nbformat_minor": 0 46 | } 47 | -------------------------------------------------------------------------------- /Chapter11/11.5_databricks_asset_bundles/dabs_cicd_example/src/dlt_pipeline.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": { 6 | "application/vnd.databricks.v1+cell": { 7 | "cellMetadata": {}, 8 | "inputWidgets": {}, 9 | "nuid": "9a626959-61c8-4bba-84d2-2a4ecab1f7ec", 10 | "showTitle": false, 11 | "title": "" 12 | } 13 | }, 14 | "source": [ 15 | "# DLT pipeline\n", 16 | "\n", 17 | "This Delta Live Tables (DLT) definition is executed using a pipeline defined in resources/dabs_cicd_example_pipeline.yml." 
18 | ] 19 | }, 20 | { 21 | "cell_type": "code", 22 | "execution_count": 0, 23 | "metadata": { 24 | "application/vnd.databricks.v1+cell": { 25 | "cellMetadata": {}, 26 | "inputWidgets": {}, 27 | "nuid": "9198e987-5606-403d-9f6d-8f14e6a4017f", 28 | "showTitle": false, 29 | "title": "" 30 | } 31 | }, 32 | "outputs": [], 33 | "source": [ 34 | "import dlt\n", 35 | "from pyspark.sql.functions import expr\n", 36 | "from pyspark.sql import SparkSession\n", 37 | "spark = SparkSession.builder.getOrCreate()" 38 | ] 39 | }, 40 | { 41 | "cell_type": "code", 42 | "execution_count": 0, 43 | "metadata": { 44 | "application/vnd.databricks.v1+cell": { 45 | "cellMetadata": {}, 46 | "inputWidgets": {}, 47 | "nuid": "3fc19dba-61fd-4a89-8f8c-24fee63bfb14", 48 | "showTitle": false, 49 | "title": "" 50 | } 51 | }, 52 | "outputs": [], 53 | "source": [ 54 | "\n", 55 | "@dlt.view\n", 56 | "def taxi_raw():\n", 57 | " return spark.read.format(\"json\").load(\"/databricks-datasets/nyctaxi/sample/json/\")\n", 58 | "\n", 59 | "@dlt.table\n", 60 | "def filtered_taxis():\n", 61 | " return dlt.read(\"taxi_raw\").filter(expr(\"fare_amount < 30\"))" 62 | ] 63 | } 64 | ], 65 | "metadata": { 66 | "application/vnd.databricks.v1+notebook": { 67 | "dashboards": [], 68 | "language": "python", 69 | "notebookMetadata": { 70 | "pythonIndentUnit": 2 71 | }, 72 | "notebookName": "dlt_pipeline", 73 | "widgets": {} 74 | }, 75 | "kernelspec": { 76 | "display_name": "Python 3", 77 | "language": "python", 78 | "name": "python3" 79 | }, 80 | "language_info": { 81 | "name": "python", 82 | "version": "3.11.4" 83 | } 84 | }, 85 | "nbformat": 4, 86 | "nbformat_minor": 0 87 | } 88 | -------------------------------------------------------------------------------- /Chapter11/11.5_databricks_asset_bundles/dabs_cicd_example/src/notebook.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": { 6 | "application/vnd.databricks.v1+cell": { 7 | "cellMetadata": {}, 8 | "inputWidgets": {}, 9 | "nuid": "ee353e42-ff58-4955-9608-12865bd0950e", 10 | "showTitle": false, 11 | "title": "" 12 | } 13 | }, 14 | "source": [ 15 | "# Default notebook\n", 16 | "\n", 17 | "This default notebook is executed using Databricks Workflows as defined in resources/dabs_cicd_example_job.yml." 
18 | ] 19 | }, 20 | { 21 | "cell_type": "code", 22 | "execution_count": 0, 23 | "metadata": { 24 | "application/vnd.databricks.v1+cell": { 25 | "cellMetadata": { 26 | "byteLimit": 2048000, 27 | "rowLimit": 10000 28 | }, 29 | "inputWidgets": {}, 30 | "nuid": "6bca260b-13d1-448f-8082-30b60a85c9ae", 31 | "showTitle": false, 32 | "title": "" 33 | } 34 | }, 35 | "outputs": [], 36 | "source": [ 37 | "spark.range(10)" 38 | ] 39 | } 40 | ], 41 | "metadata": { 42 | "application/vnd.databricks.v1+notebook": { 43 | "dashboards": [], 44 | "language": "python", 45 | "notebookMetadata": { 46 | "pythonIndentUnit": 2 47 | }, 48 | "notebookName": "notebook", 49 | "widgets": {} 50 | }, 51 | "kernelspec": { 52 | "display_name": "Python 3", 53 | "language": "python", 54 | "name": "python3" 55 | }, 56 | "language_info": { 57 | "name": "python", 58 | "version": "3.11.4" 59 | } 60 | }, 61 | "nbformat": 4, 62 | "nbformat_minor": 0 63 | } 64 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2023 Packt 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Data Engineering with Databricks Cookbook 2 | 3 | no-image 4 | 5 | This is the code repository for [Data Engineering with Databricks Cookbook](https://www.packtpub.com/product/data-engineering-with-databricks-cookbook/9781837633357), published by Packt. 6 | 7 | **Build effective data and AI solutions using Apache Spark, Databricks, and Delta Lake** 8 | 9 | ## What is this book about? 10 | This book shows you how to use Apache Spark, Delta Lake, and Databricks to build data pipelines, manage and transform data, optimize performance, and more. Additionally, you’ll implement DataOps and DevOps practices, and orchestrate data workflows. 
11 | 12 | This book covers the following exciting features: 13 | * Perform data loading, ingestion, and processing with Apache Spark 14 | * Discover data transformation techniques and custom user-defined functions (UDFs) in Apache Spark 15 | * Manage and optimize Delta tables with Apache Spark and Delta Lake APIs 16 | * Use Spark Structured Streaming for real-time data processing 17 | * Optimize Apache Spark application and Delta table query performance 18 | * Implement DataOps and DevOps practices on Databricks 19 | * Orchestrate data pipelines with Delta Live Tables and Databricks Workflows 20 | * Implement data governance policies with Unity Catalog 21 | 22 | If you feel this book is for you, get your [copy](https://www.amazon.com/Engineering-Apache-Spark-Delta-Cookbook/dp/1837633355) today! 23 | 24 | https://www.packtpub.com/ 26 | 27 | ## Instructions and Navigations 28 | All of the code is organized into folders. For example, Chapter01. 29 | 30 | The code will look like the following: 31 | ``` 32 | from pyspark.sql import SparkSession 33 | 34 | spark = (SparkSession.builder 35 | .appName("read-csv-data") 36 | .master("spark://spark-master:7077") 37 | .config("spark.executor.memory", "512m") 38 | .getOrCreate()) 39 | 40 | spark.sparkContext.setLogLevel("ERROR") 41 | ``` 42 | 43 | **Following is what you need for this book:** 44 | This book is for data engineers, data scientists, and data practitioners who want to learn how to build efficient and scalable data pipelines using Apache Spark, Delta Lake, and Databricks. To get the most out of this book, you should have basic knowledge of data architecture, SQL, and Python programming. 45 | 46 | With the following software and hardware list you can run all code files present in the book (Chapter 1-11). 47 | ### Software and Hardware List 48 | | Chapter | Software required | OS required | 49 | | -------- | ------------------------------------ | ----------------------------------- | 50 | | 1-11 | Docker Engine version 18.02.0+ | Windows, Mac OS X, and Linux (any) | 51 | | 1-11 | Docker Compose version 1.25.5+ | Windows, Mac OS X, and Linux (any) | 52 | | 1-11 | Docker Desktop | Windows, Mac OS X, and Linux (any) | 53 | | 1-11 | Git | Windows, Mac OS X, and Linux (any) | 54 | 55 | ### Related products 56 | * Business Intelligence with Databricks SQL [[Packt]](https://www.packtpub.com/product/business-intelligence-with-databricks-sql/9781803235332) [[Amazon]](https://www.amazon.com/Business-Intelligence-Databricks-SQL-intelligence/dp/1803235330/ref=sr_1_1?crid=1QYCAOZP9E3NH&dib=eyJ2IjoiMSJ9.nKZ7dRFPdDZyRvWwKM_NiTSZyweCLZ8g9JdktemcYzaWNiGWg9PuoxY2yb2jogGyK8hgRliKebDQfdHu2rRnTZTWZbsWOJAN33k65RFkAgdFX-csS8HgTFfjZj-SFKLpp4FC6LHwQvWr9Nq6f5x6eg.jh99qre-Hl4OHA9rypXLmSGsQp4exBvaZ2xUOPDQ0mM&dib_tag=se&keywords=Business+Intelligence+with+Databricks+SQL&qid=1718173191&s=books&sprefix=business+intelligence+with+databricks+sql%2Cstripbooks-intl-ship%2C553&sr=1-1) 57 | 58 | * Optimizing Databricks Workloads [[Packt]](https://www.packtpub.com/product/optimizing-databricks-workloads/9781801819077) [[Amazon]](https://www.amazon.com/Optimizing-Databricks-Workloads-performance-workloads/dp/1801819076/ref=tmm_pap_swatch_0?_encoding=UTF8&dib_tag=se&dib=eyJ2IjoiMSJ9.cskfrEglx5gEbJF-FnhxlA.rCtKm1bO6Fi1mXUpq1Oai0kjAhGseGT2cCZ2Ccgxaak&qid=1718173341&sr=1-1) 59 | 60 | ## Get to Know the Author 61 | **Pulkit Chadha** 62 | is a seasoned technologist with over 15 years of experience in data engineering. 
His proficiency in crafting and refining data pipelines has been instrumental in driving success across diverse sectors such as healthcare, media and entertainment, hi-tech, and manufacturing. Pulkit’s tailored data engineering solutions are designed to address the unique challenges and aspirations of each enterprise he collaborates with. 63 | 64 | -------------------------------------------------------------------------------- /build.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # 3 | # -- Build Apache Spark Standalone Cluster Docker Images 4 | 5 | # ---------------------------------------------------------------------------------------------------------------------- 6 | # -- Variables --------------------------------------------------------------------------------------------------------- 7 | # ---------------------------------------------------------------------------------------------------------------------- 8 | 9 | BUILD_DATE="$(date -u +'%Y-%m-%d')" 10 | SPARK_VERSION="3.4.1" 11 | HADOOP_VERSION="3" 12 | DELTA_SPARK_VERSION="2.4.0" 13 | DELTALAKE_VERSION="0.10.0" 14 | JUPYTERLAB_VERSION="4.0.2" 15 | PANDAS_VERSION="2.0.1" 16 | DELTA_PACKAGE_VERSION="delta-core_2.12:2.4.0" 17 | SPARK_VERSION_MAJOR=${SPARK_VERSION:0:1} 18 | SPARK_XML_PACKAGE_VERSION="spark-xml_2.12:0.16.0" 19 | SPARKSQL_MAGIC_VERSION="0.0.3" 20 | KAFKA_PYTHON_VERSION="2.0.2" 21 | 22 | # ---------------------------------------------------------------------------------------------------------------------- 23 | # -- Functions---------------------------------------------------------------------------------------------------------- 24 | # ---------------------------------------------------------------------------------------------------------------------- 25 | 26 | function cleanContainers() { 27 | 28 | container="$(docker ps -a | grep 'jupyterlab' | awk '{print $1}')" 29 | docker stop "${container}" 30 | docker rm "${container}" 31 | 32 | container="$(docker ps -a | grep 'spark-worker' -m 1 | awk '{print $1}')" 33 | while [ -n "${container}" ]; 34 | do 35 | docker stop "${container}" 36 | docker rm "${container}" 37 | container="$(docker ps -a | grep 'spark-worker' -m 1 | awk '{print $1}')" 38 | done 39 | 40 | container="$(docker ps -a | grep 'spark-master' | awk '{print $1}')" 41 | docker stop "${container}" 42 | docker rm "${container}" 43 | 44 | container="$(docker ps -a | grep 'spark-base' | awk '{print $1}')" 45 | docker stop "${container}" 46 | docker rm "${container}" 47 | 48 | container="$(docker ps -a | grep 'base' | awk '{print $1}')" 49 | docker stop "${container}" 50 | docker rm "${container}" 51 | 52 | } 53 | 54 | function cleanImages() { 55 | 56 | docker rmi -f "$(docker images | grep -m 1 'jupyterlab' | awk '{print $3}')" 57 | 58 | docker rmi -f "$(docker images | grep -m 1 'spark-worker' | awk '{print $3}')" 59 | docker rmi -f "$(docker images | grep -m 1 'spark-master' | awk '{print $3}')" 60 | docker rmi -f "$(docker images | grep -m 1 'spark-base' | awk '{print $3}')" 61 | 62 | docker rmi -f "$(docker images | grep -m 1 'base' | awk '{print $3}')" 63 | 64 | } 65 | 66 | function cleanVolume() { 67 | docker volume rm "distributed-file-system" 68 | } 69 | 70 | function buildImages() { 71 | 72 | 73 | docker build \ 74 | --build-arg build_date="${BUILD_DATE}" \ 75 | --build-arg scala_version="${SCALA_VERSION}" \ 76 | --build-arg delta_spark_version="${DELTA_SPARK_VERSION}" \ 77 | --build-arg deltalake_version="${DELTALAKE_VERSION}" \ 78 | --build-arg 
pandas_version="${PANDAS_VERSION}" \ 79 | -f docker/base/Dockerfile \ 80 | -t base:latest . 81 | 82 | docker build \ 83 | --build-arg build_date="${BUILD_DATE}" \ 84 | --build-arg scala_version="${SCALA_VERSION}" \ 85 | --build-arg delta_spark_version="${DELTA_SPARK_VERSION}" \ 86 | --build-arg deltalake_version="${DELTALAKE_VERSION}" \ 87 | --build-arg pandas_version="${PANDAS_VERSION}" \ 88 | --build-arg spark_version="${SPARK_VERSION}" \ 89 | --build-arg hadoop_version="${HADOOP_VERSION}" \ 90 | --build-arg delta_package_version="${DELTA_PACKAGE_VERSION}" \ 91 | --build-arg spark_xml_package_version="${SPARK_XML_PACKAGE_VERSION}" \ 92 | -f docker/spark-base/Dockerfile \ 93 | -t spark-base:${SPARK_VERSION} . 94 | 95 | docker build \ 96 | --build-arg build_date="${BUILD_DATE}" \ 97 | --build-arg spark_version="${SPARK_VERSION}" \ 98 | -f docker/spark-master/Dockerfile \ 99 | -t spark-master:${SPARK_VERSION} . 100 | 101 | docker build \ 102 | --build-arg build_date="${BUILD_DATE}" \ 103 | --build-arg spark_version="${SPARK_VERSION}" \ 104 | -f docker/spark-worker/Dockerfile \ 105 | -t spark-worker:${SPARK_VERSION} . 106 | 107 | docker build \ 108 | --build-arg build_date="${BUILD_DATE}" \ 109 | --build-arg scala_version="${SCALA_VERSION}" \ 110 | --build-arg delta_spark_version="${DELTA_SPARK_VERSION}" \ 111 | --build-arg deltalake_version="${DELTALAKE_VERSION}" \ 112 | --build-arg pandas_version="${PANDAS_VERSION}" \ 113 | --build-arg spark_version="${SPARK_VERSION}" \ 114 | --build-arg jupyterlab_version="${JUPYTERLAB_VERSION}" \ 115 | --build-arg sparksql_magic_version="${SPARKSQL_MAGIC_VERSION}" \ 116 | --build-arg kafka_python_version="${KAFKA_PYTHON_VERSION}" \ 117 | -f docker/jupyterlab/Dockerfile \ 118 | -t jupyterlab:${JUPYTERLAB_VERSION}-spark-${SPARK_VERSION} . 
119 | 120 | } 121 | 122 | # ---------------------------------------------------------------------------------------------------------------------- 123 | # -- Main -------------------------------------------------------------------------------------------------------------- 124 | # ---------------------------------------------------------------------------------------------------------------------- 125 | 126 | cleanContainers; 127 | cleanImages; 128 | cleanVolume; 129 | buildImages; 130 | -------------------------------------------------------------------------------- /data/Credit Card/FraudBase.csv: -------------------------------------------------------------------------------- 1 | Transaction_ID,Fraud_Flag 2 | CTID50558449,1 3 | CTID55936882,1 4 | CTID63762180,1 5 | CTID76723439,1 6 | CTID21246201,1 7 | CTID26555772,1 8 | CTID20567160,1 9 | CTID54759604,1 10 | CTID44626561,1 11 | CTID73773088,1 12 | CTID62499873,1 13 | CTID70746134,1 14 | CTID29266043,1 15 | CTID97776833,1 16 | CTID16281374,1 17 | CTID81479835,1 18 | CTID20456246,1 19 | CTID28195227,1 20 | CTID42582298,1 21 | CTID90938173,1 22 | CTID41668436,1 23 | CTID89260233,1 24 | CTID60575167,1 25 | CTID78032595,1 26 | CTID22306254,1 27 | CTID25962688,1 28 | CTID95859840,1 29 | CTID29469747,1 30 | CTID45648244,1 31 | CTID56303514,1 32 | CTID97760025,1 33 | CTID56692687,1 34 | CTID92439353,1 35 | CTID84579664,1 36 | CTID60614984,1 37 | CTID49913734,1 38 | CTID26134025,1 39 | CTID38789752,1 40 | CTID81793385,1 41 | CTID73000031,1 42 | CTID36997643,1 43 | CTID55429304,1 44 | CTID68050021,1 45 | CTID12041601,1 46 | CTID57707566,1 47 | CTID95135812,1 48 | CTID39550827,1 49 | CTID38931999,1 50 | CTID95884307,1 51 | CTID73429087,1 52 | CTID80787923,1 53 | CTID42980186,1 54 | CTID52594996,1 55 | CTID36309710,1 56 | CTID70222721,1 57 | CTID80436326,1 58 | CTID13865418,1 59 | CTID40881434,1 60 | CTID57357592,1 61 | CTID45385078,1 62 | CTID94511089,1 63 | CTID54381577,1 64 | CTID71795114,1 65 | CTID26238474,1 66 | CTID23350804,1 67 | CTID89116114,1 68 | CTID30763806,1 69 | CTID99066676,1 70 | CTID32892929,1 71 | CTID53972836,1 72 | CTID68584964,1 73 | CTID69594649,1 74 | CTID84024131,1 75 | CTID44773525,1 76 | CTID85085771,1 77 | CTID24913963,1 78 | CTID59571587,1 79 | CTID63195033,1 80 | CTID32907279,1 81 | CTID37742156,1 82 | CTID18338743,1 83 | CTID96772424,1 84 | CTID66265146,1 85 | CTID56680308,1 86 | CTID41847490,1 87 | CTID31867370,1 88 | CTID12270763,1 89 | CTID43014391,1 90 | CTID87470159,1 91 | CTID98722314,1 92 | CTID69782227,1 93 | CTID70707358,1 94 | CTID88342446,1 95 | CTID34574410,1 96 | CTID55853142,1 97 | CTID65786114,1 98 | CTID66191168,1 99 | CTID33699337,1 100 | CTID21253563,1 101 | CTID89585938,1 102 | CTID15034243,1 103 | CTID85930060,1 104 | CTID99663510,1 105 | CTID15730669,1 106 | CTID57993591,1 107 | CTID91108283,1 108 | CTID30494187,1 109 | CTID51301522,1 110 | CTID49517337,1 111 | -------------------------------------------------------------------------------- /data/partitioned_recipes/._SUCCESS.crc: -------------------------------------------------------------------------------- 1 | crc -------------------------------------------------------------------------------- /data/partitioned_recipes/DatePublished=2019-01-01/.part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet.crc: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/PacktPublishing/Data-Engineering-with-Databricks-Cookbook/fa1657c3808c0d520ea19a49f967df510b4f627d/data/partitioned_recipes/DatePublished=2019-01-01/.part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet.crc -------------------------------------------------------------------------------- /data/partitioned_recipes/DatePublished=2019-01-01/part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Data-Engineering-with-Databricks-Cookbook/fa1657c3808c0d520ea19a49f967df510b4f627d/data/partitioned_recipes/DatePublished=2019-01-01/part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet -------------------------------------------------------------------------------- /data/partitioned_recipes/DatePublished=2019-01-02/.part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet.crc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Data-Engineering-with-Databricks-Cookbook/fa1657c3808c0d520ea19a49f967df510b4f627d/data/partitioned_recipes/DatePublished=2019-01-02/.part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet.crc -------------------------------------------------------------------------------- /data/partitioned_recipes/DatePublished=2019-01-02/part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Data-Engineering-with-Databricks-Cookbook/fa1657c3808c0d520ea19a49f967df510b4f627d/data/partitioned_recipes/DatePublished=2019-01-02/part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet -------------------------------------------------------------------------------- /data/partitioned_recipes/DatePublished=2019-01-03/.part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet.crc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Data-Engineering-with-Databricks-Cookbook/fa1657c3808c0d520ea19a49f967df510b4f627d/data/partitioned_recipes/DatePublished=2019-01-03/.part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet.crc -------------------------------------------------------------------------------- /data/partitioned_recipes/DatePublished=2019-01-03/part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Data-Engineering-with-Databricks-Cookbook/fa1657c3808c0d520ea19a49f967df510b4f627d/data/partitioned_recipes/DatePublished=2019-01-03/part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet -------------------------------------------------------------------------------- /data/partitioned_recipes/DatePublished=2019-01-04/.part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet.crc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Data-Engineering-with-Databricks-Cookbook/fa1657c3808c0d520ea19a49f967df510b4f627d/data/partitioned_recipes/DatePublished=2019-01-04/.part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet.crc 
-------------------------------------------------------------------------------- /data/partitioned_recipes/DatePublished=2019-01-04/part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Data-Engineering-with-Databricks-Cookbook/fa1657c3808c0d520ea19a49f967df510b4f627d/data/partitioned_recipes/DatePublished=2019-01-04/part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet -------------------------------------------------------------------------------- /data/partitioned_recipes/DatePublished=2019-01-05/.part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet.crc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Data-Engineering-with-Databricks-Cookbook/fa1657c3808c0d520ea19a49f967df510b4f627d/data/partitioned_recipes/DatePublished=2019-01-05/.part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet.crc -------------------------------------------------------------------------------- /data/partitioned_recipes/DatePublished=2019-01-05/part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Data-Engineering-with-Databricks-Cookbook/fa1657c3808c0d520ea19a49f967df510b4f627d/data/partitioned_recipes/DatePublished=2019-01-05/part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet -------------------------------------------------------------------------------- /data/partitioned_recipes/DatePublished=2019-01-06/.part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet.crc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Data-Engineering-with-Databricks-Cookbook/fa1657c3808c0d520ea19a49f967df510b4f627d/data/partitioned_recipes/DatePublished=2019-01-06/.part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet.crc -------------------------------------------------------------------------------- /data/partitioned_recipes/DatePublished=2019-01-06/part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Data-Engineering-with-Databricks-Cookbook/fa1657c3808c0d520ea19a49f967df510b4f627d/data/partitioned_recipes/DatePublished=2019-01-06/part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet -------------------------------------------------------------------------------- /data/partitioned_recipes/DatePublished=2019-01-07/.part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet.crc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Data-Engineering-with-Databricks-Cookbook/fa1657c3808c0d520ea19a49f967df510b4f627d/data/partitioned_recipes/DatePublished=2019-01-07/.part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet.crc -------------------------------------------------------------------------------- /data/partitioned_recipes/DatePublished=2019-01-07/part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/PacktPublishing/Data-Engineering-with-Databricks-Cookbook/fa1657c3808c0d520ea19a49f967df510b4f627d/data/partitioned_recipes/DatePublished=2019-01-07/part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet -------------------------------------------------------------------------------- /data/partitioned_recipes/DatePublished=2019-01-08/.part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet.crc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Data-Engineering-with-Databricks-Cookbook/fa1657c3808c0d520ea19a49f967df510b4f627d/data/partitioned_recipes/DatePublished=2019-01-08/.part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet.crc -------------------------------------------------------------------------------- /data/partitioned_recipes/DatePublished=2019-01-08/part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Data-Engineering-with-Databricks-Cookbook/fa1657c3808c0d520ea19a49f967df510b4f627d/data/partitioned_recipes/DatePublished=2019-01-08/part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet -------------------------------------------------------------------------------- /data/partitioned_recipes/DatePublished=2019-01-09/.part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet.crc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Data-Engineering-with-Databricks-Cookbook/fa1657c3808c0d520ea19a49f967df510b4f627d/data/partitioned_recipes/DatePublished=2019-01-09/.part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet.crc -------------------------------------------------------------------------------- /data/partitioned_recipes/DatePublished=2019-01-09/part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Data-Engineering-with-Databricks-Cookbook/fa1657c3808c0d520ea19a49f967df510b4f627d/data/partitioned_recipes/DatePublished=2019-01-09/part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet -------------------------------------------------------------------------------- /data/partitioned_recipes/DatePublished=2019-01-10/.part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet.crc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Data-Engineering-with-Databricks-Cookbook/fa1657c3808c0d520ea19a49f967df510b4f627d/data/partitioned_recipes/DatePublished=2019-01-10/.part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet.crc -------------------------------------------------------------------------------- /data/partitioned_recipes/DatePublished=2019-01-10/part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Data-Engineering-with-Databricks-Cookbook/fa1657c3808c0d520ea19a49f967df510b4f627d/data/partitioned_recipes/DatePublished=2019-01-10/part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet 
-------------------------------------------------------------------------------- /data/partitioned_recipes/DatePublished=2019-01-11/.part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet.crc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Data-Engineering-with-Databricks-Cookbook/fa1657c3808c0d520ea19a49f967df510b4f627d/data/partitioned_recipes/DatePublished=2019-01-11/.part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet.crc -------------------------------------------------------------------------------- /data/partitioned_recipes/DatePublished=2019-01-11/part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Data-Engineering-with-Databricks-Cookbook/fa1657c3808c0d520ea19a49f967df510b4f627d/data/partitioned_recipes/DatePublished=2019-01-11/part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet -------------------------------------------------------------------------------- /data/partitioned_recipes/DatePublished=2019-01-12/.part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet.crc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Data-Engineering-with-Databricks-Cookbook/fa1657c3808c0d520ea19a49f967df510b4f627d/data/partitioned_recipes/DatePublished=2019-01-12/.part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet.crc -------------------------------------------------------------------------------- /data/partitioned_recipes/DatePublished=2019-01-12/part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Data-Engineering-with-Databricks-Cookbook/fa1657c3808c0d520ea19a49f967df510b4f627d/data/partitioned_recipes/DatePublished=2019-01-12/part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet -------------------------------------------------------------------------------- /data/partitioned_recipes/DatePublished=2019-01-13/.part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet.crc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Data-Engineering-with-Databricks-Cookbook/fa1657c3808c0d520ea19a49f967df510b4f627d/data/partitioned_recipes/DatePublished=2019-01-13/.part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet.crc -------------------------------------------------------------------------------- /data/partitioned_recipes/DatePublished=2019-01-13/part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Data-Engineering-with-Databricks-Cookbook/fa1657c3808c0d520ea19a49f967df510b4f627d/data/partitioned_recipes/DatePublished=2019-01-13/part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet -------------------------------------------------------------------------------- /data/partitioned_recipes/DatePublished=2019-01-14/.part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet.crc: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/PacktPublishing/Data-Engineering-with-Databricks-Cookbook/fa1657c3808c0d520ea19a49f967df510b4f627d/data/partitioned_recipes/DatePublished=2019-01-14/.part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet.crc -------------------------------------------------------------------------------- /data/partitioned_recipes/DatePublished=2019-01-14/part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Data-Engineering-with-Databricks-Cookbook/fa1657c3808c0d520ea19a49f967df510b4f627d/data/partitioned_recipes/DatePublished=2019-01-14/part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet -------------------------------------------------------------------------------- /data/partitioned_recipes/DatePublished=2019-01-15/.part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet.crc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Data-Engineering-with-Databricks-Cookbook/fa1657c3808c0d520ea19a49f967df510b4f627d/data/partitioned_recipes/DatePublished=2019-01-15/.part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet.crc -------------------------------------------------------------------------------- /data/partitioned_recipes/DatePublished=2019-01-15/part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Data-Engineering-with-Databricks-Cookbook/fa1657c3808c0d520ea19a49f967df510b4f627d/data/partitioned_recipes/DatePublished=2019-01-15/part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet -------------------------------------------------------------------------------- /data/partitioned_recipes/DatePublished=2019-01-16/.part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet.crc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Data-Engineering-with-Databricks-Cookbook/fa1657c3808c0d520ea19a49f967df510b4f627d/data/partitioned_recipes/DatePublished=2019-01-16/.part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet.crc -------------------------------------------------------------------------------- /data/partitioned_recipes/DatePublished=2019-01-16/part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Data-Engineering-with-Databricks-Cookbook/fa1657c3808c0d520ea19a49f967df510b4f627d/data/partitioned_recipes/DatePublished=2019-01-16/part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet -------------------------------------------------------------------------------- /data/partitioned_recipes/DatePublished=2019-01-17/.part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet.crc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Data-Engineering-with-Databricks-Cookbook/fa1657c3808c0d520ea19a49f967df510b4f627d/data/partitioned_recipes/DatePublished=2019-01-17/.part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet.crc 
-------------------------------------------------------------------------------- /data/partitioned_recipes/DatePublished=2019-01-17/part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Data-Engineering-with-Databricks-Cookbook/fa1657c3808c0d520ea19a49f967df510b4f627d/data/partitioned_recipes/DatePublished=2019-01-17/part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet -------------------------------------------------------------------------------- /data/partitioned_recipes/DatePublished=2019-01-18/.part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet.crc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Data-Engineering-with-Databricks-Cookbook/fa1657c3808c0d520ea19a49f967df510b4f627d/data/partitioned_recipes/DatePublished=2019-01-18/.part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet.crc -------------------------------------------------------------------------------- /data/partitioned_recipes/DatePublished=2019-01-18/part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Data-Engineering-with-Databricks-Cookbook/fa1657c3808c0d520ea19a49f967df510b4f627d/data/partitioned_recipes/DatePublished=2019-01-18/part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet -------------------------------------------------------------------------------- /data/partitioned_recipes/DatePublished=2019-01-19/.part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet.crc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Data-Engineering-with-Databricks-Cookbook/fa1657c3808c0d520ea19a49f967df510b4f627d/data/partitioned_recipes/DatePublished=2019-01-19/.part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet.crc -------------------------------------------------------------------------------- /data/partitioned_recipes/DatePublished=2019-01-19/part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Data-Engineering-with-Databricks-Cookbook/fa1657c3808c0d520ea19a49f967df510b4f627d/data/partitioned_recipes/DatePublished=2019-01-19/part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet -------------------------------------------------------------------------------- /data/partitioned_recipes/DatePublished=2019-01-20/.part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet.crc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Data-Engineering-with-Databricks-Cookbook/fa1657c3808c0d520ea19a49f967df510b4f627d/data/partitioned_recipes/DatePublished=2019-01-20/.part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet.crc -------------------------------------------------------------------------------- /data/partitioned_recipes/DatePublished=2019-01-20/part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/PacktPublishing/Data-Engineering-with-Databricks-Cookbook/fa1657c3808c0d520ea19a49f967df510b4f627d/data/partitioned_recipes/DatePublished=2019-01-20/part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet -------------------------------------------------------------------------------- /data/partitioned_recipes/DatePublished=2019-01-21/.part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet.crc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Data-Engineering-with-Databricks-Cookbook/fa1657c3808c0d520ea19a49f967df510b4f627d/data/partitioned_recipes/DatePublished=2019-01-21/.part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet.crc -------------------------------------------------------------------------------- /data/partitioned_recipes/DatePublished=2019-01-21/part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Data-Engineering-with-Databricks-Cookbook/fa1657c3808c0d520ea19a49f967df510b4f627d/data/partitioned_recipes/DatePublished=2019-01-21/part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet -------------------------------------------------------------------------------- /data/partitioned_recipes/DatePublished=2019-01-22/.part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet.crc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Data-Engineering-with-Databricks-Cookbook/fa1657c3808c0d520ea19a49f967df510b4f627d/data/partitioned_recipes/DatePublished=2019-01-22/.part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet.crc -------------------------------------------------------------------------------- /data/partitioned_recipes/DatePublished=2019-01-22/part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Data-Engineering-with-Databricks-Cookbook/fa1657c3808c0d520ea19a49f967df510b4f627d/data/partitioned_recipes/DatePublished=2019-01-22/part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet -------------------------------------------------------------------------------- /data/partitioned_recipes/DatePublished=2019-01-23/.part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet.crc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Data-Engineering-with-Databricks-Cookbook/fa1657c3808c0d520ea19a49f967df510b4f627d/data/partitioned_recipes/DatePublished=2019-01-23/.part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet.crc -------------------------------------------------------------------------------- /data/partitioned_recipes/DatePublished=2019-01-23/part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Data-Engineering-with-Databricks-Cookbook/fa1657c3808c0d520ea19a49f967df510b4f627d/data/partitioned_recipes/DatePublished=2019-01-23/part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet 
-------------------------------------------------------------------------------- /data/partitioned_recipes/DatePublished=2019-01-24/.part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet.crc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Data-Engineering-with-Databricks-Cookbook/fa1657c3808c0d520ea19a49f967df510b4f627d/data/partitioned_recipes/DatePublished=2019-01-24/.part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet.crc -------------------------------------------------------------------------------- /data/partitioned_recipes/DatePublished=2019-01-24/part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Data-Engineering-with-Databricks-Cookbook/fa1657c3808c0d520ea19a49f967df510b4f627d/data/partitioned_recipes/DatePublished=2019-01-24/part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet -------------------------------------------------------------------------------- /data/partitioned_recipes/DatePublished=2019-01-25/.part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet.crc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Data-Engineering-with-Databricks-Cookbook/fa1657c3808c0d520ea19a49f967df510b4f627d/data/partitioned_recipes/DatePublished=2019-01-25/.part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet.crc -------------------------------------------------------------------------------- /data/partitioned_recipes/DatePublished=2019-01-25/part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Data-Engineering-with-Databricks-Cookbook/fa1657c3808c0d520ea19a49f967df510b4f627d/data/partitioned_recipes/DatePublished=2019-01-25/part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet -------------------------------------------------------------------------------- /data/partitioned_recipes/DatePublished=2019-01-26/.part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet.crc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Data-Engineering-with-Databricks-Cookbook/fa1657c3808c0d520ea19a49f967df510b4f627d/data/partitioned_recipes/DatePublished=2019-01-26/.part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet.crc -------------------------------------------------------------------------------- /data/partitioned_recipes/DatePublished=2019-01-26/part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Data-Engineering-with-Databricks-Cookbook/fa1657c3808c0d520ea19a49f967df510b4f627d/data/partitioned_recipes/DatePublished=2019-01-26/part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet -------------------------------------------------------------------------------- /data/partitioned_recipes/DatePublished=2019-01-27/.part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet.crc: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/PacktPublishing/Data-Engineering-with-Databricks-Cookbook/fa1657c3808c0d520ea19a49f967df510b4f627d/data/partitioned_recipes/DatePublished=2019-01-27/.part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet.crc -------------------------------------------------------------------------------- /data/partitioned_recipes/DatePublished=2019-01-27/part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Data-Engineering-with-Databricks-Cookbook/fa1657c3808c0d520ea19a49f967df510b4f627d/data/partitioned_recipes/DatePublished=2019-01-27/part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet -------------------------------------------------------------------------------- /data/partitioned_recipes/DatePublished=2019-01-28/.part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet.crc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Data-Engineering-with-Databricks-Cookbook/fa1657c3808c0d520ea19a49f967df510b4f627d/data/partitioned_recipes/DatePublished=2019-01-28/.part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet.crc -------------------------------------------------------------------------------- /data/partitioned_recipes/DatePublished=2019-01-28/part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Data-Engineering-with-Databricks-Cookbook/fa1657c3808c0d520ea19a49f967df510b4f627d/data/partitioned_recipes/DatePublished=2019-01-28/part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet -------------------------------------------------------------------------------- /data/partitioned_recipes/DatePublished=2019-01-29/.part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet.crc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Data-Engineering-with-Databricks-Cookbook/fa1657c3808c0d520ea19a49f967df510b4f627d/data/partitioned_recipes/DatePublished=2019-01-29/.part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet.crc -------------------------------------------------------------------------------- /data/partitioned_recipes/DatePublished=2019-01-29/part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Data-Engineering-with-Databricks-Cookbook/fa1657c3808c0d520ea19a49f967df510b4f627d/data/partitioned_recipes/DatePublished=2019-01-29/part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet -------------------------------------------------------------------------------- /data/partitioned_recipes/DatePublished=2019-01-30/.part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet.crc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Data-Engineering-with-Databricks-Cookbook/fa1657c3808c0d520ea19a49f967df510b4f627d/data/partitioned_recipes/DatePublished=2019-01-30/.part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet.crc 
-------------------------------------------------------------------------------- /data/partitioned_recipes/DatePublished=2019-01-30/part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Data-Engineering-with-Databricks-Cookbook/fa1657c3808c0d520ea19a49f967df510b4f627d/data/partitioned_recipes/DatePublished=2019-01-30/part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet -------------------------------------------------------------------------------- /data/partitioned_recipes/DatePublished=2019-01-31/.part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet.crc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Data-Engineering-with-Databricks-Cookbook/fa1657c3808c0d520ea19a49f967df510b4f627d/data/partitioned_recipes/DatePublished=2019-01-31/.part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet.crc -------------------------------------------------------------------------------- /data/partitioned_recipes/DatePublished=2019-01-31/part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Data-Engineering-with-Databricks-Cookbook/fa1657c3808c0d520ea19a49f967df510b4f627d/data/partitioned_recipes/DatePublished=2019-01-31/part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet -------------------------------------------------------------------------------- /data/partitioned_recipes/DatePublished=2020-01-01/.part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet.crc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Data-Engineering-with-Databricks-Cookbook/fa1657c3808c0d520ea19a49f967df510b4f627d/data/partitioned_recipes/DatePublished=2020-01-01/.part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet.crc -------------------------------------------------------------------------------- /data/partitioned_recipes/DatePublished=2020-01-01/part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Data-Engineering-with-Databricks-Cookbook/fa1657c3808c0d520ea19a49f967df510b4f627d/data/partitioned_recipes/DatePublished=2020-01-01/part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet -------------------------------------------------------------------------------- /data/partitioned_recipes/DatePublished=2020-01-02/.part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet.crc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Data-Engineering-with-Databricks-Cookbook/fa1657c3808c0d520ea19a49f967df510b4f627d/data/partitioned_recipes/DatePublished=2020-01-02/.part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet.crc -------------------------------------------------------------------------------- /data/partitioned_recipes/DatePublished=2020-01-02/part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/PacktPublishing/Data-Engineering-with-Databricks-Cookbook/fa1657c3808c0d520ea19a49f967df510b4f627d/data/partitioned_recipes/DatePublished=2020-01-02/part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet -------------------------------------------------------------------------------- /data/partitioned_recipes/DatePublished=2020-01-03/.part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet.crc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Data-Engineering-with-Databricks-Cookbook/fa1657c3808c0d520ea19a49f967df510b4f627d/data/partitioned_recipes/DatePublished=2020-01-03/.part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet.crc -------------------------------------------------------------------------------- /data/partitioned_recipes/DatePublished=2020-01-03/part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Data-Engineering-with-Databricks-Cookbook/fa1657c3808c0d520ea19a49f967df510b4f627d/data/partitioned_recipes/DatePublished=2020-01-03/part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet -------------------------------------------------------------------------------- /data/partitioned_recipes/DatePublished=2020-01-07/.part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet.crc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Data-Engineering-with-Databricks-Cookbook/fa1657c3808c0d520ea19a49f967df510b4f627d/data/partitioned_recipes/DatePublished=2020-01-07/.part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet.crc -------------------------------------------------------------------------------- /data/partitioned_recipes/DatePublished=2020-01-07/part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Data-Engineering-with-Databricks-Cookbook/fa1657c3808c0d520ea19a49f967df510b4f627d/data/partitioned_recipes/DatePublished=2020-01-07/part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet -------------------------------------------------------------------------------- /data/partitioned_recipes/DatePublished=2020-01-09/.part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet.crc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Data-Engineering-with-Databricks-Cookbook/fa1657c3808c0d520ea19a49f967df510b4f627d/data/partitioned_recipes/DatePublished=2020-01-09/.part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet.crc -------------------------------------------------------------------------------- /data/partitioned_recipes/DatePublished=2020-01-09/part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Data-Engineering-with-Databricks-Cookbook/fa1657c3808c0d520ea19a49f967df510b4f627d/data/partitioned_recipes/DatePublished=2020-01-09/part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet 
-------------------------------------------------------------------------------- /data/partitioned_recipes/DatePublished=2020-01-10/.part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet.crc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Data-Engineering-with-Databricks-Cookbook/fa1657c3808c0d520ea19a49f967df510b4f627d/data/partitioned_recipes/DatePublished=2020-01-10/.part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet.crc -------------------------------------------------------------------------------- /data/partitioned_recipes/DatePublished=2020-01-10/part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Data-Engineering-with-Databricks-Cookbook/fa1657c3808c0d520ea19a49f967df510b4f627d/data/partitioned_recipes/DatePublished=2020-01-10/part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet -------------------------------------------------------------------------------- /data/partitioned_recipes/DatePublished=2020-01-13/.part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet.crc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Data-Engineering-with-Databricks-Cookbook/fa1657c3808c0d520ea19a49f967df510b4f627d/data/partitioned_recipes/DatePublished=2020-01-13/.part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet.crc -------------------------------------------------------------------------------- /data/partitioned_recipes/DatePublished=2020-01-13/part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Data-Engineering-with-Databricks-Cookbook/fa1657c3808c0d520ea19a49f967df510b4f627d/data/partitioned_recipes/DatePublished=2020-01-13/part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet -------------------------------------------------------------------------------- /data/partitioned_recipes/DatePublished=2020-01-14/.part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet.crc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Data-Engineering-with-Databricks-Cookbook/fa1657c3808c0d520ea19a49f967df510b4f627d/data/partitioned_recipes/DatePublished=2020-01-14/.part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet.crc -------------------------------------------------------------------------------- /data/partitioned_recipes/DatePublished=2020-01-14/part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Data-Engineering-with-Databricks-Cookbook/fa1657c3808c0d520ea19a49f967df510b4f627d/data/partitioned_recipes/DatePublished=2020-01-14/part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet -------------------------------------------------------------------------------- /data/partitioned_recipes/DatePublished=2020-01-16/.part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet.crc: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/PacktPublishing/Data-Engineering-with-Databricks-Cookbook/fa1657c3808c0d520ea19a49f967df510b4f627d/data/partitioned_recipes/DatePublished=2020-01-16/.part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet.crc -------------------------------------------------------------------------------- /data/partitioned_recipes/DatePublished=2020-01-16/part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Data-Engineering-with-Databricks-Cookbook/fa1657c3808c0d520ea19a49f967df510b4f627d/data/partitioned_recipes/DatePublished=2020-01-16/part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet -------------------------------------------------------------------------------- /data/partitioned_recipes/DatePublished=2020-01-21/.part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet.crc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Data-Engineering-with-Databricks-Cookbook/fa1657c3808c0d520ea19a49f967df510b4f627d/data/partitioned_recipes/DatePublished=2020-01-21/.part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet.crc -------------------------------------------------------------------------------- /data/partitioned_recipes/DatePublished=2020-01-21/part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Data-Engineering-with-Databricks-Cookbook/fa1657c3808c0d520ea19a49f967df510b4f627d/data/partitioned_recipes/DatePublished=2020-01-21/part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet -------------------------------------------------------------------------------- /data/partitioned_recipes/DatePublished=2020-01-23/.part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet.crc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Data-Engineering-with-Databricks-Cookbook/fa1657c3808c0d520ea19a49f967df510b4f627d/data/partitioned_recipes/DatePublished=2020-01-23/.part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet.crc -------------------------------------------------------------------------------- /data/partitioned_recipes/DatePublished=2020-01-23/part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Data-Engineering-with-Databricks-Cookbook/fa1657c3808c0d520ea19a49f967df510b4f627d/data/partitioned_recipes/DatePublished=2020-01-23/part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet -------------------------------------------------------------------------------- /data/partitioned_recipes/DatePublished=2020-01-28/.part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet.crc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Data-Engineering-with-Databricks-Cookbook/fa1657c3808c0d520ea19a49f967df510b4f627d/data/partitioned_recipes/DatePublished=2020-01-28/.part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet.crc 
-------------------------------------------------------------------------------- /data/partitioned_recipes/DatePublished=2020-01-28/part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Data-Engineering-with-Databricks-Cookbook/fa1657c3808c0d520ea19a49f967df510b4f627d/data/partitioned_recipes/DatePublished=2020-01-28/part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet -------------------------------------------------------------------------------- /data/partitioned_recipes/DatePublished=2020-01-29/.part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet.crc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Data-Engineering-with-Databricks-Cookbook/fa1657c3808c0d520ea19a49f967df510b4f627d/data/partitioned_recipes/DatePublished=2020-01-29/.part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet.crc -------------------------------------------------------------------------------- /data/partitioned_recipes/DatePublished=2020-01-29/part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Data-Engineering-with-Databricks-Cookbook/fa1657c3808c0d520ea19a49f967df510b4f627d/data/partitioned_recipes/DatePublished=2020-01-29/part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet -------------------------------------------------------------------------------- /data/partitioned_recipes/DatePublished=2020-01-30/.part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet.crc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Data-Engineering-with-Databricks-Cookbook/fa1657c3808c0d520ea19a49f967df510b4f627d/data/partitioned_recipes/DatePublished=2020-01-30/.part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet.crc -------------------------------------------------------------------------------- /data/partitioned_recipes/DatePublished=2020-01-30/part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Data-Engineering-with-Databricks-Cookbook/fa1657c3808c0d520ea19a49f967df510b4f627d/data/partitioned_recipes/DatePublished=2020-01-30/part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet -------------------------------------------------------------------------------- /data/partitioned_recipes/DatePublished=2020-01-31/.part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet.crc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Data-Engineering-with-Databricks-Cookbook/fa1657c3808c0d520ea19a49f967df510b4f627d/data/partitioned_recipes/DatePublished=2020-01-31/.part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet.crc -------------------------------------------------------------------------------- /data/partitioned_recipes/DatePublished=2020-01-31/part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/PacktPublishing/Data-Engineering-with-Databricks-Cookbook/fa1657c3808c0d520ea19a49f967df510b4f627d/data/partitioned_recipes/DatePublished=2020-01-31/part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet -------------------------------------------------------------------------------- /data/partitioned_recipes/DatePublished=2020-02-03/.part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet.crc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Data-Engineering-with-Databricks-Cookbook/fa1657c3808c0d520ea19a49f967df510b4f627d/data/partitioned_recipes/DatePublished=2020-02-03/.part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet.crc -------------------------------------------------------------------------------- /data/partitioned_recipes/DatePublished=2020-02-03/part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Data-Engineering-with-Databricks-Cookbook/fa1657c3808c0d520ea19a49f967df510b4f627d/data/partitioned_recipes/DatePublished=2020-02-03/part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet -------------------------------------------------------------------------------- /data/partitioned_recipes/DatePublished=2020-02-04/.part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet.crc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Data-Engineering-with-Databricks-Cookbook/fa1657c3808c0d520ea19a49f967df510b4f627d/data/partitioned_recipes/DatePublished=2020-02-04/.part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet.crc -------------------------------------------------------------------------------- /data/partitioned_recipes/DatePublished=2020-02-04/part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Data-Engineering-with-Databricks-Cookbook/fa1657c3808c0d520ea19a49f967df510b4f627d/data/partitioned_recipes/DatePublished=2020-02-04/part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet -------------------------------------------------------------------------------- /data/partitioned_recipes/DatePublished=2020-02-05/.part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet.crc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Data-Engineering-with-Databricks-Cookbook/fa1657c3808c0d520ea19a49f967df510b4f627d/data/partitioned_recipes/DatePublished=2020-02-05/.part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet.crc -------------------------------------------------------------------------------- /data/partitioned_recipes/DatePublished=2020-02-05/part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Data-Engineering-with-Databricks-Cookbook/fa1657c3808c0d520ea19a49f967df510b4f627d/data/partitioned_recipes/DatePublished=2020-02-05/part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet 
-------------------------------------------------------------------------------- /data/partitioned_recipes/DatePublished=2020-02-06/.part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet.crc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Data-Engineering-with-Databricks-Cookbook/fa1657c3808c0d520ea19a49f967df510b4f627d/data/partitioned_recipes/DatePublished=2020-02-06/.part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet.crc -------------------------------------------------------------------------------- /data/partitioned_recipes/DatePublished=2020-02-06/part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Data-Engineering-with-Databricks-Cookbook/fa1657c3808c0d520ea19a49f967df510b4f627d/data/partitioned_recipes/DatePublished=2020-02-06/part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet -------------------------------------------------------------------------------- /data/partitioned_recipes/DatePublished=2020-02-07/.part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet.crc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Data-Engineering-with-Databricks-Cookbook/fa1657c3808c0d520ea19a49f967df510b4f627d/data/partitioned_recipes/DatePublished=2020-02-07/.part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet.crc -------------------------------------------------------------------------------- /data/partitioned_recipes/DatePublished=2020-02-07/part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Data-Engineering-with-Databricks-Cookbook/fa1657c3808c0d520ea19a49f967df510b4f627d/data/partitioned_recipes/DatePublished=2020-02-07/part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet -------------------------------------------------------------------------------- /data/partitioned_recipes/DatePublished=2020-02-10/.part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet.crc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Data-Engineering-with-Databricks-Cookbook/fa1657c3808c0d520ea19a49f967df510b4f627d/data/partitioned_recipes/DatePublished=2020-02-10/.part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet.crc -------------------------------------------------------------------------------- /data/partitioned_recipes/DatePublished=2020-02-10/part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Data-Engineering-with-Databricks-Cookbook/fa1657c3808c0d520ea19a49f967df510b4f627d/data/partitioned_recipes/DatePublished=2020-02-10/part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet -------------------------------------------------------------------------------- /data/partitioned_recipes/DatePublished=2020-02-11/.part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet.crc: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/PacktPublishing/Data-Engineering-with-Databricks-Cookbook/fa1657c3808c0d520ea19a49f967df510b4f627d/data/partitioned_recipes/DatePublished=2020-02-11/.part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet.crc -------------------------------------------------------------------------------- /data/partitioned_recipes/DatePublished=2020-02-11/part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Data-Engineering-with-Databricks-Cookbook/fa1657c3808c0d520ea19a49f967df510b4f627d/data/partitioned_recipes/DatePublished=2020-02-11/part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet -------------------------------------------------------------------------------- /data/partitioned_recipes/DatePublished=2020-02-13/.part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet.crc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Data-Engineering-with-Databricks-Cookbook/fa1657c3808c0d520ea19a49f967df510b4f627d/data/partitioned_recipes/DatePublished=2020-02-13/.part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet.crc -------------------------------------------------------------------------------- /data/partitioned_recipes/DatePublished=2020-02-13/part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Data-Engineering-with-Databricks-Cookbook/fa1657c3808c0d520ea19a49f967df510b4f627d/data/partitioned_recipes/DatePublished=2020-02-13/part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet -------------------------------------------------------------------------------- /data/partitioned_recipes/DatePublished=2020-02-14/.part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet.crc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Data-Engineering-with-Databricks-Cookbook/fa1657c3808c0d520ea19a49f967df510b4f627d/data/partitioned_recipes/DatePublished=2020-02-14/.part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet.crc -------------------------------------------------------------------------------- /data/partitioned_recipes/DatePublished=2020-02-14/part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Data-Engineering-with-Databricks-Cookbook/fa1657c3808c0d520ea19a49f967df510b4f627d/data/partitioned_recipes/DatePublished=2020-02-14/part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet -------------------------------------------------------------------------------- /data/partitioned_recipes/DatePublished=2020-02-16/.part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet.crc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Data-Engineering-with-Databricks-Cookbook/fa1657c3808c0d520ea19a49f967df510b4f627d/data/partitioned_recipes/DatePublished=2020-02-16/.part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet.crc 
-------------------------------------------------------------------------------- /data/partitioned_recipes/DatePublished=2020-02-16/part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Data-Engineering-with-Databricks-Cookbook/fa1657c3808c0d520ea19a49f967df510b4f627d/data/partitioned_recipes/DatePublished=2020-02-16/part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet -------------------------------------------------------------------------------- /data/partitioned_recipes/DatePublished=2020-02-18/.part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet.crc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Data-Engineering-with-Databricks-Cookbook/fa1657c3808c0d520ea19a49f967df510b4f627d/data/partitioned_recipes/DatePublished=2020-02-18/.part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet.crc -------------------------------------------------------------------------------- /data/partitioned_recipes/DatePublished=2020-02-18/part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Data-Engineering-with-Databricks-Cookbook/fa1657c3808c0d520ea19a49f967df510b4f627d/data/partitioned_recipes/DatePublished=2020-02-18/part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet -------------------------------------------------------------------------------- /data/partitioned_recipes/DatePublished=2020-02-20/.part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet.crc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Data-Engineering-with-Databricks-Cookbook/fa1657c3808c0d520ea19a49f967df510b4f627d/data/partitioned_recipes/DatePublished=2020-02-20/.part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet.crc -------------------------------------------------------------------------------- /data/partitioned_recipes/DatePublished=2020-02-20/part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Data-Engineering-with-Databricks-Cookbook/fa1657c3808c0d520ea19a49f967df510b4f627d/data/partitioned_recipes/DatePublished=2020-02-20/part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet -------------------------------------------------------------------------------- /data/partitioned_recipes/DatePublished=2020-02-23/.part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet.crc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Data-Engineering-with-Databricks-Cookbook/fa1657c3808c0d520ea19a49f967df510b4f627d/data/partitioned_recipes/DatePublished=2020-02-23/.part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet.crc -------------------------------------------------------------------------------- /data/partitioned_recipes/DatePublished=2020-02-23/part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/PacktPublishing/Data-Engineering-with-Databricks-Cookbook/fa1657c3808c0d520ea19a49f967df510b4f627d/data/partitioned_recipes/DatePublished=2020-02-23/part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet -------------------------------------------------------------------------------- /data/partitioned_recipes/DatePublished=2020-02-24/.part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet.crc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Data-Engineering-with-Databricks-Cookbook/fa1657c3808c0d520ea19a49f967df510b4f627d/data/partitioned_recipes/DatePublished=2020-02-24/.part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet.crc -------------------------------------------------------------------------------- /data/partitioned_recipes/DatePublished=2020-02-24/part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Data-Engineering-with-Databricks-Cookbook/fa1657c3808c0d520ea19a49f967df510b4f627d/data/partitioned_recipes/DatePublished=2020-02-24/part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet -------------------------------------------------------------------------------- /data/partitioned_recipes/DatePublished=2020-02-25/.part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet.crc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Data-Engineering-with-Databricks-Cookbook/fa1657c3808c0d520ea19a49f967df510b4f627d/data/partitioned_recipes/DatePublished=2020-02-25/.part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet.crc -------------------------------------------------------------------------------- /data/partitioned_recipes/DatePublished=2020-02-25/part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Data-Engineering-with-Databricks-Cookbook/fa1657c3808c0d520ea19a49f967df510b4f627d/data/partitioned_recipes/DatePublished=2020-02-25/part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet -------------------------------------------------------------------------------- /data/partitioned_recipes/DatePublished=2020-02-27/.part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet.crc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Data-Engineering-with-Databricks-Cookbook/fa1657c3808c0d520ea19a49f967df510b4f627d/data/partitioned_recipes/DatePublished=2020-02-27/.part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet.crc -------------------------------------------------------------------------------- /data/partitioned_recipes/DatePublished=2020-02-27/part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Data-Engineering-with-Databricks-Cookbook/fa1657c3808c0d520ea19a49f967df510b4f627d/data/partitioned_recipes/DatePublished=2020-02-27/part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet 
-------------------------------------------------------------------------------- /data/partitioned_recipes/DatePublished=2020-02-28/.part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet.crc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Data-Engineering-with-Databricks-Cookbook/fa1657c3808c0d520ea19a49f967df510b4f627d/data/partitioned_recipes/DatePublished=2020-02-28/.part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet.crc -------------------------------------------------------------------------------- /data/partitioned_recipes/DatePublished=2020-02-28/part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Data-Engineering-with-Databricks-Cookbook/fa1657c3808c0d520ea19a49f967df510b4f627d/data/partitioned_recipes/DatePublished=2020-02-28/part-00000-d43e82d2-b639-4714-bbe9-87a246b38387.c000.snappy.parquet -------------------------------------------------------------------------------- /data/partitioned_recipes/_SUCCESS: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Data-Engineering-with-Databricks-Cookbook/fa1657c3808c0d520ea19a49f967df510b4f627d/data/partitioned_recipes/_SUCCESS -------------------------------------------------------------------------------- /data/recipes.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Data-Engineering-with-Databricks-Cookbook/fa1657c3808c0d520ea19a49f967df510b4f627d/data/recipes.parquet
--------------------------------------------------------------------------------
/diagrams/Chapter10/unity_catalog_hierarchy.md:
--------------------------------------------------------------------------------
```mermaid
flowchart TD
    Metastore --> strCred(Storage\nCredentials)
    Metastore --> extLoc(External\nLocation)
    Metastore --> Catalog
    Metastore --> Share
    Metastore --> Recipient
    Metastore --> Provider
    Metastore --> Connection
    Catalog --> Schema
    Schema --> Table
    Schema --> View
    Schema --> Volume
    Schema --> Model
    Schema --> Functions
```
--------------------------------------------------------------------------------
/diagrams/Chapter10/unity_catalog_hierarchy.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/PacktPublishing/Data-Engineering-with-Databricks-Cookbook/fa1657c3808c0d520ea19a49f967df510b4f627d/diagrams/Chapter10/unity_catalog_hierarchy.png
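The Mermaid diagram above sketches the Unity Catalog object hierarchy (metastore → catalog → schema → tables, views, volumes, models, functions) referenced in Chapter 10. The snippet below is a minimal, hypothetical illustration of the resulting three-level namespace; it assumes a Unity Catalog–enabled Databricks notebook (with its implicit `spark` session) and CREATE CATALOG privileges, and the names `demo_catalog`, `demo_schema`, and `recipes` are placeholders, not objects defined in this repository.

```python
# Hypothetical illustration of the catalog -> schema -> table hierarchy shown
# above. All object names are placeholders; requires CREATE CATALOG privileges.
spark.sql("CREATE CATALOG IF NOT EXISTS demo_catalog")
spark.sql("CREATE SCHEMA IF NOT EXISTS demo_catalog.demo_schema")
spark.sql("""
    CREATE TABLE IF NOT EXISTS demo_catalog.demo_schema.recipes (
        id INT,
        title STRING
    )
""")

# Objects are always addressed by the three-level name: catalog.schema.object.
spark.table("demo_catalog.demo_schema.recipes").printSchema()
```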
--------------------------------------------------------------------------------
/docker-compose.yml:
--------------------------------------------------------------------------------
version: "3.6"
volumes:
  shared-workspace:
    name: "distributed-file-system"
    driver: local
    driver_opts:
      o: bind
      type: none
      device: ./
services:
  zookeeper:
    image: docker.io/bitnami/zookeeper:3.8.2
    container_name: zookeeper
    ports:
      - "2181:2181"
    volumes:
      - shared-workspace:/opt/workspace
    environment:
      - ALLOW_ANONYMOUS_LOGIN=yes
  kafka:
    image: docker.io/bitnami/kafka:3.5.1
    container_name: kafka
    ports:
      - "9092:9092"
    environment:
      - BITNAMI_DEBUG=yes
      - KAFKA_BROKER_ID=1
      - KAFKA_ENABLE_KRAFT=false
      - KAFKA_CFG_LISTENERS=PLAINTEXT://kafka:9092
      - KAFKA_CFG_ADVERTISED_LISTENERS=PLAINTEXT://kafka:9092
      - KAFKA_CFG_ZOOKEEPER_CONNECT=zookeeper:2181
      - KAFKA_CFG_INTER_BROKER_LISTENER_NAME=PLAINTEXT
      - KAFKA_CFG_LISTENER_SECURITY_PROTOCOL_MAP=PLAINTEXT:PLAINTEXT
      - ALLOW_PLAINTEXT_LISTENER=yes
    depends_on:
      - zookeeper
  jupyterlab:
    image: jupyterlab:4.0.2-spark-3.4.1
    container_name: jupyterlab
    ports:
      - 8888:8888
      - 4040:4040
    volumes:
      - shared-workspace:/opt/workspace
  spark-master:
    image: spark-master:3.4.1
    container_name: spark-master
    ports:
      - 8080:8080
      - 7077:7077
    volumes:
      - shared-workspace:/opt/workspace
  spark-worker-1:
    image: spark-worker:3.4.1
    container_name: spark-worker-1
    environment:
      - SPARK_WORKER_CORES=1
      - SPARK_WORKER_MEMORY=512m
    ports:
      - 8081:8081
    volumes:
      - shared-workspace:/opt/workspace
    depends_on:
      - spark-master
  spark-worker-2:
    image: spark-worker:3.4.1
    container_name: spark-worker-2
    environment:
      - SPARK_WORKER_CORES=1
      - SPARK_WORKER_MEMORY=512m
    ports:
      - 8082:8081
    volumes:
      - shared-workspace:/opt/workspace
    depends_on:
      - spark-master
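The compose file above brings up Zookeeper, a single Kafka broker advertised inside the network as kafka:9092, a JupyterLab container, and a two-worker Spark standalone cluster, all sharing the ./ bind mount at /opt/workspace. The following is a hypothetical connectivity check (not part of the repository) that can be run from the jupyterlab container, which installs kafka-python; the topic name smoke_test is a placeholder.

```python
# Hypothetical smoke test for the compose setup: produce and consume one JSON
# message against the broker advertised as kafka:9092. Topic name is a placeholder.
import json

from kafka import KafkaConsumer, KafkaProducer

producer = KafkaProducer(
    bootstrap_servers="kafka:9092",
    value_serializer=lambda v: json.dumps(v).encode("utf-8"),
)
producer.send("smoke_test", {"hello": "kafka"})
producer.flush()

consumer = KafkaConsumer(
    "smoke_test",
    bootstrap_servers="kafka:9092",
    auto_offset_reset="earliest",
    consumer_timeout_ms=5000,
    value_deserializer=lambda v: json.loads(v.decode("utf-8")),
)
for message in consumer:
    print(message.value)  # expect {'hello': 'kafka'}
    break
```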
--------------------------------------------------------------------------------
/docker/base/Dockerfile:
--------------------------------------------------------------------------------
ARG java_image_tag=17-jre
FROM eclipse-temurin:${java_image_tag}

# -- Layer: Image Metadata

ARG build_date
ARG delta_spark_version
ARG deltalake_version
ARG pandas_version

LABEL org.label-schema.build-date=${build_date}
LABEL org.label-schema.description="Data Engineering with Apache Spark and Delta Lake Cookbook - Cluster base image"
LABEL org.label-schema.schema-version="1.0"

# -- Layer: OS + Python + Scala

ARG shared_workspace=/opt/workspace

RUN mkdir -p ${shared_workspace}/data && \
    mkdir -p /usr/share/man/man1 && \
    apt-get update -y && \
    apt-get install -y --no-install-recommends curl python3 r-base netcat && \
    ln -s /usr/bin/python3 /usr/bin/python && \
    apt-get clean && \
    rm -rf /var/lib/apt/lists/*

RUN apt-get update -y && \
    apt-get install -y --no-install-recommends build-essential manpages-dev python3-pip python3-dev && \
    pip3 install --no-cache-dir --upgrade pip && \
    apt-get clean && \
    rm -rf /var/lib/apt/lists/*

# We are explicitly pinning the versions of the various libraries that this Docker image runs on.
RUN pip3 install --quiet --no-cache-dir \
    delta-spark==${delta_spark_version} \
    deltalake==${deltalake_version} \
    pandas==${pandas_version}

ENV SCALA_HOME="/usr/bin/scala"
ENV PATH=${PATH}:${SCALA_HOME}/bin
ENV SHARED_WORKSPACE=${shared_workspace}

# -- Runtime

VOLUME ${shared_workspace}
CMD ["bash"]
--------------------------------------------------------------------------------
/docker/jupyterlab/00-first.py:
--------------------------------------------------------------------------------
from delta import *
from pyspark.sql import SparkSession

builder = (SparkSession.builder
           .appName("data-eng-cookbook")
           .master("spark://spark-master:7077")
           .config("spark.executor.memory", "512m")
           .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension")
           .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog"))

spark = configure_spark_with_delta_pip(builder).getOrCreate()
spark.sparkContext.setLogLevel("ERROR")


get_ipython().run_line_magic('load_ext', 'sparksql_magic')
get_ipython().run_line_magic('config', 'SparkSql.limit=20')
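00-first.py builds a SparkSession against the standalone cluster (spark://spark-master:7077) with the Delta Lake extensions enabled and loads the sparksql magic. The sketch below is a hypothetical check that the resulting `spark` session can write and read a Delta table; the output path under the shared /opt/workspace volume is a placeholder.

```python
# Hypothetical end-to-end check of the Delta-enabled session from 00-first.py.
# The output path under the shared workspace volume is a placeholder.
from pyspark.sql import Row

path = "/opt/workspace/data/delta_smoke_test"

df = spark.createDataFrame([Row(id=1, name="alpha"), Row(id=2, name="beta")])
df.write.format("delta").mode("overwrite").save(path)

spark.read.format("delta").load(path).show()
```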
--------------------------------------------------------------------------------
/docker/jupyterlab/Dockerfile:
--------------------------------------------------------------------------------
FROM base

# -- Layer: Image Metadata

ARG build_date

LABEL org.label-schema.build-date=${build_date}
LABEL org.label-schema.name="Data Engineering with Apache Spark and Delta Lake Cookbook - JupyterLab Image"
LABEL org.label-schema.description="JupyterLab image"


# -- Layer: Notebooks and data

# ADD docker/jupyterlab/kafka-producer.py /

# -- Layer: JupyterLab + Python kernel for PySpark

ARG spark_version
ARG jupyterlab_version
ARG sparksql_magic_version
ARG kafka_python_version

RUN pip3 install --no-cache-dir wget==3.2 \
    pyspark==${spark_version} \
    jupyterlab==${jupyterlab_version} \
    sparksql-magic==${sparksql_magic_version} \
    kafka-python==${kafka_python_version}

EXPOSE 8888

WORKDIR ${SHARED_WORKSPACE}
# COPY docker/jupyterlab/00-first.py /root/.ipython/profile_default/startup/00-first.py
CMD jupyter lab --ip=0.0.0.0 --port=8888 --no-browser --allow-root --NotebookApp.token=;python
--------------------------------------------------------------------------------
/docker/spark-base/Dockerfile:
--------------------------------------------------------------------------------
FROM base

# -- Layer: Image Metadata
ARG build_date
ARG delta_package_version
ARG spark_xml_package_version

LABEL org.label-schema.build-date=${build_date}
LABEL org.label-schema.description="Data Engineering with Apache Spark and Delta Lake Cookbook - Spark base image"
LABEL org.label-schema.schema-version="1.0"

# -- Layer: Apache Spark
ARG spark_version
ARG hadoop_version

RUN curl https://archive.apache.org/dist/spark/spark-${spark_version}/spark-${spark_version}-bin-hadoop${hadoop_version}.tgz -o spark.tgz && \
    tar -xf spark.tgz && \
    mv spark-${spark_version}-bin-hadoop${hadoop_version} /usr/bin/ && \
    echo "alias pyspark=/usr/bin/spark-${spark_version}-bin-hadoop${hadoop_version}/bin/pyspark" >> ~/.bashrc && \
    echo "alias spark-shell=/usr/bin/spark-${spark_version}-bin-hadoop${hadoop_version}/bin/spark-shell" >> ~/.bashrc && \
    mkdir /usr/bin/spark-${spark_version}-bin-hadoop${hadoop_version}/logs && \
    rm spark.tgz

ENV SPARK_HOME /usr/bin/spark-${spark_version}-bin-hadoop${hadoop_version}
ENV SPARK_MASTER_HOST spark-master
ENV SPARK_MASTER_PORT 7077
ENV PYSPARK_PYTHON python3

# -- Runtime
WORKDIR ${SPARK_HOME}

USER root

ARG NBuser=NBuser
ARG GROUP=NBuser

RUN groupadd -r ${GROUP} && useradd -r -m -g ${GROUP} ${NBuser}

RUN chown -R "${NBuser}":"${GROUP}" /home/"${NBuser}"/ \
    && chown -R "${NBuser}":"${GROUP}" "${SPARK_HOME}" \
    && chown -R "${NBuser}":"${GROUP}" "${SHARED_WORKSPACE}"

USER ${NBuser}


RUN ${SPARK_HOME}/bin/spark-shell --packages io.delta:${delta_package_version} \
    --conf "spark.sql.extensions=io.delta.sql.DeltaSparkSessionExtension" \
    --conf "spark.sql.catalog.spark_catalog=org.apache.spark.sql.delta.catalog.DeltaCatalog" \
    && ${SPARK_HOME}/bin/spark-shell --packages com.databricks:${spark_xml_package_version} \
    && ${SPARK_HOME}/bin/spark-shell --packages org.apache.spark:spark-sql-kafka-0-10_2.12:3.4.1
--------------------------------------------------------------------------------
/docker/spark-master/Dockerfile:
--------------------------------------------------------------------------------
ARG spark_version
FROM spark-base:${spark_version}

# -- Layer: Image Metadata

ARG build_date

LABEL org.label-schema.build-date=${build_date}
LABEL org.label-schema.description="Spark master image"
LABEL org.label-schema.schema-version="1.0"

# -- Runtime

EXPOSE 8080 7077

CMD bin/spark-class org.apache.spark.deploy.master.Master >> logs/spark-master.out
--------------------------------------------------------------------------------
/docker/spark-worker/Dockerfile:
--------------------------------------------------------------------------------
ARG spark_version
FROM spark-base:${spark_version}

# -- Layer: Image Metadata

ARG build_date

LABEL org.label-schema.build-date=${build_date}
LABEL org.label-schema.description="Spark worker image"
LABEL org.label-schema.schema-version="1.0"

# -- Runtime

EXPOSE 8081

CMD bin/spark-class org.apache.spark.deploy.worker.Worker spark://${SPARK_MASTER_HOST}:${SPARK_MASTER_PORT} >> logs/spark-worker.out
--------------------------------------------------------------------------------
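The spark-base image above pre-fetches the Delta Lake, spark-xml, and Kafka connector packages so the standalone master and workers have them cached locally. As a hypothetical usage sketch (not part of the repository), reading an XML file with the spark-xml data source could look like the following; the file path and rowTag value are placeholders, and depending on how your session resolves packages, the driver may also need the connector listed in spark.jars.packages.

```python
# Hypothetical sketch of reading XML with the spark-xml data source.
# The file path and rowTag value are placeholders for your own dataset.
df = (
    spark.read.format("xml")
    .option("rowTag", "record")
    .load("/opt/workspace/data/sample.xml")
)

df.printSchema()
df.show(5, truncate=False)
```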