├── .gitignore ├── CODEOWNERS ├── README.md ├── assets ├── cover.jpeg └── cover_color.jpg ├── chapter-00-glossary-terms-for-the-book └── README.md ├── chapter-02 ├── 01-full-load │ ├── 01-full-loader-airflow-postgresql-data-exposition │ │ ├── README.md │ │ ├── assets │ │ │ ├── clear_tasks.png │ │ │ └── restart_clear.png │ │ ├── dags │ │ │ ├── devices_loader.py │ │ │ └── macros.py │ │ ├── docker │ │ │ ├── dataset │ │ │ │ ├── docker-compose.yaml │ │ │ │ └── generation_configuration.yaml │ │ │ └── postgresql │ │ │ │ ├── docker-compose.yaml │ │ │ │ └── init.sql │ │ ├── requirements.txt │ │ ├── sql │ │ │ ├── expose_new_table.sql │ │ │ └── load_file_to_device_table.sql │ │ └── start.sh │ └── 01-full-loader-spark-with-conversion │ │ ├── docker │ │ ├── docker-compose.yaml │ │ └── generation_configuration.yaml │ │ └── python │ │ ├── README.md │ │ ├── config.py │ │ ├── devices_table_reader.py │ │ ├── devices_table_reader_past_version.py │ │ ├── load_json_data.py │ │ ├── load_json_partial_data.py │ │ └── requirements.txt ├── 02-incremental-load │ ├── 01-incremental-loader-apache-airflow-apache-spark │ │ ├── README.md │ │ ├── airflow │ │ │ ├── dags │ │ │ │ ├── config.py │ │ │ │ ├── visits_incremental_loader.py │ │ │ │ └── visits_incremental_loader.yaml │ │ │ ├── requirements.txt │ │ │ └── start.sh │ │ ├── assets │ │ │ └── ch02_enable_dag.png │ │ ├── dataset │ │ │ ├── docker-compose.yaml │ │ │ └── generation_configuration_json.yaml │ │ └── incremental-spark-job │ │ │ ├── Dockerfile │ │ │ ├── requirements.txt │ │ │ └── visits_loader.py │ ├── 02-change-data-capture-debezium │ │ └── python │ │ │ ├── README.md │ │ │ ├── docker │ │ │ ├── docker-compose.yaml │ │ │ ├── generation_configuration.yaml │ │ │ ├── init.sql │ │ │ ├── register-postgresql-connector.json │ │ │ └── visits_to_insert.sql │ │ │ ├── requirements.txt │ │ │ └── visits_stream_processor.py │ └── 02-change-data-capture-delta-lake-change-data-feed │ │ ├── dataset │ │ ├── docker-compose.yaml │ │ └── 
generation_configuration.yaml │ │ └── python │ │ ├── README.md │ │ ├── events_table_streaming_reader.py │ │ └── requirements.txt ├── 03-replication │ ├── 01-passthrough-replicator-apache-spark-apache-kafka │ │ ├── README.md │ │ ├── config.py │ │ ├── dataset_reader_kafka.py │ │ ├── dataset_reader_kafka_raw.py │ │ ├── dataset_reader_raw.py │ │ ├── dataset_replicator_kafka.py │ │ ├── dataset_replicator_raw.py │ │ ├── docker │ │ │ ├── docker-compose.yaml │ │ │ └── generation_configuration_json.yaml │ │ ├── kafka_data_producer.py │ │ └── requirements.txt │ └── 02-transformation-replicator-apache-spark-delta-lake │ │ ├── README.md │ │ ├── config.py │ │ ├── dataset │ │ ├── docker-compose.yaml │ │ └── generation_configuration.yaml │ │ ├── dataset_reader_reduction.py │ │ ├── dataset_reader_transformation.py │ │ ├── dataset_replicator_raw_reduction.py │ │ ├── dataset_replicator_raw_transformation.py │ │ ├── prepare_delta_table.py │ │ └── requirements.txt ├── 04-data-compaction │ ├── 01-compactor-apache-kafka │ │ ├── README.md │ │ └── docker-compose.yaml │ └── 01-compactor-delta-lake │ │ ├── README.md │ │ ├── compact_devices_table.py │ │ ├── config.py │ │ ├── dataset │ │ ├── docker-compose.yaml │ │ └── generation_configuration.yaml │ │ ├── devices_table_reader.py │ │ ├── load_devices_data.py │ │ ├── requirements.txt │ │ └── vacuum_devices_table.py ├── 05-data-readiness │ ├── 01-readiness-marker-apache-airflow │ │ ├── README.md │ │ ├── assets │ │ │ └── flow.png │ │ ├── dags │ │ │ └── dataset_creator.py │ │ ├── requirements.txt │ │ └── start.sh │ └── 01-readiness-marker-apache-spark-success-file │ │ └── python │ │ ├── README.md │ │ ├── config.py │ │ ├── dataset │ │ ├── docker-compose.yaml │ │ └── generation_configuration.yaml │ │ ├── devices_parquet_reader.py │ │ ├── load_devices_data.py │ │ └── requirements.txt └── 06-event-driven │ └── 01-external-trigger-lambda-airflow │ ├── README.md │ ├── airflow │ ├── dags │ │ └── devices_loader.py │ └── start.sh │ ├── assets │ ├── 
enable_dag.png │ └── running_dag.png │ ├── dataset │ ├── docker-compose.yaml │ └── generation_configuration.yaml │ ├── requirements.txt │ ├── s3hook.json │ └── trigger-lambda │ └── event_handler.py ├── chapter-03 ├── 01-unprocessable-records │ ├── 01-dead-letter-null-safe-transformations │ │ ├── README.md │ │ ├── config.py │ │ ├── dataset │ │ │ ├── docker-compose.yaml │ │ │ └── generation_configuration.yaml │ │ ├── devices_dead_letter_table_reader.py │ │ ├── devices_loader.py │ │ ├── devices_table_reader.py │ │ └── requirements.txt │ └── 01-dead-letter-streaming-apache-flink-kafka │ │ ├── README.md │ │ ├── __init__.py │ │ ├── dead_letter_job.py │ │ ├── docker │ │ ├── docker-compose.yaml │ │ └── generation_configuration.yaml │ │ ├── flink-connector-base-1.17.0.jar │ │ ├── flink-connector-kafka-1.17.0.jar │ │ ├── kafka-clients-3.2.3.jar │ │ ├── kafka_sink.py │ │ ├── requirements.txt │ │ └── visit_mapper.py ├── 02-duplicated-records │ ├── 01-windowed-deduplicator-dropduplicates-spark-batch │ │ ├── README.md │ │ ├── config.py │ │ ├── dataset │ │ │ ├── docker-compose.yaml │ │ │ └── generation_configuration.yaml │ │ ├── devices_deduplicator.py │ │ ├── devices_table_reader.py │ │ └── requirements.txt │ ├── 01-windowed-deduplicator-dropduplicates-spark-streaming │ │ ├── README.md │ │ ├── config.py │ │ ├── docker │ │ │ ├── docker-compose.yaml │ │ │ └── generation_configuration.yaml │ │ ├── duplicates_checker.py │ │ ├── requirements.txt │ │ └── visits_deduplicator.py │ └── 01-windowed-deduplicator-window-sql │ │ ├── README.md │ │ ├── config.py │ │ ├── dataset │ │ ├── docker-compose.yaml │ │ └── generation_configuration.yaml │ │ ├── devices_deduplicator.py │ │ ├── devices_table_reader.py │ │ └── requirements.txt ├── 03-late-data │ ├── 01-late-data-detector-flink-side-output │ │ ├── README.md │ │ ├── __init__.py │ │ ├── docker │ │ │ ├── docker-compose.yaml │ │ │ └── generation_configuration.yaml │ │ ├── flink-connector-base-1.17.0.jar │ │ ├── flink-connector-kafka-1.17.0.jar │ 
│ ├── kafka-clients-3.2.3.jar │ │ ├── late_data_dispatcher_job.py │ │ ├── reduced_visit.py │ │ ├── requirements.txt │ │ ├── visit_late_data_processor.py │ │ └── visit_timestamp_assigner.py │ ├── 01-late-data-detector-spark-stateful-window │ │ ├── README.md │ │ ├── docker │ │ │ └── docker-compose.yaml │ │ ├── requirements.txt │ │ └── visits_per_10_minutes.py │ ├── 02-static-late-data-integrator-apache-airflow │ │ ├── README.md │ │ ├── airflow │ │ │ ├── dags │ │ │ │ └── devices_loader.py │ │ │ ├── requirements.txt │ │ │ └── start.sh │ │ └── assets │ │ │ ├── clear_dag_run.png │ │ │ └── expected_run_1.png │ ├── 03-dynamic-late-data-integrator-apache-airflow-delta-lake-concurrent-embedded │ │ ├── README.md │ │ ├── airflow │ │ │ ├── __init__.py │ │ │ ├── dags │ │ │ │ ├── devices_loader.py │ │ │ │ ├── job_late_data_detection.yaml │ │ │ │ ├── job_mark_partition_being_processed.yaml │ │ │ │ └── job_processing_and_marker_partition_as_processed.yaml │ │ │ ├── plugins │ │ │ │ ├── __init__.py │ │ │ │ └── operators │ │ │ │ │ ├── __init__.py │ │ │ │ │ └── spark_kubernetes_with_deferrable_driver_pod_operator.py │ │ │ ├── requirements.txt │ │ │ └── start.sh │ │ ├── assets │ │ │ ├── after_backfilling.png │ │ │ ├── backfilled_task.png │ │ │ ├── clear_dag_run.png │ │ │ ├── expected_run_1.png │ │ │ ├── raw │ │ │ │ ├── 1.png │ │ │ │ ├── 10.png │ │ │ │ ├── 11.png │ │ │ │ ├── 12.png │ │ │ │ ├── 2.png │ │ │ │ ├── 3.png │ │ │ │ ├── 4.png │ │ │ │ ├── 5.png │ │ │ │ ├── 6.png │ │ │ │ ├── 7.png │ │ │ │ ├── 8.png │ │ │ │ └── 9.png │ │ │ ├── run_in_progress.png │ │ │ └── simulation.gif │ │ ├── data-generator │ │ │ ├── dataset │ │ │ │ ├── docker-compose.yaml │ │ │ │ └── generation_configuration_json.yaml │ │ │ ├── load_devices_to_delta_table.py │ │ │ └── requirements.txt │ │ └── late-data-integrator │ │ │ ├── Dockerfile │ │ │ ├── build.sbt │ │ │ ├── project │ │ │ ├── build.properties │ │ │ └── plugins.sbt │ │ │ └── src │ │ │ ├── main │ │ │ ├── resources │ │ │ │ └── log4j2.properties │ │ │ └── 
scala │ │ │ │ └── com │ │ │ │ └── waitingforcode │ │ │ │ ├── Json.scala │ │ │ │ ├── PartitionsHandler.scala │ │ │ │ ├── PartitionsStateTable.scala │ │ │ │ ├── PartitionsToBackfill.scala │ │ │ │ ├── SparkSessionFactory.scala │ │ │ │ └── jobs │ │ │ │ ├── DataProcessorAndPartitionMarkerAsProcessed.scala │ │ │ │ ├── LateDataDetectionJob.scala │ │ │ │ └── PartitionStateAsBeingProcessedJob.scala │ │ │ └── test │ │ │ └── scala │ │ │ └── com │ │ │ └── waitingforcode │ │ │ ├── PartitionsHandlerTest.scala │ │ │ ├── PartitionsStateTablePartitionsToBackfillTest.scala │ │ │ ├── PartitionsStateTableWritingTest.scala │ │ │ └── SparkSessionSpec.scala │ ├── 03-dynamic-late-data-integrator-apache-airflow-delta-lake-concurrent-trigger │ │ ├── README.md │ │ ├── airflow │ │ │ ├── dags │ │ │ │ ├── devices_loader.py │ │ │ │ ├── job_late_data_detection.yaml │ │ │ │ ├── job_mark_partition_being_processed.yaml │ │ │ │ └── job_processing_and_marker_partition_as_processed.yaml │ │ │ ├── requirements.txt │ │ │ └── start.sh │ │ ├── assets │ │ │ ├── backfilling_1.png │ │ │ ├── backfilling_late_data.png │ │ │ ├── backfilling_late_data_mapped_tasks.png │ │ │ ├── clear_runs.png │ │ │ ├── dag_after_backfilling_1.png │ │ │ ├── expected_run_progress_1.png │ │ │ ├── expected_run_progress_2.png │ │ │ ├── expected_run_progress_3.png │ │ │ └── expected_run_result.png │ │ ├── data-generator │ │ │ ├── dataset │ │ │ │ ├── docker-compose.yaml │ │ │ │ └── generation_configuration_json.yaml │ │ │ ├── load_devices_to_delta_table.py │ │ │ └── requirements.txt │ │ └── late-data-integrator │ │ │ ├── Dockerfile │ │ │ ├── build.sbt │ │ │ └── src │ │ │ ├── main │ │ │ ├── resources │ │ │ │ └── log4j2.properties │ │ │ └── scala │ │ │ │ └── com │ │ │ │ └── waitingforcode │ │ │ │ ├── DataProcessingJobExecutionConfiguration.scala │ │ │ │ ├── Json.scala │ │ │ │ ├── PartitionsHandler.scala │ │ │ │ ├── PartitionsStateTable.scala │ │ │ │ ├── PartitionsToBackfill.scala │ │ │ │ ├── SparkSessionFactory.scala │ │ │ │ └── jobs │ │ 
│ │ ├── LateDataDetectionJob.scala │ │ │ │ ├── PartitionStateInProgressMarkerJob.scala │ │ │ │ └── PartitionStateProcessingCompletedJob.scala │ │ │ └── test │ │ │ └── scala │ │ │ └── com │ │ │ └── waitingforcode │ │ │ ├── PartitionsHandlerTest.scala │ │ │ ├── PartitionsStateTablePartitionsToBackfillTest.scala │ │ │ ├── PartitionsStateTableWritingTest.scala │ │ │ └── SparkSessionSpec.scala │ └── 03-dynamic-late-data-integrator-apache-airflow-delta-lake-sequential │ │ ├── README.md │ │ ├── airflow │ │ ├── dags │ │ │ ├── devices_loader.py │ │ │ └── late_data_detector_job.yaml │ │ ├── requirements.txt │ │ └── start.sh │ │ ├── assets │ │ ├── clear_dag_run.png │ │ ├── dag_status_after_late_data_integration.png │ │ └── expected_run_1.png │ │ ├── data-generator │ │ ├── dataset │ │ │ ├── ? │ │ │ │ └── .sbt │ │ │ │ │ ├── 1.0 │ │ │ │ │ └── java9-rt-ext-oracle_corporation_11_0_14_1 │ │ │ │ │ │ └── rt.jar │ │ │ │ │ └── boot │ │ │ │ │ └── sbt.boot.lock │ │ │ ├── dedp_ch03_late_data_integrator_sequential.tar │ │ │ ├── docker-compose.yaml │ │ │ └── generation_configuration_json.yaml │ │ ├── load_devices_to_delta_table.py │ │ └── requirements.txt │ │ └── late-data-integrator │ │ ├── Dockerfile │ │ ├── build.sbt │ │ └── src │ │ └── main │ │ ├── resources │ │ └── log4j2.properties │ │ └── scala │ │ └── com │ │ └── waitingforcode │ │ ├── LastProcessedVersion.scala │ │ ├── LateDataPartitionsCreationJob.scala │ │ ├── PartitionsHandler.scala │ │ ├── PartitionsToBackfill.scala │ │ └── SparkSessionFactory.scala ├── 04-filtering │ ├── 01-filter-interceptor-scala-accumulators │ │ ├── README.md │ │ ├── build.sbt │ │ ├── dataset │ │ │ ├── docker-compose.yaml │ │ │ └── generation_configuration.yaml │ │ └── src │ │ │ └── main │ │ │ ├── resources │ │ │ └── log4j2.properties │ │ │ └── scala │ │ │ └── com │ │ │ └── waitingforcode │ │ │ └── FilterWithStatsInterceptor.scala │ ├── 01-filter-interceptor-spark-accumulators │ │ ├── README.md │ │ ├── config.py │ │ ├── dataset │ │ │ ├── docker-compose.yaml 
│ │ │ └── generation_configuration.yaml │ │ ├── devices_table_creator.py │ │ ├── devices_table_reader.py │ │ └── requirements.txt │ └── 01-filter-interceptor-spark-sql │ │ ├── README.md │ │ ├── config.py │ │ ├── dataset │ │ ├── docker-compose.yaml │ │ └── generation_configuration.yaml │ │ ├── devices_table_creator.py │ │ ├── devices_table_reader.py │ │ └── requirements.txt └── 05-fault-tolerance │ ├── 01-checkpointer-apache-flink-apache-kafka │ ├── .gitignore │ ├── README.md │ ├── __init__.py │ ├── assets │ │ ├── flink_cancel_job.png │ │ └── flink_checkpoint.png │ ├── docker │ │ ├── Dockerfile_Flink │ │ ├── docker-compose.yaml │ │ └── generation_configuration.yaml │ ├── flink-connector-base-1.17.0.jar │ ├── flink-connector-kafka-1.17.0.jar │ ├── kafka-clients-3.2.3.jar │ ├── stateful_flink_consumer.py │ ├── visit.py │ ├── visit_timestamp_assigner.py │ └── visit_window_processor.py │ └── 01-checkpointer-apache-spark-apache-kafka │ ├── README.md │ ├── __init__.py │ ├── docker │ ├── docker-compose.yaml │ └── generation_configuration.yaml │ ├── requirements.txt │ └── visits_json_synchronizer.py ├── chapter-04 ├── 01-overwriting │ ├── 01-fast-metadata-cleaner-airflow-postgresql │ │ ├── README.md │ │ ├── assets │ │ │ └── clear_tasks.png │ │ ├── dags │ │ │ ├── __init__.py │ │ │ ├── config.py │ │ │ ├── idempotency_metadata_overwrite.py │ │ │ └── macros.py │ │ ├── docker │ │ │ ├── docker-compose.yaml │ │ │ ├── generation_configuration_json.yaml │ │ │ └── init.sql │ │ ├── plugins │ │ │ ├── __init__.py │ │ │ └── operators │ │ │ │ ├── __init__.py │ │ │ │ └── view_manager_operator.py │ │ ├── requirements.txt │ │ ├── sql │ │ │ ├── create_weekly_table.sql │ │ │ ├── load_visits_to_weekly_table.sql │ │ │ └── recreate_view.sql │ │ └── start.sh │ ├── 02-data-overwrite-airflow-delta-lake │ │ ├── README.md │ │ ├── assets │ │ │ ├── clear_tasks.png │ │ │ └── dag_tasks.png │ │ ├── dags │ │ │ ├── __init__.py │ │ │ ├── devices_reader.py │ │ │ └── devices_synchronizer.py │ │ ├── 
dags_functions │ │ │ ├── __init__.py │ │ │ └── device_synchronizer_functions.py │ │ ├── requirements.txt │ │ └── start.sh │ └── 02-data-overwrite-airflow-spark │ │ ├── README.md │ │ ├── airflow │ │ ├── dags │ │ │ ├── config.py │ │ │ ├── visits_loader.py │ │ │ └── visits_loader.yaml │ │ ├── requirements.txt │ │ └── start.sh │ │ ├── assets │ │ └── clear_tasks.png │ │ ├── dataset │ │ ├── docker-compose.yaml │ │ └── generation_configuration_json.yaml │ │ └── visits-loader-job │ │ ├── Dockerfile │ │ ├── requirements.txt │ │ └── visits_loader.py ├── 02-updates │ ├── 01-merger-airflow-postgresql │ │ ├── README.md │ │ ├── assets │ │ │ ├── clear_task_metadata.png │ │ │ └── clear_tasks.png │ │ ├── dags │ │ │ ├── __init__.py │ │ │ └── idempotency_merge.py │ │ ├── docker │ │ │ ├── docker-compose.yaml │ │ │ └── init.sql │ │ ├── requirements.txt │ │ ├── sql │ │ │ └── load_new_devices.sql │ │ └── start.sh │ ├── 01-merger-postgresql-soft-deletes │ │ ├── README.md │ │ └── docker │ │ │ ├── docker-compose.yaml │ │ │ └── init.sql │ ├── 02-stateful-merger-apache-airflow-delta-lake │ │ ├── README.md │ │ ├── assets │ │ │ ├── clear_tasks.png │ │ │ └── dag_tasks.png │ │ ├── dags │ │ │ ├── __init__.py │ │ │ ├── devices_reader.py │ │ │ └── devices_synchronizer.py │ │ ├── dags_functions │ │ │ ├── __init__.py │ │ │ └── device_synchronizer_functions.py │ │ ├── requirements.txt │ │ └── start.sh │ └── 02-stateful-merger-apache-airflow-postgresql │ │ ├── README.md │ │ ├── assets │ │ ├── after_run_1.png │ │ ├── after_run_2.png │ │ └── clear_retry.png │ │ ├── dags │ │ ├── __init__.py │ │ └── devices_synchronizer.py │ │ ├── docker │ │ ├── docker-compose.yaml │ │ └── init.sql │ │ ├── requirements.txt │ │ ├── sql │ │ ├── clean_table_before_restore.sql │ │ ├── define_merge_mode.sql │ │ ├── load_data_to_historical_table.sql │ │ ├── merge_new_devices.sql │ │ └── restore_table.sql │ │ └── start.sh ├── 03-database │ ├── 01-keyed-idempotency-airflow-files │ │ ├── README.md │ │ ├── assets │ │ │ ├── 
clear_dag.png │ │ │ └── dag_tasks.png │ │ ├── dags │ │ │ ├── __init__.py │ │ │ └── devices_synchronizer.py │ │ ├── requirements.txt │ │ └── start.sh │ ├── 01-keyed-idempotency-spark-kafka-scylladb │ │ ├── README.md │ │ ├── docker │ │ │ ├── docker-compose.yaml │ │ │ └── init.cql │ │ ├── requirements.txt │ │ ├── scylla_db_writer.py │ │ ├── sessions_generator_job.py │ │ └── visits_mapper.py │ ├── 02-transactional-writer-airflow-postgresql │ │ ├── README.md │ │ ├── assets │ │ │ ├── clear_task_metadata.png │ │ │ └── clear_tasks.png │ │ ├── dags │ │ │ ├── __init__.py │ │ │ └── devices_loader.py │ │ ├── docker │ │ │ ├── docker-compose.yaml │ │ │ └── init.sql │ │ ├── requirements.txt │ │ ├── sql │ │ │ └── load_new_devices.sql │ │ └── start.sh │ └── 02-transactional-writer-flink-kafka │ │ ├── README.md │ │ ├── __init__.py │ │ ├── docker │ │ └── docker-compose.yaml │ │ ├── flink-connector-base-1.17.0.jar │ │ ├── flink-connector-kafka-1.17.0.jar │ │ ├── kafka-clients-3.2.3.jar │ │ ├── reduced_visit.py │ │ ├── requirements.txt │ │ ├── visit_timestamp_assigner.py │ │ └── visits_reducer_job.py └── 04-immutable-dataset │ └── 01-proxy-postgresql-view │ ├── README.md │ ├── assets │ ├── clear_task_metadata.png │ └── clear_tasks.png │ ├── dags │ ├── __init__.py │ ├── config.py │ ├── devices_loader.py │ └── macros.py │ ├── docker │ ├── docker-compose.yaml │ ├── generation_configuration_json.yaml │ └── init.sql │ ├── plugins │ ├── __init__.py │ └── operators │ │ ├── __init__.py │ │ └── view_manager_operator.py │ ├── requirements.txt │ ├── sql │ ├── load_visits_to_weekly_table.sql │ └── refresh_view.sql │ └── start.sh ├── chapter-05 ├── 01-data-enrichment │ ├── 01-static-joiner-api │ │ ├── README.md │ │ ├── docker │ │ │ ├── docker-compose.yaml │ │ │ └── generation_configuration.yaml │ │ ├── geoloc_api.py │ │ ├── kafka_writer_with_enricher.py │ │ ├── requirements.txt │ │ └── visits_enricher_job.py │ ├── 01-static-joiner-data-at-rest │ │ ├── README.md │ │ └── docker │ │ │ ├── 
docker-compose.yaml │ │ │ ├── generation_configuration.yaml │ │ │ └── init.sql │ ├── 01-static-joiner-data-in-motion │ │ ├── README.md │ │ ├── config.py │ │ ├── devices_table_creator.py │ │ ├── docker │ │ │ ├── docker-compose.yaml │ │ │ └── generation_configuration.yaml │ │ ├── requirements.txt │ │ └── visits_enricher.py │ ├── 02-dynamic-joiner-flink │ │ ├── README.md │ │ ├── assets │ │ │ └── flink_watermark_local_time.png │ │ ├── docker │ │ │ └── docker-compose.yaml │ │ ├── pom.xml │ │ └── src │ │ │ └── main │ │ │ ├── java │ │ │ └── com │ │ │ │ └── waitingforcode │ │ │ │ ├── SchemaBuilders.java │ │ │ │ └── VisitsWithAdsJoinJob.java │ │ │ └── resources │ │ │ └── log4j2.properties │ └── 02-dynamic-joiner-spark │ │ ├── README.md │ │ ├── docker │ │ └── docker-compose.yaml │ │ ├── requirements.txt │ │ └── visits_ads_enrichment_job.py ├── 02-data-decoration │ ├── 01-wrapper-spark │ │ ├── README.md │ │ ├── docker │ │ │ ├── docker-compose.yaml │ │ │ └── generation_configuration.yaml │ │ ├── requirements.txt │ │ └── wrapper_decorator_job.py │ ├── 01-wrapper-sql │ │ ├── README.md │ │ ├── config.py │ │ ├── dataset │ │ │ ├── docker-compose.yaml │ │ │ └── generation_configuration_json.yaml │ │ ├── requirements.txt │ │ ├── visits_decorator.py │ │ └── visits_table_reader.py │ ├── 02-metadata-airflow-postgresql │ │ ├── README.md │ │ ├── dags │ │ │ ├── __init__.py │ │ │ ├── config.py │ │ │ ├── idempotency_metadata_overwrite.py │ │ │ └── macros.py │ │ ├── docker │ │ │ ├── docker-compose.yaml │ │ │ ├── generation_configuration_json.yaml │ │ │ └── init.sql │ │ ├── plugins │ │ │ ├── __init__.py │ │ │ └── operators │ │ │ │ ├── __init__.py │ │ │ │ └── view_manager_operator.py │ │ ├── requirements.txt │ │ ├── sql │ │ │ ├── create_weekly_table.sql │ │ │ ├── load_visits_to_weekly_table.sql │ │ │ └── recreate_view.sql │ │ └── start.sh │ └── 02-metadata-spark-kafka │ │ ├── README.md │ │ ├── config.py │ │ ├── docker │ │ ├── docker-compose.yaml │ │ └── generation_configuration.yaml │ │ ├── 
metadata_decorator_job.py │ │ └── requirements.txt ├── 03-data-combination │ ├── 01-distributed-json-postgresql │ │ ├── README.md │ │ ├── dataset │ │ │ ├── docker-compose.yaml │ │ │ ├── generation_configuration.yaml │ │ │ └── init.sql │ │ ├── distributed_combiner.py │ │ └── requirements.txt │ ├── 02-local-buckets-spark │ │ ├── README.md │ │ ├── bucket_preparators.py │ │ ├── config.py │ │ ├── dataset │ │ │ ├── docker-compose.yaml │ │ │ └── generation_configuration.yaml │ │ ├── local_combiner.py │ │ └── requirements.txt │ ├── 02-local-flink-kafka-hint │ │ ├── docker │ │ │ ├── docker-compose.yaml │ │ │ └── generation_configuration.yaml │ │ └── src │ │ │ └── main │ │ │ └── java │ │ │ └── com │ │ │ └── becomedataengineer │ │ │ └── RecentVisitsInfoPreparatorJob.java │ ├── 02-local-kafka-kafka-streams │ │ ├── README.md │ │ ├── docker │ │ │ ├── docker-compose.yaml │ │ │ └── generation_configuration.yaml │ │ ├── pom.xml │ │ └── src │ │ │ └── main │ │ │ ├── java │ │ │ └── com │ │ │ │ └── waitingforcode │ │ │ │ ├── AggregatedVisits.java │ │ │ │ ├── AggregatedVisitsAggregator.java │ │ │ │ ├── InputVisit.java │ │ │ │ ├── JsonMapper.java │ │ │ │ ├── JsonSerializer.java │ │ │ │ └── VisitsLocalAggregatorJob.java │ │ │ └── resources │ │ │ └── log4j.properties │ └── 02-local-kafka-spark │ │ ├── README.md │ │ ├── docker │ │ ├── docker-compose.yaml │ │ └── generation_configuration.yaml │ │ ├── kafka_writer.py │ │ ├── recent_visit_info_preparator_job.py │ │ └── requirements.txt ├── 04-sessionization │ ├── 01-incremental-airflow-windows-postgresql │ │ ├── README.md │ │ ├── dags │ │ │ ├── __init__.py │ │ │ └── sessions_generator.py │ │ ├── docker │ │ │ ├── docker-compose.yaml │ │ │ ├── generation_configuration.yaml │ │ │ └── init.sql │ │ ├── requirements.txt │ │ ├── sql │ │ │ ├── clean_previous_run_generated_sessions.sql │ │ │ ├── clean_previous_run_pending_sessions.sql │ │ │ └── generate_sessions.sql │ │ └── start.sh │ ├── 02-stateful-kafka-flink │ │ ├── README.md │ │ ├── __init__.py │ 
│ ├── docker │ │ │ ├── docker-compose.yaml │ │ │ └── generation_configuration.yaml │ │ ├── flink-connector-base-1.18.0.jar │ │ ├── flink-connector-kafka-3.1.0-1.18.jar │ │ ├── kafka-clients-3.7.0.jar │ │ ├── requirements.txt │ │ ├── visit_timestamp_assigner.py │ │ ├── visits_processor.py │ │ └── visits_sessionization_job.py │ └── 02-stateful-kafka-spark │ │ ├── README.md │ │ ├── docker │ │ ├── docker-compose.yaml │ │ └── generation_configuration.yaml │ │ ├── requirements.txt │ │ ├── sessions_generator_job.py │ │ └── sessions_mapper.py └── 05-data-ordering │ ├── 01-bin-packer-kinesis-spark │ ├── README.md │ ├── bin_pack_orderer_job.py │ ├── bin_pack_orderer_writer_to_kinesis.py │ ├── docker │ │ ├── docker-compose.yaml │ │ └── generation_configuration.yaml │ ├── kinesis_reader.py │ └── requirements.txt │ └── 02-fifo-kafka-spark │ ├── README.md │ ├── config.py │ ├── docker │ ├── docker-compose.yaml │ └── generation_configuration.yaml │ ├── fifo_orderer_bulk_idempotent_job.py │ ├── fifo_orderer_bulk_send_job.py │ ├── fifo_orderer_individual_send_job.py │ ├── kafka_writer_bulk_send.py │ ├── kafka_writer_bulk_send_idempotent.py │ ├── kafka_writer_individual_send.py │ └── requirements.txt ├── chapter-06 ├── 01-sequence │ ├── 01-local-sequencer-airflow │ │ ├── README.md │ │ ├── assets │ │ │ ├── devices_loader_isolated.png │ │ │ └── devices_loader_not_isolated.png │ │ ├── dags │ │ │ ├── devices_loader.py │ │ │ ├── devices_loader_not_isolated.py │ │ │ └── macros.py │ │ ├── docker │ │ │ ├── dataset │ │ │ │ ├── docker-compose.yaml │ │ │ │ └── generation_configuration.yaml │ │ │ └── postgresql │ │ │ │ ├── docker-compose.yaml │ │ │ │ └── init.sql │ │ ├── requirements.txt │ │ ├── sql │ │ │ ├── create_final_table.sql │ │ │ ├── expose_new_table.sql │ │ │ └── load_file_to_device_table.sql │ │ └── start.sh │ ├── 01-local-sequencer-spark │ │ ├── README.md │ │ ├── config.py │ │ ├── dataset │ │ │ ├── docker-compose.yaml │ │ │ └── generation_configuration.yaml │ │ ├── 
devices_table_reader.py │ │ ├── load_devices_data_with_sequencer.py │ │ └── requirements.txt │ ├── 02-isolated-sequencer-dataset-dependency │ │ ├── README.md │ │ ├── dags │ │ │ ├── devices_aggregator.py │ │ │ ├── devices_loader.py │ │ │ └── macros.py │ │ ├── docker │ │ │ ├── dataset │ │ │ │ ├── docker-compose.yaml │ │ │ │ └── generation_configuration.yaml │ │ │ └── postgresql │ │ │ │ ├── docker-compose.yaml │ │ │ │ └── init.sql │ │ ├── requirements.txt │ │ ├── sql │ │ │ ├── load_file_to_device_table.sql │ │ │ └── refresh_aggregates.sql │ │ └── start.sh │ └── 02-isolated-sequencer-external-trigger │ │ ├── README.md │ │ ├── assets │ │ ├── backfill_recursive.png │ │ └── clean_task_recursive.png │ │ ├── dags │ │ ├── devices_aggregator.py │ │ ├── devices_loader.py │ │ └── macros.py │ │ ├── docker │ │ ├── dataset │ │ │ ├── docker-compose.yaml │ │ │ └── generation_configuration.yaml │ │ └── postgresql │ │ │ ├── docker-compose.yaml │ │ │ └── init.sql │ │ ├── requirements.txt │ │ ├── sql │ │ ├── load_file_to_device_table.sql │ │ └── refresh_aggregates.sql │ │ └── start.sh ├── 02-fan-in │ ├── 01-aligned-fan-in-airflow-aggregates │ │ ├── README.md │ │ ├── dags │ │ │ ├── __init__.py │ │ │ └── visits_cube_generator.py │ │ ├── docker │ │ │ ├── docker-compose.yaml │ │ │ ├── generation_configuration.yaml │ │ │ └── init.sql │ │ ├── requirements.txt │ │ ├── sql │ │ │ ├── clear_context.sql │ │ │ ├── generate_visits_cube.sql │ │ │ └── load_visits.sql │ │ └── start.sh │ ├── 01-aligned-fan-in-spark-union │ │ ├── README.md │ │ ├── aligned_fan_in_devices_union_job.py │ │ ├── config.py │ │ ├── dataset │ │ │ ├── docker-compose.yaml │ │ │ └── generation_configuration.yaml │ │ ├── devices_table_reader.py │ │ └── requirements.txt │ ├── 02-unaligned-fan-in-airflow-aggregates-partial │ │ ├── README.md │ │ ├── dags │ │ │ ├── __init__.py │ │ │ └── visits_cube_generator.py │ │ ├── docker │ │ │ ├── docker-compose.yaml │ │ │ ├── generation_configuration.yaml │ │ │ └── init.sql │ │ ├── 
requirements.txt │ │ ├── sql │ │ │ ├── clear_context.sql │ │ │ ├── generate_visits_cube.sql │ │ │ └── load_visits.sql │ │ └── start.sh │ └── 02-unaligned-fan-in-aws-step-functions │ │ ├── README.md │ │ ├── lambda-partitions-detector │ │ └── detect_partitions.py │ │ ├── lambda-partitions-processor │ │ └── process_partition.py │ │ ├── lambda-table-creator │ │ └── create_table_from_processed_partitions.py │ │ ├── process_partitions_state_machine.json │ │ └── requirements.txt ├── 03-fan-out │ ├── 01-parallel-split-airflow-jobs │ │ ├── README.md │ │ ├── assets │ │ │ └── parallel_split_graph.png │ │ ├── dags │ │ │ ├── __init__.py │ │ │ ├── visits_converter.py │ │ │ └── visits_converter.yaml │ │ ├── docker │ │ │ ├── docker-compose.yaml │ │ │ ├── generation_configuration.yaml │ │ │ └── init.sql │ │ ├── requirements.txt │ │ ├── start.sh │ │ └── visits-loader-job │ │ │ ├── Dockerfile │ │ │ ├── requirements.txt │ │ │ └── visits_loader.py │ ├── 01-parallel-split-spark-foreachbatch │ │ ├── README.md │ │ ├── config.py │ │ ├── dataset │ │ │ ├── docker-compose.yaml │ │ │ └── generation_configuration.yaml │ │ ├── devices_table_reader.py │ │ ├── load_devices_data.py │ │ └── requirements.txt │ ├── 02-exclusive-choice-airflow-migration │ │ ├── README.md │ │ ├── assets │ │ │ ├── exclusive_choice_migration.png │ │ │ └── exclusive_choice_migration_colors.png │ │ ├── dags │ │ │ ├── __init__.py │ │ │ ├── visits_converter.py │ │ │ └── visits_converter.yaml │ │ ├── docker │ │ │ ├── docker-compose.yaml │ │ │ ├── generation_configuration.yaml │ │ │ └── init.sql │ │ ├── requirements.txt │ │ ├── start.sh │ │ └── visits-loader-job │ │ │ ├── Dockerfile │ │ │ ├── requirements.txt │ │ │ └── visits_loader.py │ └── 02-exclusive-choice-spark-dataset-criteria │ │ ├── README.md │ │ ├── config.py │ │ ├── dataset │ │ ├── docker-compose.yaml │ │ └── generation_configuration.yaml │ │ ├── exlusive_choice_dataset_attribute.py │ │ ├── exlusive_choice_input_argument_job.py │ │ ├── output_generation_factory.py │ 
│ └── requirements.txt └── 04-orchestration │ ├── 01-singler-runner-airflow-visits-trends │ ├── README.md │ ├── dags │ │ ├── __init__.py │ │ └── visits_trends_generator.py │ ├── docker │ │ ├── docker-compose.yaml │ │ ├── generation_configuration.yaml │ │ └── init.sql │ ├── requirements.txt │ ├── sql │ │ ├── generate_trends.sql │ │ └── load_visits.sql │ └── start.sh │ └── 02-concurrent-runner-airflow-ingestion │ ├── README.md │ ├── assets │ └── concurrent_runs.png │ ├── dags │ └── devices_loader.py │ ├── dataset │ ├── docker-compose.yaml │ └── generation_configuration.yaml │ ├── requirements.txt │ └── start.sh ├── chapter-07 ├── 01-personal-data-removal │ ├── 01-vertical-partitioner-kafka-spark-delta │ │ ├── README.md │ │ ├── config.py │ │ ├── dataset │ │ │ ├── docker-compose.yaml │ │ │ └── generation_configuration.yaml │ │ ├── requirements.txt │ │ ├── users_delta_table_creator.py │ │ ├── users_kafka_to_delta_converter.py │ │ ├── users_table_cleaner.py │ │ ├── users_table_reader.py │ │ └── vertical_partitioner_kafka.py │ ├── 02-in-place-overwriter-delta-lake │ │ ├── README.md │ │ ├── config.py │ │ ├── dataset │ │ │ ├── docker-compose.yaml │ │ │ └── generation_configuration.yaml │ │ ├── requirements.txt │ │ ├── user_cleaner.py │ │ ├── visits_delta_table_creator.py │ │ └── visits_table_reader.py │ ├── 02-in-place-overwriter-kafka-compaction │ │ └── users_cleaner.py │ └── 02-in-place-overwriter-spark-json │ │ ├── README.md │ │ ├── bootstrap_output_table.py │ │ ├── config.py │ │ ├── dataset │ │ ├── docker-compose.yaml │ │ └── generation_configuration.yaml │ │ ├── requirements.txt │ │ ├── user_cleaner.py │ │ └── visits_table_reader.py ├── 02-access-control │ ├── 01-fine-grained-accessor-postgresql-columns │ │ ├── README.md │ │ └── docker │ │ │ ├── docker-compose.yaml │ │ │ ├── generation_configuration.yaml │ │ │ └── init.sql │ ├── 01-fine-grained-accessor-postgresql-rows-view │ │ ├── README.md │ │ └── docker │ │ │ ├── docker-compose.yaml │ │ │ └── init.sql │ └── 
01-fine-grained-accessor-postgresql-rows │ │ ├── README.md │ │ └── docker │ │ ├── docker-compose.yaml │ │ └── init.sql ├── 03-data-protection │ ├── 01-encryptor-s3 │ │ ├── ec2.tf │ │ ├── encrypted_file_on_s3.txt │ │ ├── iam.tf │ │ └── kms.tf │ ├── 02-anonymizer-apache-spark │ │ ├── README.md │ │ ├── config.py │ │ ├── dataset_creator.py │ │ ├── requirements.txt │ │ └── users_anonymizer.py │ └── 03-pseudo-anonymizer-apache-spark │ │ ├── README.md │ │ ├── config.py │ │ ├── dataset_creator.py │ │ ├── requirements.txt │ │ └── users_pseudo_anonymizer.py └── 04-connectivity │ ├── 01-secrets-pointer-spark-postgresql │ ├── README.md │ ├── dataset │ │ ├── docker-compose.yaml │ │ ├── generation_configuration.yaml │ │ └── init.sql │ ├── devices_json_converter.py │ └── requirements.txt │ └── 02-secretless-connector-apache-spark-postgresql │ ├── README.md │ ├── dataset │ ├── certs │ │ ├── ssl-cert-snakeoil.key │ │ └── ssl-cert-snakeoil.pem │ ├── docker-compose.yaml │ ├── generation_configuration.yaml │ └── init.sql │ ├── devices_json_converter.py │ └── requirements.txt ├── chapter-08 ├── 01-partitioning │ ├── 01-horizontal-partitioner-apache-spark │ │ ├── README.md │ │ ├── config.py │ │ ├── reader_delta_lake.py │ │ ├── reader_json.py │ │ ├── records_partitioner_delta_lake_job.py │ │ ├── records_partitioner_json_job.py │ │ └── requirements.txt │ ├── 01-horizontal-partitioner-custom-apache-kafka │ │ └── custom-kafka-partitioner │ │ │ ├── .gitignore │ │ │ ├── README.md │ │ │ ├── docker │ │ │ └── docker-compose.yaml │ │ │ ├── pom.xml │ │ │ └── src │ │ │ └── main │ │ │ └── java │ │ │ └── com │ │ │ └── waitingforcode │ │ │ ├── DataProducerWithRangePartitioner.java │ │ │ └── RangePartitioner.java │ ├── 01-horizontal-partitioner-kafka │ │ ├── README.md │ │ ├── docker │ │ │ └── docker-compose.yaml │ │ ├── records_reader_job.py │ │ ├── records_writer_job.py │ │ └── requirements.txt │ ├── 01-horizontal-partitioner-postgresql │ │ ├── README.md │ │ └── dataset │ │ │ ├── docker-compose.yaml │ 
│ │ ├── generation_configuration_1.yaml │ │ │ ├── generation_configuration_2.yaml │ │ │ ├── generation_configuration_3.yaml │ │ │ └── init.sql │ ├── 02-vertical-partitioner-apache-spark │ │ ├── README.md │ │ ├── config.py │ │ ├── dataset │ │ │ ├── docker-compose.yaml │ │ │ └── generation_configuration.yaml │ │ ├── partitioned_tables_reader.py │ │ ├── requirements.txt │ │ └── vertical_partitioner.py │ └── 02-vertical-partitioner-postgresql │ │ ├── README.md │ │ └── dataset │ │ ├── docker-compose.yaml │ │ ├── generation_configuration.yaml │ │ └── init.sql ├── 02-records-organization │ ├── 01-bucket-apache-spark │ │ ├── README.md │ │ ├── bucket_appender.py │ │ ├── bucket_preparators.py │ │ ├── config.py │ │ ├── dataset │ │ │ ├── docker-compose.yaml │ │ │ └── generation_configuration.yaml │ │ ├── local_combiner.py │ │ └── requirements.txt │ └── 02-sorter-delta-lake │ │ ├── README.md │ │ ├── assets │ │ ├── flat_table.png │ │ └── z_ordered_table.png │ │ ├── config.py │ │ ├── create_sorted_table.py │ │ ├── dataset │ │ ├── docker-compose.yaml │ │ └── generation_configuration.yaml │ │ ├── load_flat_table.py │ │ ├── load_sorted_table.py │ │ └── requirements.txt ├── 03-read-performance-optimization │ ├── 01-metadata-enhancer-apache-spark-apache-parquet │ │ ├── README.md │ │ ├── assets │ │ │ ├── json_read.png │ │ │ └── parquet_read.png │ │ ├── config.py │ │ ├── dataset │ │ │ ├── docker-compose.yaml │ │ │ └── generation_configuration.yaml │ │ ├── read_datasets.py │ │ ├── requirements.txt │ │ └── write_datasets.py │ ├── 01-metadata-enhancer-delta-lake │ │ ├── README.md │ │ ├── assets │ │ │ ├── delta_1.png │ │ │ ├── delta_2.png │ │ │ ├── delta_filter_1.png │ │ │ ├── delta_filter_2.png │ │ │ ├── json.png │ │ │ └── json_filter.png │ │ ├── config.py │ │ ├── dataset │ │ │ ├── docker-compose.yaml │ │ │ └── generation_configuration.yaml │ │ ├── requirements.txt │ │ ├── tables_creator.py │ │ ├── tables_reader.py │ │ └── tables_reader_null_type.py │ ├── 01-metadata-enhancer-postgresql │ 
│ ├── README.md │ │ └── docker │ │ │ ├── docker-compose.yaml │ │ │ ├── generation_configuration.yaml │ │ │ └── init.sql │ ├── 02-dataset-materializer-incremental-table-postgresql │ │ ├── README.md │ │ └── docker │ │ │ ├── docker-compose.yaml │ │ │ └── init.sql │ ├── 02-dataset-materializer-materialized-view-postgresql │ │ ├── README.md │ │ └── dataset │ │ │ ├── docker-compose.yaml │ │ │ ├── generation_configuration.yaml │ │ │ └── init.sql │ └── 03-manifest-delta-lake │ │ ├── README.md │ │ ├── config.py │ │ ├── create_devices_table.py │ │ ├── create_devices_table_manifest.py │ │ ├── dataset │ │ ├── docker-compose.yaml │ │ └── generation_configuration.yaml │ │ └── requirements.txt └── 04-data-representation │ ├── 01-normalizer-apache-spark │ ├── config.py │ ├── dataset │ │ ├── docker-compose.yaml │ │ └── generation_configuration.yaml │ ├── normalized_schema_tables_creator.py │ ├── normalized_schema_tables_reader.py │ ├── one_big_table_tables_creator.py │ ├── one_big_table_tables_reader.py │ ├── requirements.txt │ ├── snowflake_schema_tables_creator.py │ ├── snowflake_schema_tables_reader.py │ ├── star_schema_tables_creator.py │ └── star_schema_tables_reader.py │ ├── 01-normalizer-normal-forms │ ├── README.md │ ├── config.py │ ├── dataset │ │ ├── docker-compose.yaml │ │ └── generation_configuration.yaml │ ├── normalized_schema_tables_creator.py │ ├── normalized_schema_tables_reader.py │ └── requirements.txt │ ├── 01-normalizer-snowflake-schema │ ├── README.md │ ├── config.py │ ├── dataset │ │ ├── docker-compose.yaml │ │ └── generation_configuration.yaml │ ├── requirements.txt │ ├── snowflake_schema_tables_creator.py │ └── snowflake_schema_tables_reader.py │ ├── 02-denormalizer-one-big-table │ ├── README.md │ ├── config.py │ ├── dataset │ │ ├── docker-compose.yaml │ │ └── generation_configuration.yaml │ ├── one_big_table_tables_creator.py │ ├── one_big_table_tables_reader.py │ ├── prepare_base_tables.py │ └── requirements.txt │ └── 02-denormalizer-star-schema │ ├── 
README.md │ ├── config.py │ ├── dataset │ ├── docker-compose.yaml │ └── generation_configuration.yaml │ ├── prepare_base_tables.py │ ├── requirements.txt │ ├── star_schema_tables_creator.py │ └── star_schema_tables_reader.py ├── chapter-09 ├── 01-quality-enforcement │ ├── 01-audit-write-audit-publish-apache-airflow-postgresql │ │ ├── README.md │ │ ├── assets │ │ │ └── airflow_status.png │ │ ├── dags │ │ │ ├── __init__.py │ │ │ ├── lib │ │ │ │ ├── __init__.py │ │ │ │ ├── audit_functions.py │ │ │ │ └── config.py │ │ │ └── visits_synchronizer.py │ │ ├── dataset │ │ │ ├── docker-compose.yaml │ │ │ └── generation_configuration.yaml │ │ ├── docker │ │ │ ├── docker-compose.yaml │ │ │ └── init.sql │ │ ├── requirements.txt │ │ ├── sql │ │ │ └── load_file_to_visits_table.sql │ │ └── start.sh │ ├── 01-audit-write-audit-publish-apache-kafka-apache-flink-delta-lake │ │ ├── README.md │ │ ├── __init__.py │ │ ├── docker │ │ │ ├── docker-compose.yaml │ │ │ └── generation_configuration.yaml │ │ ├── flink-connector-base-1.18.0.jar │ │ ├── flink-connector-kafka-3.1.0-1.18.jar │ │ ├── kafka-clients-3.7.0.jar │ │ ├── kafka_sink.py │ │ ├── requirements.txt │ │ ├── visit_mapper.py │ │ └── visits_processor_job.py │ ├── 01-audit-write-audit-publish-apache-kafka-apache-spark-delta-lake-staging │ │ ├── README.md │ │ ├── config.py │ │ ├── docker │ │ │ ├── docker-compose.yaml │ │ │ └── generation_configuration.yaml │ │ ├── requirements.txt │ │ ├── staging_table_auditor_job.py │ │ ├── tables_creator.py │ │ ├── tables_reader.py │ │ └── visits_writer_job.py │ ├── 01-audit-write-audit-publish-apache-kafka-apache-spark-delta-lake │ │ ├── README.md │ │ ├── config.py │ │ ├── docker │ │ │ ├── docker-compose.yaml │ │ │ └── generation_configuration.yaml │ │ ├── requirements.txt │ │ ├── tables_creator.py │ │ ├── tables_reader.py │ │ └── visits_writer_job.py │ ├── 02-constraints-apache-kafka-protobuf │ │ ├── README.md │ │ ├── buf.gen.yaml │ │ ├── buf.lock │ │ ├── buf.yaml │ │ ├── definitons │ │ │ ├── 
invalid_visit.proto │ │ │ └── visit.proto │ │ ├── docker │ │ │ └── docker-compose.yaml │ │ ├── protobuf_output │ │ │ └── python │ │ │ │ ├── buf │ │ │ │ └── validate │ │ │ │ │ ├── expression_pb2.py │ │ │ │ │ ├── priv │ │ │ │ │ └── private_pb2.py │ │ │ │ │ └── validate_pb2.py │ │ │ │ └── definitons │ │ │ │ ├── invalid_visit_pb2.py │ │ │ │ └── visit_pb2.py │ │ ├── requirements.txt │ │ ├── visits_generator.py │ │ └── visits_reader.py │ └── 02-constraints-delta-lake │ │ ├── README.md │ │ ├── requirements.txt │ │ ├── table_with_constraints.py │ │ └── table_without_constraints.py ├── 02-schema-consistency │ ├── 01-schema-enforcer-apache-avro │ │ ├── README.md │ │ ├── pom.xml │ │ └── src │ │ │ └── main │ │ │ └── java │ │ │ └── com │ │ │ └── waitingforcode │ │ │ ├── InvalidSchemaCompatibility.java │ │ │ └── ValidSchemaCompatibility.java │ ├── 01-schema-enforcer-apache-kafka │ │ ├── README.md │ │ ├── assets │ │ │ ├── add_new_schema.png │ │ │ ├── compatibility_click.png │ │ │ └── compatibility_forward.png │ │ ├── docker │ │ │ └── docker-compose.yaml │ │ ├── producer_factory.py │ │ ├── requirements.txt │ │ ├── schemas │ │ │ ├── v1_visit.avsc │ │ │ └── v2_visit_without_visit_id.avsc │ │ ├── topic_visits_flexible_consumer.py │ │ ├── topic_visits_flexible_v1_producer.py │ │ ├── topic_visits_flexible_v2_producer.py │ │ ├── topic_visits_forward_consumer.py │ │ ├── topic_visits_forward_v1_producer.py │ │ └── topic_visits_forward_v2_producer.py │ ├── 01-schema-enforcer-delta-lake │ │ ├── README.md │ │ ├── requirements.txt │ │ ├── table_with_schema_enforcement.py │ │ ├── table_with_schema_enforcement_casted_types.py │ │ └── table_with_schema_enforcement_types.py │ ├── 01-schema-enforcer-postgresql │ │ ├── README.md │ │ └── docker │ │ │ ├── docker-compose.yaml │ │ │ └── init.sql │ ├── 02-schema-migrator-apache-spark-delta-lake │ │ ├── README.md │ │ ├── create_table.py │ │ ├── processing_restart_streaming_after_rename.py │ │ ├── processing_start_streaming_and_table_rename.py │ │ ├── 
requirements.txt │ │ ├── restart_streaming_after_rename.py │ │ └── start_streaming_and_table_rename.py │ ├── 02-schema-migrator-postgresql │ │ ├── README.md │ │ └── docker │ │ │ ├── docker-compose.yaml │ │ │ └── init.sql │ └── 02-schema-migrator-protobuf-apache-spark │ │ ├── README.md │ │ ├── buf.gen.yaml │ │ ├── buf.lock │ │ ├── buf.yaml │ │ ├── dataframe_printer.py │ │ ├── definitons │ │ ├── visit.proto │ │ ├── visit_v2.proto │ │ └── visit_v3.proto │ │ ├── docker │ │ └── docker-compose.yaml │ │ ├── protobuf_output │ │ ├── python │ │ │ └── definitons │ │ │ │ ├── visit_pb2.py │ │ │ │ ├── visit_v2_pb2.py │ │ │ │ └── visit_v3_pb2.py │ │ └── visit.bin │ │ ├── requirements.txt │ │ ├── visits_generator.py │ │ ├── visits_generator_v2.py │ │ ├── visits_generator_v3.py │ │ ├── visits_reader.py │ │ └── visits_reader_v2.py └── 03-quality-observation │ ├── 01-offline-observer-airflow-postgresql │ ├── README.md │ ├── assets │ │ ├── alerts_1.png │ │ ├── alerts_2.png │ │ ├── dash_errors_all_1.png │ │ ├── dash_errors_all_zoom.png │ │ ├── dash_errors_distribution.png │ │ ├── dash_lag.png │ │ ├── postgresql_check.png │ │ └── postgresql_config.png │ ├── dags │ │ ├── __init__.py │ │ ├── offline_observer.py │ │ └── passthrough_visits_synchronizer.py │ ├── docker │ │ ├── docker-compose.yaml │ │ ├── generation_configuration.yaml │ │ └── init.sql │ ├── requirements.txt │ ├── sql │ │ ├── clean_previously_inserted_visits.sql │ │ ├── copy_new_visits.sql │ │ ├── insert_new_observations.sql │ │ ├── record_new_observation_state.sql │ │ ├── record_new_synchronization_state.sql │ │ ├── wait_for_new_data.sql │ │ └── wait_for_new_data_to_observe.sql │ └── start.sh │ ├── 01-offline-observer-apache-spark-apache-kafka │ ├── README.md │ ├── assets │ │ ├── errors_minute.png │ │ ├── es_config.png │ │ ├── es_config_ok.png │ │ ├── lag_alert.png │ │ ├── offset_lag.png │ │ └── profile.png │ ├── connection_parameters.py │ ├── data_observation_functions.py │ ├── data_observation_job.py │ ├── dataset │ │ ├── 
docker-compose.yaml │ │ ├── generation_configuration.yaml │ │ └── grafana │ │ │ └── provisioning │ │ │ └── datasources │ │ │ └── datasource.yaml │ ├── passthrough_visits_processor_job.py │ ├── requirements.txt │ └── validation_results.py │ ├── 02-online-observer-airflow-postgresql │ ├── README.md │ ├── assets │ │ ├── alerts_1.png │ │ ├── alerts_2.png │ │ ├── dash_errors_all_1.png │ │ ├── dash_errors_all_zoom.png │ │ ├── dash_errors_distribution.png │ │ ├── dash_lag.png │ │ ├── postgresql_check.png │ │ └── postgresql_config.png │ ├── dags │ │ ├── __init__.py │ │ └── passthrough_visits_synchronizer.py │ ├── docker │ │ ├── docker-compose.yaml │ │ ├── generation_configuration.yaml │ │ └── init.sql │ ├── requirements.txt │ ├── sql │ │ ├── clean_previously_inserted_visits.sql │ │ ├── copy_new_visits.sql │ │ ├── insert_new_observations.sql │ │ ├── record_new_observation_state.sql │ │ ├── record_new_synchronization_state.sql │ │ └── wait_for_new_data.sql │ └── start.sh │ └── 02-online-observer-apache-spark-apache-kafka │ ├── README.md │ ├── assets │ ├── dash_brokers.png │ ├── dash_global_healthcheck.png │ ├── dash_visits_topic.png │ ├── errors_minute.png │ ├── es_config.png │ ├── es_config_ok.png │ ├── lag_alert.png │ ├── offset_lag.png │ └── profile.png │ ├── connection_parameters.py │ ├── data_observation_functions.py │ ├── dataset │ ├── docker-compose.yaml │ ├── etc │ │ ├── jmx_exporter │ │ │ └── config_kafka.yml │ │ └── prometheus │ │ │ └── prometheus.yml │ ├── generation_configuration.yaml │ ├── grafana │ │ ├── dashboards │ │ │ ├── grafana_dashboard_broker_hard_disk_usage.json │ │ │ ├── grafana_dashboard_broker_jvm_os.json │ │ │ ├── grafana_dashboard_broker_performance.json │ │ │ ├── grafana_dashboard_client_consumers_fetch_lag.json │ │ │ ├── grafana_dashboard_cluster_healthcheck.json │ │ │ ├── grafana_dashboard_cluster_replication.json │ │ │ └── grafana_dashboard_topics_logs.json │ │ └── provisioning │ │ │ ├── dashboards │ │ │ └── kafka.yaml │ │ │ └── datasources │ │ 
│ └── datasource.yaml │ └── kafka_healthcheck.sh │ ├── kafka_sink_with_observer.py │ ├── passthrough_visits_processor_job.py │ ├── requirements.txt │ └── validation_results.py └── chapter-10 ├── 01-data-detectors ├── 01-flow-interruption-apache-airflow-postgresql-grafana │ ├── README.md │ ├── assets │ │ ├── alert_config_b_part.png │ │ ├── alert_config_c_part.png │ │ ├── dag_run_success.png │ │ ├── state_1_pending.png │ │ ├── state_2_firing.png │ │ ├── state_3_normal.png │ │ └── state_history.png │ ├── dags │ │ ├── __init__.py │ │ ├── lib │ │ │ ├── __init__.py │ │ │ └── config.py │ │ └── visits_synchronizer.py │ ├── dataset │ │ ├── docker-compose.yaml │ │ └── generation_configuration.yaml │ ├── docker │ │ ├── docker-compose.yaml │ │ ├── etc │ │ │ ├── jmx_exporter │ │ │ │ └── config_kafka.yml │ │ │ └── prometheus │ │ │ │ └── prometheus.yml │ │ ├── grafana │ │ │ └── provisioning │ │ │ │ └── datasources │ │ │ │ └── datasource.yaml │ │ └── init.sql │ ├── requirements.txt │ ├── sql │ │ └── load_file_to_visits_table.sql │ └── start.sh ├── 01-flow-interruption-detector-apache-kafka-grafana │ ├── README.md │ ├── assets │ │ ├── alert_condition.png │ │ ├── alert_setting.png │ │ ├── alerting_state.png │ │ ├── state_firing.png │ │ ├── state_history.png │ │ ├── state_normal_1.png │ │ ├── state_normal_2.png │ │ └── state_pending.png │ ├── data_producer_job.py │ ├── docker │ │ ├── docker-compose.yaml │ │ ├── etc │ │ │ ├── jmx_exporter │ │ │ │ └── config_kafka.yml │ │ │ └── prometheus │ │ │ │ └── prometheus.yml │ │ ├── grafana │ │ │ └── provisioning │ │ │ │ └── datasources │ │ │ │ └── datasource.yaml │ │ └── kafka_healthcheck.sh │ └── requirements.txt ├── 01-flow-interruption-detector-delta-lake-grafana │ ├── README.md │ ├── assets │ │ ├── alert_b_part.png │ │ ├── alert_conditions_part.png │ │ ├── alert_firing.png │ │ ├── alert_history.png │ │ ├── alert_normal.png │ │ └── alert_pending.png │ ├── config.py │ ├── docker │ │ ├── docker-compose.yaml │ │ ├── etc │ │ │ ├── jmx_exporter │ 
│ │ │ └── config_kafka.yml │ │ │ └── prometheus │ │ │ │ └── prometheus.yml │ │ ├── generation_configuration.yaml │ │ ├── grafana │ │ │ └── provisioning │ │ │ │ └── datasources │ │ │ │ └── datasource.yaml │ │ └── kafka_healthcheck.sh │ ├── requirements.txt │ ├── tables_creator.py │ ├── tables_reader.py │ └── visits_writer_job.py ├── 02-skew-detector-apache-airflow-postgresql │ ├── README.md │ ├── assets │ │ ├── failed_executions.png │ │ └── sensor_clear.png │ ├── dags │ │ ├── __init__.py │ │ ├── lib │ │ │ ├── __init__.py │ │ │ └── config.py │ │ └── visits_synchronizer.py │ ├── dataset │ │ ├── docker-compose.yaml │ │ └── generation_configuration.yaml │ ├── docker │ │ ├── docker-compose.yaml │ │ └── init.sql │ ├── requirements.txt │ ├── sql │ │ └── load_file_to_visits_table.sql │ └── start.sh ├── 02-skew-detector-apache-kafka-grafana │ ├── README.md │ ├── assets │ │ ├── alert_firing.png │ │ ├── alert_history.png │ │ ├── alert_normal.png │ │ ├── alert_pending.png │ │ ├── alert_status.png │ │ ├── config_b.png │ │ └── config_conditions.png │ ├── data_producer_job.py │ ├── data_producer_job_balanced.py │ ├── data_producer_job_little_skew.py │ ├── docker │ │ ├── docker-compose.yaml │ │ ├── grafana │ │ │ └── provisioning │ │ │ │ └── datasources │ │ │ │ └── datasource.yaml │ │ └── kafka_healthcheck.sh │ └── requirements.txt └── 02-skew-detector-postgresql-grafana │ ├── README.md │ ├── assets │ ├── alert_config_b_part.png │ ├── alert_config_c_part.png │ ├── state_1_pending.png │ ├── state_2_firing.png │ ├── state_3_normal.png │ └── state_history.png │ └── docker │ ├── docker-compose.yaml │ ├── etc │ ├── jmx_exporter │ │ └── config_kafka.yml │ └── prometheus │ │ └── prometheus.yml │ ├── grafana │ └── provisioning │ │ └── datasources │ │ └── datasource.yaml │ └── init.sql ├── 02-time-detectors ├── 01-lag-detector-apache-spark-apache-kafka-grafana │ ├── README.md │ ├── assets │ │ ├── alert_b_part.png │ │ ├── alert_conditions_part.png │ │ ├── alert_firing.png │ │ ├── 
alert_history.png │ │ ├── alert_normal.png │ │ └── alert_pending.png │ ├── config.py │ ├── docker │ │ ├── docker-compose.yaml │ │ ├── etc │ │ │ ├── jmx_exporter │ │ │ │ └── config_kafka.yml │ │ │ └── prometheus │ │ │ │ └── prometheus.yml │ │ ├── generation_configuration.yaml │ │ ├── grafana │ │ │ └── provisioning │ │ │ │ └── datasources │ │ │ │ └── datasource.yaml │ │ └── kafka_healthcheck.sh │ ├── requirements.txt │ ├── stream_listeners.py │ ├── tables_creator.py │ └── visits_writer_job.py ├── 01-lag-detector-delta-lake-apache-spark-grafana │ ├── README.md │ ├── assets │ │ ├── alert_b_part.png │ │ ├── alert_conditions_part.png │ │ ├── alert_firing.png │ │ ├── alert_history.png │ │ ├── alert_normal.png │ │ ├── alert_pending.png │ │ ├── explore_metrics_reader.png │ │ ├── explore_metrics_writer.png │ │ └── writer_version_increased.png │ ├── create_table.py │ ├── docker │ │ ├── docker-compose.yaml │ │ ├── etc │ │ │ └── prometheus │ │ │ │ └── prometheus.yml │ │ └── grafana │ │ │ └── provisioning │ │ │ └── datasources │ │ │ └── datasource.yaml │ ├── requirements.txt │ ├── visits_consumer.py │ ├── visits_creator_job.py │ └── visits_reader.py ├── 02-sla-misses-apache-airflow │ ├── README.md │ ├── assets │ │ └── sla_table.png │ ├── dags │ │ ├── __init__.py │ │ └── visits_synchronizer.py │ ├── requirements.txt │ └── start.sh ├── 02-sla-misses-apache-flink │ ├── README.md │ ├── __init__.py │ ├── aggregation.py │ ├── assets │ │ ├── alert_a_b_parts.png │ │ ├── alert_conditions_part.png │ │ ├── alert_firing.png │ │ ├── alert_history.png │ │ ├── alert_normal.png │ │ ├── alert_pending.png │ │ └── es_config.png │ ├── assigner.py │ ├── docker │ │ ├── docker-compose.yaml │ │ ├── etc │ │ │ ├── jmx_exporter │ │ │ │ └── config_kafka.yml │ │ │ └── prometheus │ │ │ │ └── prometheus.yml │ │ ├── generation_configuration.yaml │ │ ├── grafana │ │ │ └── provisioning │ │ │ │ └── datasources │ │ │ │ └── datasource.yaml │ │ └── kafka_healthcheck.sh │ ├── flink-connector-base-1.18.0.jar │ ├── 
flink-connector-kafka-3.1.0-1.18.jar │ ├── flink-metrics-prometheus-1.18.0.jar │ ├── flink-sql-connector-kafka-3.1.0-1.18.jar │ ├── kafka-clients-3.7.0.jar │ ├── mappers.py │ ├── models.py │ ├── processor.py │ ├── reduced_visit_synchronizer_sla_monitoring_job.py │ ├── reduced_visits_data_synchronizer_job.py │ ├── requirements.txt │ └── window.py └── 02-sla-misses-apache-spark-structured-streaming │ ├── README.md │ ├── assets │ ├── alert_b_part.png │ ├── alert_conditions_part.png │ ├── alert_firing.png │ ├── alert_history.png │ ├── alert_normal.png │ └── alert_pending.png │ ├── config.py │ ├── docker │ ├── docker-compose.yaml │ ├── etc │ │ ├── jmx_exporter │ │ │ └── config_kafka.yml │ │ └── prometheus │ │ │ └── prometheus.yml │ ├── generation_configuration.yaml │ ├── grafana │ │ └── provisioning │ │ │ └── datasources │ │ │ └── datasource.yaml │ └── kafka_healthcheck.sh │ ├── requirements.txt │ ├── stream_listeners.py │ ├── tables_creator.py │ └── visits_writer_job.py └── 03-data-lineage ├── 01-dataset-tracker-openlineage-apache-airflow-marquez ├── README.md ├── assets │ └── tables_lineage.png ├── dags │ ├── devices_aggregator.py │ ├── devices_aggregator_bi.py │ ├── devices_loader.py │ └── macros.py ├── dataset │ ├── docker-compose.yaml │ └── generation_configuration.yaml ├── docker │ ├── docker-compose.yaml │ └── init.sql ├── requirements.txt ├── sql │ ├── load_file_to_device_table.sql │ ├── refresh_aggregates.sql │ └── refresh_bi_aggregates.sql └── start.sh ├── 01-dataset-tracker-openlineage-apache-kafka-apache-spark-marquez ├── README.md ├── assets │ └── topics_lineage.png ├── config.py ├── docker │ ├── docker-compose.yaml │ └── generation_configuration.yaml ├── requirements.txt ├── spark_session_factory.py ├── visits_decorator_job.py └── visits_reducer_job.py ├── 02-fine-grained-tracker-apache-spark-apache-kafka ├── README.md ├── config.py ├── docker │ ├── docker-compose.yaml │ └── generation_configuration.yaml ├── requirements.txt ├── visits_decorator_job.py └── 
visits_reducer_job.py └── 02-fine-grained-tracker-apache-spark-openlineage-marquez ├── README.md ├── bronze_table_users_writer.py ├── bronze_table_visits_writer.py ├── dataset_b_writer_job.py ├── docker └── docker-compose.yaml ├── gold_table_visits_aggregation_writer.py ├── requirements.txt ├── silver_table_enriched_visits_writer.py └── spark_session_factory.py /CODEOWNERS: -------------------------------------------------------------------------------- 1 | * @bartosz25 2 | -------------------------------------------------------------------------------- /assets/cover.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bartosz25/data-engineering-design-patterns-book/be54bbc3d52b96b592604d1b6b5d087a8ee9d6e1/assets/cover.jpeg -------------------------------------------------------------------------------- /assets/cover_color.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bartosz25/data-engineering-design-patterns-book/be54bbc3d52b96b592604d1b6b5d087a8ee9d6e1/assets/cover_color.jpg -------------------------------------------------------------------------------- /chapter-02/01-full-load/01-full-loader-airflow-postgresql-data-exposition/assets/clear_tasks.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bartosz25/data-engineering-design-patterns-book/be54bbc3d52b96b592604d1b6b5d087a8ee9d6e1/chapter-02/01-full-load/01-full-loader-airflow-postgresql-data-exposition/assets/clear_tasks.png -------------------------------------------------------------------------------- /chapter-02/01-full-load/01-full-loader-airflow-postgresql-data-exposition/assets/restart_clear.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/bartosz25/data-engineering-design-patterns-book/be54bbc3d52b96b592604d1b6b5d087a8ee9d6e1/chapter-02/01-full-load/01-full-loader-airflow-postgresql-data-exposition/assets/restart_clear.png -------------------------------------------------------------------------------- /chapter-02/01-full-load/01-full-loader-airflow-postgresql-data-exposition/dags/macros.py: -------------------------------------------------------------------------------- 1 | def get_table_name(ds_nodash: str) -> str: 2 | return 'devices_'+ds_nodash 3 | 4 | 5 | def get_input_csv_to_load_for_host() -> str: 6 | return '/tmp/dedp/ch02/full-loader/data-exposition/input/dataset.csv' 7 | -------------------------------------------------------------------------------- /chapter-02/01-full-load/01-full-loader-airflow-postgresql-data-exposition/docker/dataset/generation_configuration.yaml: -------------------------------------------------------------------------------- 1 | dataset: 2 | rows: 50 3 | composition_percentage: 4 | duplicates: 0 5 | missing_fields: 0 6 | unprocessable_rows: 0 7 | data_blocker: 8 | type: 'no' 9 | entity: 10 | type: device 11 | generator: 12 | type: one-shot 13 | writer: 14 | type: csv 15 | configuration: 16 | output_path: '/home/data_generator_user/data_generator_output/input' 17 | clean_path: true 18 | -------------------------------------------------------------------------------- /chapter-02/01-full-load/01-full-loader-airflow-postgresql-data-exposition/docker/postgresql/init.sql: -------------------------------------------------------------------------------- 1 | CREATE SCHEMA dedp_test; 2 | -------------------------------------------------------------------------------- /chapter-02/01-full-load/01-full-loader-airflow-postgresql-data-exposition/requirements.txt: -------------------------------------------------------------------------------- 1 | apache-airflow==2.9.0 2 | apache-airflow-providers-postgres==5.12.0 3 | pendulum==2.1.2 
-------------------------------------------------------------------------------- /chapter-02/01-full-load/01-full-loader-airflow-postgresql-data-exposition/sql/expose_new_table.sql: -------------------------------------------------------------------------------- 1 | {% set table_name = get_table_name(ds_nodash) %} 2 | CREATE OR REPLACE VIEW devices AS SELECT * FROM {{ table_name }} -------------------------------------------------------------------------------- /chapter-02/01-full-load/01-full-loader-spark-with-conversion/docker/generation_configuration.yaml: -------------------------------------------------------------------------------- 1 | dataset: 2 | rows: 1000 3 | composition_percentage: 4 | duplicates: 0 5 | missing_fields: 0 6 | unprocessable_rows: 0 7 | data_blocker: 8 | type: 'no' 9 | entity: 10 | type: device 11 | generator: 12 | type: one-shot 13 | writer: 14 | type: json 15 | configuration: 16 | output_path: '/home/data_generator_user/data_generator_output/input' 17 | clean_path: true 18 | -------------------------------------------------------------------------------- /chapter-02/01-full-load/01-full-loader-spark-with-conversion/python/config.py: -------------------------------------------------------------------------------- 1 | class DemoConfiguration: 2 | BASE_DIR: str = '/tmp/dedp/ch02/full-loader/' 3 | INPUT_PATH: str = f'{BASE_DIR}/input' 4 | DEVICES_TABLE: str = f'{BASE_DIR}/devices-table' 5 | -------------------------------------------------------------------------------- /chapter-02/01-full-load/01-full-loader-spark-with-conversion/python/requirements.txt: -------------------------------------------------------------------------------- 1 | delta-spark==3.0.0 2 | pyspark==3.5.0 -------------------------------------------------------------------------------- /chapter-02/02-incremental-load/01-incremental-loader-apache-airflow-apache-spark/airflow/requirements.txt: -------------------------------------------------------------------------------- 
1 | apache-airflow==2.7.3 2 | apache-airflow-providers-cncf-kubernetes==7.3.0 3 | -------------------------------------------------------------------------------- /chapter-02/02-incremental-load/01-incremental-loader-apache-airflow-apache-spark/airflow/start.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | airflow db reset 3 | airflow db init 4 | airflow users create --username "dedp" --role "Admin" --password "dedp" --email "empty" --firstname "admin" --lastname "admin" 5 | export AIRFLOW__CORE__DAGS_FOLDER=./dags 6 | export AIRFLOW__CORE__LOAD_EXAMPLES=false 7 | airflow webserver & airflow scheduler -------------------------------------------------------------------------------- /chapter-02/02-incremental-load/01-incremental-loader-apache-airflow-apache-spark/assets/ch02_enable_dag.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bartosz25/data-engineering-design-patterns-book/be54bbc3d52b96b592604d1b6b5d087a8ee9d6e1/chapter-02/02-incremental-load/01-incremental-loader-apache-airflow-apache-spark/assets/ch02_enable_dag.png -------------------------------------------------------------------------------- /chapter-02/02-incremental-load/01-incremental-loader-apache-airflow-apache-spark/incremental-spark-job/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM apache/spark:3.5.0 2 | 3 | COPY ./visits_loader.py /tmp -------------------------------------------------------------------------------- /chapter-02/02-incremental-load/01-incremental-loader-apache-airflow-apache-spark/incremental-spark-job/requirements.txt: -------------------------------------------------------------------------------- 1 | pyspark==3.5.0 2 | -------------------------------------------------------------------------------- /chapter-02/02-incremental-load/02-change-data-capture-debezium/python/docker/init.sql: 
-------------------------------------------------------------------------------- 1 | CREATE SCHEMA dedp_schema; 2 | 3 | -- enable PostGis 4 | CREATE EXTENSION postgis; 5 | 6 | CREATE TABLE dedp_schema.visits ( 7 | visit_id VARCHAR(40) NOT NULL, 8 | event_time TIMESTAMP NOT NULL, 9 | user_id TEXT NOT NULL, 10 | page VARCHAR(20) NOT NULL, 11 | PRIMARY KEY (visit_id, event_time) 12 | ); 13 | 14 | -- TODO: explain me! 15 | ALTER TABLE dedp_schema.visits REPLICA IDENTITY FULL; 16 | 17 | 18 | -------------------------------------------------------------------------------- /chapter-02/02-incremental-load/02-change-data-capture-debezium/python/requirements.txt: -------------------------------------------------------------------------------- 1 | pyspark==3.5.0 2 | -------------------------------------------------------------------------------- /chapter-02/02-incremental-load/02-change-data-capture-delta-lake-change-data-feed/python/requirements.txt: -------------------------------------------------------------------------------- 1 | delta-spark==3.0.0 2 | pyspark==3.5.0 -------------------------------------------------------------------------------- /chapter-02/03-replication/01-passthrough-replicator-apache-spark-apache-kafka/config.py: -------------------------------------------------------------------------------- 1 | class DemoConfiguration: 2 | BASE_DIR: str = '/tmp/dedp/ch02/replication/passthrough-replicator' 3 | INPUT_PATH: str = f'{BASE_DIR}/input' 4 | OUTPUT_PATH: str = f'{BASE_DIR}/output' 5 | -------------------------------------------------------------------------------- /chapter-02/03-replication/01-passthrough-replicator-apache-spark-apache-kafka/dataset_replicator_raw.py: -------------------------------------------------------------------------------- 1 | from pyspark.sql import SparkSession 2 | 3 | from config import DemoConfiguration 4 | 5 | if __name__ == "__main__": 6 | spark_session = SparkSession.builder.master("local[*]").getOrCreate() 7 | 8 | 
input_dataset = spark_session.read.text(DemoConfiguration.INPUT_PATH) 9 | 10 | input_dataset.write.mode('overwrite').text(DemoConfiguration.OUTPUT_PATH) 11 | -------------------------------------------------------------------------------- /chapter-02/03-replication/01-passthrough-replicator-apache-spark-apache-kafka/requirements.txt: -------------------------------------------------------------------------------- 1 | pyspark==3.5.0 2 | -------------------------------------------------------------------------------- /chapter-02/03-replication/02-transformation-replicator-apache-spark-delta-lake/config.py: -------------------------------------------------------------------------------- 1 | class DemoConfiguration: 2 | BASE_DIR: str = '/tmp/dedp/ch02/replication/transformation-replicator' 3 | INPUT_PATH: str = f'{BASE_DIR}/input' 4 | DEVICES_TABLE_PATH: str = f'{BASE_DIR}/devices-table' 5 | DEVICES_TABLE_NO_NAME_PATH: str = f'{BASE_DIR}/devices-table-no-name' 6 | DEVICES_TABLE_TRUNCATED_NAME_PATH: str = f'{BASE_DIR}/devices-table-truncated-name' 7 | OUTPUT_PATH: str = f'{BASE_DIR}/output' 8 | -------------------------------------------------------------------------------- /chapter-02/03-replication/02-transformation-replicator-apache-spark-delta-lake/dataset/generation_configuration.yaml: -------------------------------------------------------------------------------- 1 | dataset: 2 | rows: 50 3 | composition_percentage: 4 | duplicates: 0 5 | missing_fields: 0 6 | unprocessable_rows: 0 7 | data_blocker: 8 | type: 'no' 9 | entity: 10 | type: device 11 | generator: 12 | type: one-shot 13 | writer: 14 | type: json 15 | configuration: 16 | output_path: '/home/data_generator_user/data_generator_output/input' 17 | clean_path: true 18 | -------------------------------------------------------------------------------- /chapter-02/03-replication/02-transformation-replicator-apache-spark-delta-lake/requirements.txt: 
-------------------------------------------------------------------------------- 1 | pyspark==3.5.0 2 | delta-spark==3.0.0 -------------------------------------------------------------------------------- /chapter-02/04-data-compaction/01-compactor-delta-lake/config.py: -------------------------------------------------------------------------------- 1 | class DemoConfiguration: 2 | BASE_DIR: str = '/tmp/dedp/ch02/data-compactor/' 3 | INPUT_PATH: str = f'{BASE_DIR}/input' 4 | DEVICES_TABLE: str = f'{BASE_DIR}/devices-table' 5 | -------------------------------------------------------------------------------- /chapter-02/04-data-compaction/01-compactor-delta-lake/dataset/generation_configuration.yaml: -------------------------------------------------------------------------------- 1 | dataset: 2 | rows: 50 3 | composition_percentage: 4 | duplicates: 0 5 | missing_fields: 0 6 | unprocessable_rows: 0 7 | data_blocker: 8 | type: 'no' 9 | entity: 10 | type: device 11 | generator: 12 | type: one-shot 13 | writer: 14 | type: json 15 | configuration: 16 | output_path: '/home/data_generator_user/data_generator_output/input' 17 | clean_path: true 18 | -------------------------------------------------------------------------------- /chapter-02/04-data-compaction/01-compactor-delta-lake/requirements.txt: -------------------------------------------------------------------------------- 1 | delta-spark==3.0.0 2 | pyspark==3.5.0 -------------------------------------------------------------------------------- /chapter-02/05-data-readiness/01-readiness-marker-apache-airflow/assets/flow.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bartosz25/data-engineering-design-patterns-book/be54bbc3d52b96b592604d1b6b5d087a8ee9d6e1/chapter-02/05-data-readiness/01-readiness-marker-apache-airflow/assets/flow.png -------------------------------------------------------------------------------- 
/chapter-02/05-data-readiness/01-readiness-marker-apache-airflow/requirements.txt: -------------------------------------------------------------------------------- 1 | apache-airflow==2.7.3 2 | apache-airflow-providers-postgres==5.8.0 3 | pendulum==2.1.2 -------------------------------------------------------------------------------- /chapter-02/05-data-readiness/01-readiness-marker-apache-spark-success-file/python/config.py: -------------------------------------------------------------------------------- 1 | class DemoConfiguration: 2 | BASE_DIR: str = '/tmp/dedp/ch02/data-readiness/marker' 3 | INPUT_PATH: str = f'{BASE_DIR}/input' 4 | OUTPUT_PATH: str = f'{BASE_DIR}/devices-parquet' 5 | -------------------------------------------------------------------------------- /chapter-02/05-data-readiness/01-readiness-marker-apache-spark-success-file/python/dataset/generation_configuration.yaml: -------------------------------------------------------------------------------- 1 | dataset: 2 | rows: 50 3 | composition_percentage: 4 | duplicates: 0 5 | missing_fields: 0 6 | unprocessable_rows: 0 7 | data_blocker: 8 | type: 'no' 9 | entity: 10 | type: device 11 | generator: 12 | type: one-shot 13 | writer: 14 | type: json 15 | configuration: 16 | output_path: '/home/data_generator_user/data_generator_output/input' 17 | clean_path: true 18 | -------------------------------------------------------------------------------- /chapter-02/05-data-readiness/01-readiness-marker-apache-spark-success-file/python/requirements.txt: -------------------------------------------------------------------------------- 1 | pyspark==3.5.0 2 | -------------------------------------------------------------------------------- /chapter-02/06-event-driven/01-external-trigger-lambda-airflow/assets/enable_dag.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/bartosz25/data-engineering-design-patterns-book/be54bbc3d52b96b592604d1b6b5d087a8ee9d6e1/chapter-02/06-event-driven/01-external-trigger-lambda-airflow/assets/enable_dag.png -------------------------------------------------------------------------------- /chapter-02/06-event-driven/01-external-trigger-lambda-airflow/assets/running_dag.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bartosz25/data-engineering-design-patterns-book/be54bbc3d52b96b592604d1b6b5d087a8ee9d6e1/chapter-02/06-event-driven/01-external-trigger-lambda-airflow/assets/running_dag.png -------------------------------------------------------------------------------- /chapter-02/06-event-driven/01-external-trigger-lambda-airflow/dataset/generation_configuration.yaml: -------------------------------------------------------------------------------- 1 | dataset: 2 | rows: 1000 3 | composition_percentage: 4 | duplicates: 0 5 | missing_fields: 0 6 | unprocessable_rows: 0 7 | data_blocker: 8 | type: 'no' 9 | entity: 10 | type: device 11 | generator: 12 | type: one-shot 13 | writer: 14 | type: json 15 | configuration: 16 | output_path: '/home/data_generator_user/data_generator_output/input' 17 | clean_path: true 18 | -------------------------------------------------------------------------------- /chapter-02/06-event-driven/01-external-trigger-lambda-airflow/requirements.txt: -------------------------------------------------------------------------------- 1 | apache-airflow==2.7.3 2 | apache-airflow-providers-postgres==5.8.0 3 | pendulum==2.1.2 4 | localstack==3.0.2 5 | awscli-local==0.21.1 6 | awscli==1.31.8 7 | boto3==1.33.8 8 | requests==2.31.0 9 | # I don't understand why but Flask-Session2 seems to be missing while setting Airflow up 10 | Flask-Session2==1.3.1 -------------------------------------------------------------------------------- 
/chapter-02/06-event-driven/01-external-trigger-lambda-airflow/s3hook.json: -------------------------------------------------------------------------------- 1 | { 2 | "LambdaFunctionConfigurations": [ 3 | { 4 | "Id": "1234567890123", 5 | "LambdaFunctionArn": "arn:aws:lambda:us-east-1:000000000000:function:devices-loader-trigger", 6 | "Events": [ 7 | "s3:ObjectCreated:*" 8 | ] 9 | } 10 | ] 11 | } -------------------------------------------------------------------------------- /chapter-03/01-unprocessable-records/01-dead-letter-null-safe-transformations/config.py: -------------------------------------------------------------------------------- 1 | def get_base_dir() -> str: 2 | return '/tmp/dedp/ch03/dead-letter' 3 | -------------------------------------------------------------------------------- /chapter-03/01-unprocessable-records/01-dead-letter-null-safe-transformations/dataset/generation_configuration.yaml: -------------------------------------------------------------------------------- 1 | dataset: 2 | rows: 1000 3 | composition_percentage: 4 | duplicates: 0 5 | missing_fields: 60 6 | unprocessable_rows: 0 7 | data_blocker: 8 | type: 'no' 9 | entity: 10 | type: device 11 | generator: 12 | type: one-shot 13 | writer: 14 | type: json 15 | configuration: 16 | output_path: '/home/data_generator_user/data_generator_output/input' 17 | clean_path: true 18 | -------------------------------------------------------------------------------- /chapter-03/01-unprocessable-records/01-dead-letter-null-safe-transformations/requirements.txt: -------------------------------------------------------------------------------- 1 | delta-spark==3.0.0 2 | pyspark==3.5.0 -------------------------------------------------------------------------------- /chapter-03/01-unprocessable-records/01-dead-letter-streaming-apache-flink-kafka/__init__.py: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/bartosz25/data-engineering-design-patterns-book/be54bbc3d52b96b592604d1b6b5d087a8ee9d6e1/chapter-03/01-unprocessable-records/01-dead-letter-streaming-apache-flink-kafka/__init__.py -------------------------------------------------------------------------------- /chapter-03/01-unprocessable-records/01-dead-letter-streaming-apache-flink-kafka/flink-connector-base-1.17.0.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bartosz25/data-engineering-design-patterns-book/be54bbc3d52b96b592604d1b6b5d087a8ee9d6e1/chapter-03/01-unprocessable-records/01-dead-letter-streaming-apache-flink-kafka/flink-connector-base-1.17.0.jar -------------------------------------------------------------------------------- /chapter-03/01-unprocessable-records/01-dead-letter-streaming-apache-flink-kafka/flink-connector-kafka-1.17.0.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bartosz25/data-engineering-design-patterns-book/be54bbc3d52b96b592604d1b6b5d087a8ee9d6e1/chapter-03/01-unprocessable-records/01-dead-letter-streaming-apache-flink-kafka/flink-connector-kafka-1.17.0.jar -------------------------------------------------------------------------------- /chapter-03/01-unprocessable-records/01-dead-letter-streaming-apache-flink-kafka/kafka-clients-3.2.3.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bartosz25/data-engineering-design-patterns-book/be54bbc3d52b96b592604d1b6b5d087a8ee9d6e1/chapter-03/01-unprocessable-records/01-dead-letter-streaming-apache-flink-kafka/kafka-clients-3.2.3.jar -------------------------------------------------------------------------------- /chapter-03/01-unprocessable-records/01-dead-letter-streaming-apache-flink-kafka/requirements.txt: -------------------------------------------------------------------------------- 1 | 
apache-flink==1.17.0 2 | -------------------------------------------------------------------------------- /chapter-03/02-duplicated-records/01-windowed-deduplicator-dropduplicates-spark-batch/config.py: -------------------------------------------------------------------------------- 1 | def get_base_dir() -> str: 2 | return '/tmp/dedp/ch03/deduplication/' 3 | -------------------------------------------------------------------------------- /chapter-03/02-duplicated-records/01-windowed-deduplicator-dropduplicates-spark-batch/dataset/generation_configuration.yaml: -------------------------------------------------------------------------------- 1 | dataset: 2 | rows: 1000 3 | composition_percentage: 4 | duplicates: 50 5 | missing_fields: 0 6 | unprocessable_rows: 0 7 | data_blocker: 8 | type: 'no' 9 | entity: 10 | type: device 11 | generator: 12 | type: one-shot 13 | writer: 14 | type: json 15 | configuration: 16 | output_path: '/home/data_generator_user/data_generator_output/input' 17 | clean_path: true 18 | -------------------------------------------------------------------------------- /chapter-03/02-duplicated-records/01-windowed-deduplicator-dropduplicates-spark-batch/requirements.txt: -------------------------------------------------------------------------------- 1 | delta-spark==3.0.0 2 | pyspark==3.5.0 -------------------------------------------------------------------------------- /chapter-03/02-duplicated-records/01-windowed-deduplicator-dropduplicates-spark-streaming/config.py: -------------------------------------------------------------------------------- 1 | def get_base_dir() -> str: 2 | return '/tmp/dedp/ch03/deduplication/visits/' 3 | -------------------------------------------------------------------------------- /chapter-03/02-duplicated-records/01-windowed-deduplicator-dropduplicates-spark-streaming/requirements.txt: -------------------------------------------------------------------------------- 1 | pyspark==3.5.0 2 | 
-------------------------------------------------------------------------------- /chapter-03/02-duplicated-records/01-windowed-deduplicator-window-sql/config.py: -------------------------------------------------------------------------------- 1 | def get_base_dir() -> str: 2 | return '/tmp/dedp/ch03/deduplication/' 3 | -------------------------------------------------------------------------------- /chapter-03/02-duplicated-records/01-windowed-deduplicator-window-sql/dataset/generation_configuration.yaml: -------------------------------------------------------------------------------- 1 | dataset: 2 | rows: 1000 3 | composition_percentage: 4 | duplicates: 50 5 | missing_fields: 0 6 | unprocessable_rows: 0 7 | data_blocker: 8 | type: 'no' 9 | entity: 10 | type: device 11 | generator: 12 | type: one-shot 13 | writer: 14 | type: json 15 | configuration: 16 | output_path: '/home/data_generator_user/data_generator_output/input' 17 | clean_path: true 18 | -------------------------------------------------------------------------------- /chapter-03/02-duplicated-records/01-windowed-deduplicator-window-sql/requirements.txt: -------------------------------------------------------------------------------- 1 | delta-spark==3.0.0 2 | pyspark==3.5.0 -------------------------------------------------------------------------------- /chapter-03/03-late-data/01-late-data-detector-flink-side-output/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bartosz25/data-engineering-design-patterns-book/be54bbc3d52b96b592604d1b6b5d087a8ee9d6e1/chapter-03/03-late-data/01-late-data-detector-flink-side-output/__init__.py -------------------------------------------------------------------------------- /chapter-03/03-late-data/01-late-data-detector-flink-side-output/flink-connector-base-1.17.0.jar: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/bartosz25/data-engineering-design-patterns-book/be54bbc3d52b96b592604d1b6b5d087a8ee9d6e1/chapter-03/03-late-data/01-late-data-detector-flink-side-output/flink-connector-base-1.17.0.jar -------------------------------------------------------------------------------- /chapter-03/03-late-data/01-late-data-detector-flink-side-output/flink-connector-kafka-1.17.0.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bartosz25/data-engineering-design-patterns-book/be54bbc3d52b96b592604d1b6b5d087a8ee9d6e1/chapter-03/03-late-data/01-late-data-detector-flink-side-output/flink-connector-kafka-1.17.0.jar -------------------------------------------------------------------------------- /chapter-03/03-late-data/01-late-data-detector-flink-side-output/kafka-clients-3.2.3.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bartosz25/data-engineering-design-patterns-book/be54bbc3d52b96b592604d1b6b5d087a8ee9d6e1/chapter-03/03-late-data/01-late-data-detector-flink-side-output/kafka-clients-3.2.3.jar -------------------------------------------------------------------------------- /chapter-03/03-late-data/01-late-data-detector-flink-side-output/requirements.txt: -------------------------------------------------------------------------------- 1 | apache-flink==1.17.0 2 | -------------------------------------------------------------------------------- /chapter-03/03-late-data/01-late-data-detector-spark-stateful-window/requirements.txt: -------------------------------------------------------------------------------- 1 | pyspark==3.5.0 2 | pandas==2.0.1 3 | pyarrow==12.0.0 4 | -------------------------------------------------------------------------------- /chapter-03/03-late-data/02-static-late-data-integrator-apache-airflow/airflow/requirements.txt: 
-------------------------------------------------------------------------------- 1 | apache-airflow==2.9.0 -------------------------------------------------------------------------------- /chapter-03/03-late-data/02-static-late-data-integrator-apache-airflow/airflow/start.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | airflow db reset 3 | airflow db init 4 | airflow users create --username "dedp" --role "Admin" --password "dedp" --email "empty" --firstname "admin" --lastname "admin" 5 | export AIRFLOW__CORE__DAGS_FOLDER=./dags 6 | export AIRFLOW__CORE__LOAD_EXAMPLES=false 7 | airflow webserver & airflow scheduler -------------------------------------------------------------------------------- /chapter-03/03-late-data/02-static-late-data-integrator-apache-airflow/assets/clear_dag_run.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bartosz25/data-engineering-design-patterns-book/be54bbc3d52b96b592604d1b6b5d087a8ee9d6e1/chapter-03/03-late-data/02-static-late-data-integrator-apache-airflow/assets/clear_dag_run.png -------------------------------------------------------------------------------- /chapter-03/03-late-data/02-static-late-data-integrator-apache-airflow/assets/expected_run_1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bartosz25/data-engineering-design-patterns-book/be54bbc3d52b96b592604d1b6b5d087a8ee9d6e1/chapter-03/03-late-data/02-static-late-data-integrator-apache-airflow/assets/expected_run_1.png -------------------------------------------------------------------------------- /chapter-03/03-late-data/03-dynamic-late-data-integrator-apache-airflow-delta-lake-concurrent-embedded/airflow/__init__.py: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/bartosz25/data-engineering-design-patterns-book/be54bbc3d52b96b592604d1b6b5d087a8ee9d6e1/chapter-03/03-late-data/03-dynamic-late-data-integrator-apache-airflow-delta-lake-concurrent-embedded/airflow/__init__.py -------------------------------------------------------------------------------- /chapter-03/03-late-data/03-dynamic-late-data-integrator-apache-airflow-delta-lake-concurrent-embedded/airflow/plugins/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bartosz25/data-engineering-design-patterns-book/be54bbc3d52b96b592604d1b6b5d087a8ee9d6e1/chapter-03/03-late-data/03-dynamic-late-data-integrator-apache-airflow-delta-lake-concurrent-embedded/airflow/plugins/__init__.py -------------------------------------------------------------------------------- /chapter-03/03-late-data/03-dynamic-late-data-integrator-apache-airflow-delta-lake-concurrent-embedded/airflow/plugins/operators/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bartosz25/data-engineering-design-patterns-book/be54bbc3d52b96b592604d1b6b5d087a8ee9d6e1/chapter-03/03-late-data/03-dynamic-late-data-integrator-apache-airflow-delta-lake-concurrent-embedded/airflow/plugins/operators/__init__.py -------------------------------------------------------------------------------- /chapter-03/03-late-data/03-dynamic-late-data-integrator-apache-airflow-delta-lake-concurrent-embedded/airflow/requirements.txt: -------------------------------------------------------------------------------- 1 | apache-airflow==2.9.3 2 | #apache-airflow-providers-cncf-kubernetes==10.0.1 3 | apache-airflow-providers-cncf-kubernetes==10.0.1 -------------------------------------------------------------------------------- /chapter-03/03-late-data/03-dynamic-late-data-integrator-apache-airflow-delta-lake-concurrent-embedded/airflow/start.sh: 
-------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | airflow db reset 3 | airflow db init 4 | airflow users create --username "dedp" --role "Admin" --password "dedp" --email "empty" --firstname "admin" --lastname "admin" 5 | export AIRFLOW__CORE__DAGS_FOLDER=./dags 6 | export AIRFLOW__CORE__PLUGINS_FOLDER=./plugins 7 | export AIRFLOW__CORE__LOAD_EXAMPLES=false 8 | airflow webserver & airflow scheduler & airflow triggerer -------------------------------------------------------------------------------- /chapter-03/03-late-data/03-dynamic-late-data-integrator-apache-airflow-delta-lake-concurrent-embedded/assets/after_backfilling.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bartosz25/data-engineering-design-patterns-book/be54bbc3d52b96b592604d1b6b5d087a8ee9d6e1/chapter-03/03-late-data/03-dynamic-late-data-integrator-apache-airflow-delta-lake-concurrent-embedded/assets/after_backfilling.png -------------------------------------------------------------------------------- /chapter-03/03-late-data/03-dynamic-late-data-integrator-apache-airflow-delta-lake-concurrent-embedded/assets/backfilled_task.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bartosz25/data-engineering-design-patterns-book/be54bbc3d52b96b592604d1b6b5d087a8ee9d6e1/chapter-03/03-late-data/03-dynamic-late-data-integrator-apache-airflow-delta-lake-concurrent-embedded/assets/backfilled_task.png -------------------------------------------------------------------------------- /chapter-03/03-late-data/03-dynamic-late-data-integrator-apache-airflow-delta-lake-concurrent-embedded/assets/clear_dag_run.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/bartosz25/data-engineering-design-patterns-book/be54bbc3d52b96b592604d1b6b5d087a8ee9d6e1/chapter-03/03-late-data/03-dynamic-late-data-integrator-apache-airflow-delta-lake-concurrent-embedded/assets/clear_dag_run.png -------------------------------------------------------------------------------- /chapter-03/03-late-data/03-dynamic-late-data-integrator-apache-airflow-delta-lake-concurrent-embedded/assets/expected_run_1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bartosz25/data-engineering-design-patterns-book/be54bbc3d52b96b592604d1b6b5d087a8ee9d6e1/chapter-03/03-late-data/03-dynamic-late-data-integrator-apache-airflow-delta-lake-concurrent-embedded/assets/expected_run_1.png -------------------------------------------------------------------------------- /chapter-03/03-late-data/03-dynamic-late-data-integrator-apache-airflow-delta-lake-concurrent-embedded/assets/raw/1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bartosz25/data-engineering-design-patterns-book/be54bbc3d52b96b592604d1b6b5d087a8ee9d6e1/chapter-03/03-late-data/03-dynamic-late-data-integrator-apache-airflow-delta-lake-concurrent-embedded/assets/raw/1.png -------------------------------------------------------------------------------- /chapter-03/03-late-data/03-dynamic-late-data-integrator-apache-airflow-delta-lake-concurrent-embedded/assets/raw/10.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bartosz25/data-engineering-design-patterns-book/be54bbc3d52b96b592604d1b6b5d087a8ee9d6e1/chapter-03/03-late-data/03-dynamic-late-data-integrator-apache-airflow-delta-lake-concurrent-embedded/assets/raw/10.png -------------------------------------------------------------------------------- 
/chapter-03/03-late-data/03-dynamic-late-data-integrator-apache-airflow-delta-lake-concurrent-embedded/assets/raw/11.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bartosz25/data-engineering-design-patterns-book/be54bbc3d52b96b592604d1b6b5d087a8ee9d6e1/chapter-03/03-late-data/03-dynamic-late-data-integrator-apache-airflow-delta-lake-concurrent-embedded/assets/raw/11.png -------------------------------------------------------------------------------- /chapter-03/03-late-data/03-dynamic-late-data-integrator-apache-airflow-delta-lake-concurrent-embedded/assets/raw/12.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bartosz25/data-engineering-design-patterns-book/be54bbc3d52b96b592604d1b6b5d087a8ee9d6e1/chapter-03/03-late-data/03-dynamic-late-data-integrator-apache-airflow-delta-lake-concurrent-embedded/assets/raw/12.png -------------------------------------------------------------------------------- /chapter-03/03-late-data/03-dynamic-late-data-integrator-apache-airflow-delta-lake-concurrent-embedded/assets/raw/2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bartosz25/data-engineering-design-patterns-book/be54bbc3d52b96b592604d1b6b5d087a8ee9d6e1/chapter-03/03-late-data/03-dynamic-late-data-integrator-apache-airflow-delta-lake-concurrent-embedded/assets/raw/2.png -------------------------------------------------------------------------------- /chapter-03/03-late-data/03-dynamic-late-data-integrator-apache-airflow-delta-lake-concurrent-embedded/assets/raw/3.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/bartosz25/data-engineering-design-patterns-book/be54bbc3d52b96b592604d1b6b5d087a8ee9d6e1/chapter-03/03-late-data/03-dynamic-late-data-integrator-apache-airflow-delta-lake-concurrent-embedded/assets/raw/3.png -------------------------------------------------------------------------------- /chapter-03/03-late-data/03-dynamic-late-data-integrator-apache-airflow-delta-lake-concurrent-embedded/assets/raw/4.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bartosz25/data-engineering-design-patterns-book/be54bbc3d52b96b592604d1b6b5d087a8ee9d6e1/chapter-03/03-late-data/03-dynamic-late-data-integrator-apache-airflow-delta-lake-concurrent-embedded/assets/raw/4.png -------------------------------------------------------------------------------- /chapter-03/03-late-data/03-dynamic-late-data-integrator-apache-airflow-delta-lake-concurrent-embedded/assets/raw/5.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bartosz25/data-engineering-design-patterns-book/be54bbc3d52b96b592604d1b6b5d087a8ee9d6e1/chapter-03/03-late-data/03-dynamic-late-data-integrator-apache-airflow-delta-lake-concurrent-embedded/assets/raw/5.png -------------------------------------------------------------------------------- /chapter-03/03-late-data/03-dynamic-late-data-integrator-apache-airflow-delta-lake-concurrent-embedded/assets/raw/6.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bartosz25/data-engineering-design-patterns-book/be54bbc3d52b96b592604d1b6b5d087a8ee9d6e1/chapter-03/03-late-data/03-dynamic-late-data-integrator-apache-airflow-delta-lake-concurrent-embedded/assets/raw/6.png -------------------------------------------------------------------------------- 
/chapter-03/03-late-data/03-dynamic-late-data-integrator-apache-airflow-delta-lake-concurrent-embedded/assets/raw/7.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bartosz25/data-engineering-design-patterns-book/be54bbc3d52b96b592604d1b6b5d087a8ee9d6e1/chapter-03/03-late-data/03-dynamic-late-data-integrator-apache-airflow-delta-lake-concurrent-embedded/assets/raw/7.png -------------------------------------------------------------------------------- /chapter-03/03-late-data/03-dynamic-late-data-integrator-apache-airflow-delta-lake-concurrent-embedded/assets/raw/8.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bartosz25/data-engineering-design-patterns-book/be54bbc3d52b96b592604d1b6b5d087a8ee9d6e1/chapter-03/03-late-data/03-dynamic-late-data-integrator-apache-airflow-delta-lake-concurrent-embedded/assets/raw/8.png -------------------------------------------------------------------------------- /chapter-03/03-late-data/03-dynamic-late-data-integrator-apache-airflow-delta-lake-concurrent-embedded/assets/raw/9.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bartosz25/data-engineering-design-patterns-book/be54bbc3d52b96b592604d1b6b5d087a8ee9d6e1/chapter-03/03-late-data/03-dynamic-late-data-integrator-apache-airflow-delta-lake-concurrent-embedded/assets/raw/9.png -------------------------------------------------------------------------------- /chapter-03/03-late-data/03-dynamic-late-data-integrator-apache-airflow-delta-lake-concurrent-embedded/assets/run_in_progress.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/bartosz25/data-engineering-design-patterns-book/be54bbc3d52b96b592604d1b6b5d087a8ee9d6e1/chapter-03/03-late-data/03-dynamic-late-data-integrator-apache-airflow-delta-lake-concurrent-embedded/assets/run_in_progress.png -------------------------------------------------------------------------------- /chapter-03/03-late-data/03-dynamic-late-data-integrator-apache-airflow-delta-lake-concurrent-embedded/assets/simulation.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bartosz25/data-engineering-design-patterns-book/be54bbc3d52b96b592604d1b6b5d087a8ee9d6e1/chapter-03/03-late-data/03-dynamic-late-data-integrator-apache-airflow-delta-lake-concurrent-embedded/assets/simulation.gif -------------------------------------------------------------------------------- /chapter-03/03-late-data/03-dynamic-late-data-integrator-apache-airflow-delta-lake-concurrent-embedded/data-generator/requirements.txt: -------------------------------------------------------------------------------- 1 | delta-spark==3.0.0 2 | pyspark==3.5.0 -------------------------------------------------------------------------------- /chapter-03/03-late-data/03-dynamic-late-data-integrator-apache-airflow-delta-lake-concurrent-embedded/late-data-integrator/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM apache/spark:3.5.0 2 | 3 | COPY target/scala-2.12/late_data_integrator-assembly-1.0.0-SNAPSHOT.jar /tmp 4 | -------------------------------------------------------------------------------- /chapter-03/03-late-data/03-dynamic-late-data-integrator-apache-airflow-delta-lake-concurrent-embedded/late-data-integrator/project/build.properties: -------------------------------------------------------------------------------- 1 | sbt.version=1.9.7 2 | -------------------------------------------------------------------------------- 
/chapter-03/03-late-data/03-dynamic-late-data-integrator-apache-airflow-delta-lake-concurrent-embedded/late-data-integrator/project/plugins.sbt: -------------------------------------------------------------------------------- 1 | addSbtPlugin("com.eed3si9n" % "sbt-assembly" % "1.2.0") -------------------------------------------------------------------------------- /chapter-03/03-late-data/03-dynamic-late-data-integrator-apache-airflow-delta-lake-concurrent-embedded/late-data-integrator/src/main/scala/com/waitingforcode/Json.scala: -------------------------------------------------------------------------------- 1 | package com.waitingforcode 2 | 3 | import com.fasterxml.jackson.databind.ObjectMapper 4 | import com.fasterxml.jackson.module.scala.DefaultScalaModule 5 | 6 | object Json { 7 | 8 | val Mapper = new ObjectMapper() 9 | Mapper.registerModule(DefaultScalaModule) 10 | } 11 | -------------------------------------------------------------------------------- /chapter-03/03-late-data/03-dynamic-late-data-integrator-apache-airflow-delta-lake-concurrent-trigger/airflow/requirements.txt: -------------------------------------------------------------------------------- 1 | apache-airflow==2.9.0 2 | apache-airflow-providers-cncf-kubernetes==10.0.1 3 | -------------------------------------------------------------------------------- /chapter-03/03-late-data/03-dynamic-late-data-integrator-apache-airflow-delta-lake-concurrent-trigger/airflow/start.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | airflow db reset 3 | airflow db init 4 | airflow users create --username "dedp" --role "Admin" --password "dedp" --email "empty" --firstname "admin" --lastname "admin" 5 | export AIRFLOW__CORE__DAGS_FOLDER=./dags 6 | export AIRFLOW__CORE__LOAD_EXAMPLES=false 7 | export AIRFLOW__WEBSERVER__WORKERS=2 8 | airflow webserver & airflow scheduler -------------------------------------------------------------------------------- 
/chapter-03/03-late-data/03-dynamic-late-data-integrator-apache-airflow-delta-lake-concurrent-trigger/assets/backfilling_1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bartosz25/data-engineering-design-patterns-book/be54bbc3d52b96b592604d1b6b5d087a8ee9d6e1/chapter-03/03-late-data/03-dynamic-late-data-integrator-apache-airflow-delta-lake-concurrent-trigger/assets/backfilling_1.png -------------------------------------------------------------------------------- /chapter-03/03-late-data/03-dynamic-late-data-integrator-apache-airflow-delta-lake-concurrent-trigger/assets/backfilling_late_data.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bartosz25/data-engineering-design-patterns-book/be54bbc3d52b96b592604d1b6b5d087a8ee9d6e1/chapter-03/03-late-data/03-dynamic-late-data-integrator-apache-airflow-delta-lake-concurrent-trigger/assets/backfilling_late_data.png -------------------------------------------------------------------------------- /chapter-03/03-late-data/03-dynamic-late-data-integrator-apache-airflow-delta-lake-concurrent-trigger/assets/backfilling_late_data_mapped_tasks.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bartosz25/data-engineering-design-patterns-book/be54bbc3d52b96b592604d1b6b5d087a8ee9d6e1/chapter-03/03-late-data/03-dynamic-late-data-integrator-apache-airflow-delta-lake-concurrent-trigger/assets/backfilling_late_data_mapped_tasks.png -------------------------------------------------------------------------------- /chapter-03/03-late-data/03-dynamic-late-data-integrator-apache-airflow-delta-lake-concurrent-trigger/assets/clear_runs.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/bartosz25/data-engineering-design-patterns-book/be54bbc3d52b96b592604d1b6b5d087a8ee9d6e1/chapter-03/03-late-data/03-dynamic-late-data-integrator-apache-airflow-delta-lake-concurrent-trigger/assets/clear_runs.png -------------------------------------------------------------------------------- /chapter-03/03-late-data/03-dynamic-late-data-integrator-apache-airflow-delta-lake-concurrent-trigger/assets/dag_after_backfilling_1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bartosz25/data-engineering-design-patterns-book/be54bbc3d52b96b592604d1b6b5d087a8ee9d6e1/chapter-03/03-late-data/03-dynamic-late-data-integrator-apache-airflow-delta-lake-concurrent-trigger/assets/dag_after_backfilling_1.png -------------------------------------------------------------------------------- /chapter-03/03-late-data/03-dynamic-late-data-integrator-apache-airflow-delta-lake-concurrent-trigger/assets/expected_run_progress_1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bartosz25/data-engineering-design-patterns-book/be54bbc3d52b96b592604d1b6b5d087a8ee9d6e1/chapter-03/03-late-data/03-dynamic-late-data-integrator-apache-airflow-delta-lake-concurrent-trigger/assets/expected_run_progress_1.png -------------------------------------------------------------------------------- /chapter-03/03-late-data/03-dynamic-late-data-integrator-apache-airflow-delta-lake-concurrent-trigger/assets/expected_run_progress_2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bartosz25/data-engineering-design-patterns-book/be54bbc3d52b96b592604d1b6b5d087a8ee9d6e1/chapter-03/03-late-data/03-dynamic-late-data-integrator-apache-airflow-delta-lake-concurrent-trigger/assets/expected_run_progress_2.png 
-------------------------------------------------------------------------------- /chapter-03/03-late-data/03-dynamic-late-data-integrator-apache-airflow-delta-lake-concurrent-trigger/assets/expected_run_progress_3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bartosz25/data-engineering-design-patterns-book/be54bbc3d52b96b592604d1b6b5d087a8ee9d6e1/chapter-03/03-late-data/03-dynamic-late-data-integrator-apache-airflow-delta-lake-concurrent-trigger/assets/expected_run_progress_3.png -------------------------------------------------------------------------------- /chapter-03/03-late-data/03-dynamic-late-data-integrator-apache-airflow-delta-lake-concurrent-trigger/assets/expected_run_result.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bartosz25/data-engineering-design-patterns-book/be54bbc3d52b96b592604d1b6b5d087a8ee9d6e1/chapter-03/03-late-data/03-dynamic-late-data-integrator-apache-airflow-delta-lake-concurrent-trigger/assets/expected_run_result.png -------------------------------------------------------------------------------- /chapter-03/03-late-data/03-dynamic-late-data-integrator-apache-airflow-delta-lake-concurrent-trigger/data-generator/requirements.txt: -------------------------------------------------------------------------------- 1 | delta-spark==3.0.0 2 | pyspark==3.5.0 -------------------------------------------------------------------------------- /chapter-03/03-late-data/03-dynamic-late-data-integrator-apache-airflow-delta-lake-concurrent-trigger/late-data-integrator/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM apache/spark:3.5.0 2 | 3 | COPY target/scala-2.12/late_data_integrator-assembly-1.0.0-SNAPSHOT.jar /tmp 4 | -------------------------------------------------------------------------------- 
/chapter-03/03-late-data/03-dynamic-late-data-integrator-apache-airflow-delta-lake-concurrent-trigger/late-data-integrator/src/main/scala/com/waitingforcode/Json.scala: -------------------------------------------------------------------------------- 1 | package com.waitingforcode 2 | 3 | import com.fasterxml.jackson.databind.ObjectMapper 4 | import com.fasterxml.jackson.module.scala.DefaultScalaModule 5 | 6 | object Json { 7 | 8 | val Mapper = new ObjectMapper() 9 | Mapper.registerModule(DefaultScalaModule) 10 | } 11 | -------------------------------------------------------------------------------- /chapter-03/03-late-data/03-dynamic-late-data-integrator-apache-airflow-delta-lake-sequential/airflow/requirements.txt: -------------------------------------------------------------------------------- 1 | apache-airflow==2.9.3 2 | apache-airflow-providers-cncf-kubernetes==10.0.1 -------------------------------------------------------------------------------- /chapter-03/03-late-data/03-dynamic-late-data-integrator-apache-airflow-delta-lake-sequential/airflow/start.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | airflow db reset 3 | airflow db init 4 | airflow users create --username "dedp" --role "Admin" --password "dedp" --email "empty" --firstname "admin" --lastname "admin" 5 | export AIRFLOW__CORE__DAGS_FOLDER=./dags 6 | export AIRFLOW__CORE__LOAD_EXAMPLES=false 7 | airflow webserver & airflow scheduler -------------------------------------------------------------------------------- /chapter-03/03-late-data/03-dynamic-late-data-integrator-apache-airflow-delta-lake-sequential/assets/clear_dag_run.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/bartosz25/data-engineering-design-patterns-book/be54bbc3d52b96b592604d1b6b5d087a8ee9d6e1/chapter-03/03-late-data/03-dynamic-late-data-integrator-apache-airflow-delta-lake-sequential/assets/clear_dag_run.png -------------------------------------------------------------------------------- /chapter-03/03-late-data/03-dynamic-late-data-integrator-apache-airflow-delta-lake-sequential/assets/dag_status_after_late_data_integration.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bartosz25/data-engineering-design-patterns-book/be54bbc3d52b96b592604d1b6b5d087a8ee9d6e1/chapter-03/03-late-data/03-dynamic-late-data-integrator-apache-airflow-delta-lake-sequential/assets/dag_status_after_late_data_integration.png -------------------------------------------------------------------------------- /chapter-03/03-late-data/03-dynamic-late-data-integrator-apache-airflow-delta-lake-sequential/assets/expected_run_1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bartosz25/data-engineering-design-patterns-book/be54bbc3d52b96b592604d1b6b5d087a8ee9d6e1/chapter-03/03-late-data/03-dynamic-late-data-integrator-apache-airflow-delta-lake-sequential/assets/expected_run_1.png -------------------------------------------------------------------------------- /chapter-03/03-late-data/03-dynamic-late-data-integrator-apache-airflow-delta-lake-sequential/data-generator/dataset/?/.sbt/1.0/java9-rt-ext-oracle_corporation_11_0_14_1/rt.jar: -------------------------------------------------------------------------------- 1 | PK -------------------------------------------------------------------------------- /chapter-03/03-late-data/03-dynamic-late-data-integrator-apache-airflow-delta-lake-sequential/data-generator/dataset/?/.sbt/boot/sbt.boot.lock: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/bartosz25/data-engineering-design-patterns-book/be54bbc3d52b96b592604d1b6b5d087a8ee9d6e1/chapter-03/03-late-data/03-dynamic-late-data-integrator-apache-airflow-delta-lake-sequential/data-generator/dataset/?/.sbt/boot/sbt.boot.lock -------------------------------------------------------------------------------- /chapter-03/03-late-data/03-dynamic-late-data-integrator-apache-airflow-delta-lake-sequential/data-generator/dataset/dedp_ch03_late_data_integrator_sequential.tar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bartosz25/data-engineering-design-patterns-book/be54bbc3d52b96b592604d1b6b5d087a8ee9d6e1/chapter-03/03-late-data/03-dynamic-late-data-integrator-apache-airflow-delta-lake-sequential/data-generator/dataset/dedp_ch03_late_data_integrator_sequential.tar -------------------------------------------------------------------------------- /chapter-03/03-late-data/03-dynamic-late-data-integrator-apache-airflow-delta-lake-sequential/data-generator/requirements.txt: -------------------------------------------------------------------------------- 1 | delta-spark==3.0.0 2 | pyspark==3.5.0 -------------------------------------------------------------------------------- /chapter-03/03-late-data/03-dynamic-late-data-integrator-apache-airflow-delta-lake-sequential/late-data-integrator/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM apache/spark:3.5.0 2 | 3 | COPY target/scala-2.12/late_data_integrator-assembly-1.0.0-SNAPSHOT.jar /tmp -------------------------------------------------------------------------------- /chapter-03/04-filtering/01-filter-interceptor-scala-accumulators/build.sbt: -------------------------------------------------------------------------------- 1 | name := "filter_interceptor" 2 | organization := "com.waitingforcode" 3 | version := "1.0.0-SNAPSHOT" 4 | 5 | 6 | val sparkVersion = "3.5.0" 7 | 8 
| libraryDependencies += "org.apache.spark" %% "spark-sql" % sparkVersion 9 | libraryDependencies += "org.apache.spark" %% "spark-hive" % sparkVersion 10 | -------------------------------------------------------------------------------- /chapter-03/04-filtering/01-filter-interceptor-spark-accumulators/config.py: -------------------------------------------------------------------------------- 1 | def get_base_dir() -> str: 2 | return '/tmp/dedp/ch03/filtering/' 3 | -------------------------------------------------------------------------------- /chapter-03/04-filtering/01-filter-interceptor-spark-accumulators/requirements.txt: -------------------------------------------------------------------------------- 1 | delta-spark==3.0.0 2 | pyspark==3.5.0 3 | pandas==2.0.2 4 | pyarrow==14.0.1 -------------------------------------------------------------------------------- /chapter-03/04-filtering/01-filter-interceptor-spark-sql/config.py: -------------------------------------------------------------------------------- 1 | def get_base_dir() -> str: 2 | return '/tmp/dedp/ch03/filtering/' 3 | -------------------------------------------------------------------------------- /chapter-03/04-filtering/01-filter-interceptor-spark-sql/requirements.txt: -------------------------------------------------------------------------------- 1 | delta-spark==3.0.0 2 | pyspark==3.5.0 3 | pandas==2.0.2 4 | pyarrow==14.0.1 -------------------------------------------------------------------------------- /chapter-03/05-fault-tolerance/01-checkpointer-apache-flink-apache-kafka/.gitignore: -------------------------------------------------------------------------------- 1 | docker/checkpoints/* -------------------------------------------------------------------------------- /chapter-03/05-fault-tolerance/01-checkpointer-apache-flink-apache-kafka/__init__.py: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/bartosz25/data-engineering-design-patterns-book/be54bbc3d52b96b592604d1b6b5d087a8ee9d6e1/chapter-03/05-fault-tolerance/01-checkpointer-apache-flink-apache-kafka/__init__.py -------------------------------------------------------------------------------- /chapter-03/05-fault-tolerance/01-checkpointer-apache-flink-apache-kafka/assets/flink_cancel_job.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bartosz25/data-engineering-design-patterns-book/be54bbc3d52b96b592604d1b6b5d087a8ee9d6e1/chapter-03/05-fault-tolerance/01-checkpointer-apache-flink-apache-kafka/assets/flink_cancel_job.png -------------------------------------------------------------------------------- /chapter-03/05-fault-tolerance/01-checkpointer-apache-flink-apache-kafka/assets/flink_checkpoint.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bartosz25/data-engineering-design-patterns-book/be54bbc3d52b96b592604d1b6b5d087a8ee9d6e1/chapter-03/05-fault-tolerance/01-checkpointer-apache-flink-apache-kafka/assets/flink_checkpoint.png -------------------------------------------------------------------------------- /chapter-03/05-fault-tolerance/01-checkpointer-apache-flink-apache-kafka/flink-connector-base-1.17.0.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bartosz25/data-engineering-design-patterns-book/be54bbc3d52b96b592604d1b6b5d087a8ee9d6e1/chapter-03/05-fault-tolerance/01-checkpointer-apache-flink-apache-kafka/flink-connector-base-1.17.0.jar -------------------------------------------------------------------------------- /chapter-03/05-fault-tolerance/01-checkpointer-apache-flink-apache-kafka/flink-connector-kafka-1.17.0.jar: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/bartosz25/data-engineering-design-patterns-book/be54bbc3d52b96b592604d1b6b5d087a8ee9d6e1/chapter-03/05-fault-tolerance/01-checkpointer-apache-flink-apache-kafka/flink-connector-kafka-1.17.0.jar -------------------------------------------------------------------------------- /chapter-03/05-fault-tolerance/01-checkpointer-apache-flink-apache-kafka/kafka-clients-3.2.3.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bartosz25/data-engineering-design-patterns-book/be54bbc3d52b96b592604d1b6b5d087a8ee9d6e1/chapter-03/05-fault-tolerance/01-checkpointer-apache-flink-apache-kafka/kafka-clients-3.2.3.jar -------------------------------------------------------------------------------- /chapter-03/05-fault-tolerance/01-checkpointer-apache-flink-apache-kafka/visit.py: -------------------------------------------------------------------------------- 1 | import dataclasses 2 | 3 | 4 | @dataclasses.dataclass 5 | class Visit: 6 | visit_id: str 7 | event_time: int 8 | page: str 9 | -------------------------------------------------------------------------------- /chapter-03/05-fault-tolerance/01-checkpointer-apache-spark-apache-kafka/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bartosz25/data-engineering-design-patterns-book/be54bbc3d52b96b592604d1b6b5d087a8ee9d6e1/chapter-03/05-fault-tolerance/01-checkpointer-apache-spark-apache-kafka/__init__.py -------------------------------------------------------------------------------- /chapter-03/05-fault-tolerance/01-checkpointer-apache-spark-apache-kafka/requirements.txt: -------------------------------------------------------------------------------- 1 | pyspark==3.5.0 2 | -------------------------------------------------------------------------------- /chapter-04/01-overwriting/01-fast-metadata-cleaner-airflow-postgresql/assets/clear_tasks.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/bartosz25/data-engineering-design-patterns-book/be54bbc3d52b96b592604d1b6b5d087a8ee9d6e1/chapter-04/01-overwriting/01-fast-metadata-cleaner-airflow-postgresql/assets/clear_tasks.png -------------------------------------------------------------------------------- /chapter-04/01-overwriting/01-fast-metadata-cleaner-airflow-postgresql/dags/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bartosz25/data-engineering-design-patterns-book/be54bbc3d52b96b592604d1b6b5d087a8ee9d6e1/chapter-04/01-overwriting/01-fast-metadata-cleaner-airflow-postgresql/dags/__init__.py -------------------------------------------------------------------------------- /chapter-04/01-overwriting/01-fast-metadata-cleaner-airflow-postgresql/dags/macros.py: -------------------------------------------------------------------------------- 1 | import pendulum 2 | 3 | from config import get_data_location_base_dir 4 | 5 | 6 | def get_weekly_table_name(execution_date: pendulum.date) -> str: 7 | ds_nodash = f'week_{execution_date.week_of_year}_{execution_date.year}' 8 | return f'dedp.visits_{ds_nodash}' 9 | 10 | 11 | def get_input_csv_to_load(ds: str) -> str: 12 | return f'{get_data_location_base_dir(True)}/date={ds}/dataset.csv' 13 | 14 | -------------------------------------------------------------------------------- /chapter-04/01-overwriting/01-fast-metadata-cleaner-airflow-postgresql/docker/init.sql: -------------------------------------------------------------------------------- 1 | CREATE SCHEMA dedp; 2 | -------------------------------------------------------------------------------- /chapter-04/01-overwriting/01-fast-metadata-cleaner-airflow-postgresql/plugins/__init__.py: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/bartosz25/data-engineering-design-patterns-book/be54bbc3d52b96b592604d1b6b5d087a8ee9d6e1/chapter-04/01-overwriting/01-fast-metadata-cleaner-airflow-postgresql/plugins/__init__.py -------------------------------------------------------------------------------- /chapter-04/01-overwriting/01-fast-metadata-cleaner-airflow-postgresql/plugins/operators/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bartosz25/data-engineering-design-patterns-book/be54bbc3d52b96b592604d1b6b5d087a8ee9d6e1/chapter-04/01-overwriting/01-fast-metadata-cleaner-airflow-postgresql/plugins/operators/__init__.py -------------------------------------------------------------------------------- /chapter-04/01-overwriting/01-fast-metadata-cleaner-airflow-postgresql/requirements.txt: -------------------------------------------------------------------------------- 1 | apache-airflow==2.7.3 2 | apache-airflow-providers-postgres==5.8.0 -------------------------------------------------------------------------------- /chapter-04/01-overwriting/01-fast-metadata-cleaner-airflow-postgresql/sql/load_visits_to_weekly_table.sql: -------------------------------------------------------------------------------- 1 | {% set weekly_table = get_weekly_table_name(execution_date) %} 2 | 3 | COPY {{ weekly_table }} FROM '{{ get_input_csv_to_load(ds) }}' CSV DELIMITER ';' HEADER; -------------------------------------------------------------------------------- /chapter-04/01-overwriting/01-fast-metadata-cleaner-airflow-postgresql/sql/recreate_view.sql: -------------------------------------------------------------------------------- 1 | SELECT table_name FROM information_schema.tables WHERE table_catalog = 'dedp' 2 | AND table_schema = 'dedp' 3 | AND table_type = 'BASE TABLE' 4 | AND table_name LIKE 'visits_%' -------------------------------------------------------------------------------- 
/chapter-04/01-overwriting/02-data-overwrite-airflow-delta-lake/assets/clear_tasks.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bartosz25/data-engineering-design-patterns-book/be54bbc3d52b96b592604d1b6b5d087a8ee9d6e1/chapter-04/01-overwriting/02-data-overwrite-airflow-delta-lake/assets/clear_tasks.png -------------------------------------------------------------------------------- /chapter-04/01-overwriting/02-data-overwrite-airflow-delta-lake/assets/dag_tasks.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bartosz25/data-engineering-design-patterns-book/be54bbc3d52b96b592604d1b6b5d087a8ee9d6e1/chapter-04/01-overwriting/02-data-overwrite-airflow-delta-lake/assets/dag_tasks.png -------------------------------------------------------------------------------- /chapter-04/01-overwriting/02-data-overwrite-airflow-delta-lake/dags/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bartosz25/data-engineering-design-patterns-book/be54bbc3d52b96b592604d1b6b5d087a8ee9d6e1/chapter-04/01-overwriting/02-data-overwrite-airflow-delta-lake/dags/__init__.py -------------------------------------------------------------------------------- /chapter-04/01-overwriting/02-data-overwrite-airflow-delta-lake/dags_functions/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bartosz25/data-engineering-design-patterns-book/be54bbc3d52b96b592604d1b6b5d087a8ee9d6e1/chapter-04/01-overwriting/02-data-overwrite-airflow-delta-lake/dags_functions/__init__.py -------------------------------------------------------------------------------- /chapter-04/01-overwriting/02-data-overwrite-airflow-delta-lake/requirements.txt: -------------------------------------------------------------------------------- 1 | 
apache-airflow==2.9.0 2 | apache-airflow-providers-postgres==5.12.0 3 | virtualenv==20.26.6 -------------------------------------------------------------------------------- /chapter-04/01-overwriting/02-data-overwrite-airflow-delta-lake/start.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | airflow db reset 3 | airflow db migrate 4 | airflow users create --username "dedp" --role "Admin" --password "dedp" --email "empty" --firstname "admin" --lastname "admin" 5 | export AIRFLOW__CORE__DAGS_FOLDER=./dags 6 | export AIRFLOW__CORE__LOAD_EXAMPLES=false 7 | export AIRFLOW__WEBSERVER__WORKERS=2 8 | airflow webserver & airflow scheduler -------------------------------------------------------------------------------- /chapter-04/01-overwriting/02-data-overwrite-airflow-spark/airflow/requirements.txt: -------------------------------------------------------------------------------- 1 | apache-airflow==2.7.3 2 | apache-airflow-providers-cncf-kubernetes==7.3.0 3 | -------------------------------------------------------------------------------- /chapter-04/01-overwriting/02-data-overwrite-airflow-spark/airflow/start.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | airflow db reset 3 | airflow db init 4 | airflow users create --username "dedp" --role "Admin" --password "dedp" --email "empty" --firstname "admin" --lastname "admin" 5 | export AIRFLOW__CORE__DAGS_FOLDER=./dags 6 | export AIRFLOW__CORE__LOAD_EXAMPLES=false 7 | airflow webserver & airflow scheduler -------------------------------------------------------------------------------- /chapter-04/01-overwriting/02-data-overwrite-airflow-spark/assets/clear_tasks.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/bartosz25/data-engineering-design-patterns-book/be54bbc3d52b96b592604d1b6b5d087a8ee9d6e1/chapter-04/01-overwriting/02-data-overwrite-airflow-spark/assets/clear_tasks.png -------------------------------------------------------------------------------- /chapter-04/01-overwriting/02-data-overwrite-airflow-spark/visits-loader-job/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM apache/spark:3.5.0 2 | 3 | COPY ./visits_loader.py /tmp -------------------------------------------------------------------------------- /chapter-04/01-overwriting/02-data-overwrite-airflow-spark/visits-loader-job/requirements.txt: -------------------------------------------------------------------------------- 1 | pyspark==3.5.0 2 | -------------------------------------------------------------------------------- /chapter-04/02-updates/01-merger-airflow-postgresql/assets/clear_task_metadata.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bartosz25/data-engineering-design-patterns-book/be54bbc3d52b96b592604d1b6b5d087a8ee9d6e1/chapter-04/02-updates/01-merger-airflow-postgresql/assets/clear_task_metadata.png -------------------------------------------------------------------------------- /chapter-04/02-updates/01-merger-airflow-postgresql/assets/clear_tasks.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bartosz25/data-engineering-design-patterns-book/be54bbc3d52b96b592604d1b6b5d087a8ee9d6e1/chapter-04/02-updates/01-merger-airflow-postgresql/assets/clear_tasks.png -------------------------------------------------------------------------------- /chapter-04/02-updates/01-merger-airflow-postgresql/dags/__init__.py: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/bartosz25/data-engineering-design-patterns-book/be54bbc3d52b96b592604d1b6b5d087a8ee9d6e1/chapter-04/02-updates/01-merger-airflow-postgresql/dags/__init__.py -------------------------------------------------------------------------------- /chapter-04/02-updates/01-merger-airflow-postgresql/docker/docker-compose.yaml: -------------------------------------------------------------------------------- 1 | version: '3.1' 2 | services: 3 | postgresql: 4 | image: postgres:15 5 | container_name: dedp_postgresql 6 | environment: 7 | - POSTGRES_USER=dedp_test 8 | - POSTGRES_PASSWORD=dedp_test 9 | - POSTGRES_DB=dedp 10 | ports: 11 | - "5432:5432" 12 | volumes: 13 | - ./init.sql:/docker-entrypoint-initdb.d/init.sql 14 | - /tmp/dedp/ch04/merger/input:/data_to_load -------------------------------------------------------------------------------- /chapter-04/02-updates/01-merger-airflow-postgresql/docker/init.sql: -------------------------------------------------------------------------------- 1 | CREATE SCHEMA dedp; 2 | 3 | CREATE TABLE dedp.devices ( 4 | type VARCHAR(10) NOT NULL, 5 | full_name TEXT NOT NULL, 6 | version VARCHAR(25) NOT NULL, 7 | PRIMARY KEY(type, version) 8 | ); -------------------------------------------------------------------------------- /chapter-04/02-updates/01-merger-airflow-postgresql/requirements.txt: -------------------------------------------------------------------------------- 1 | apache-airflow==2.7.3 2 | apache-airflow-providers-postgres==5.8.0 -------------------------------------------------------------------------------- /chapter-04/02-updates/02-stateful-merger-apache-airflow-delta-lake/assets/clear_tasks.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bartosz25/data-engineering-design-patterns-book/be54bbc3d52b96b592604d1b6b5d087a8ee9d6e1/chapter-04/02-updates/02-stateful-merger-apache-airflow-delta-lake/assets/clear_tasks.png 
-------------------------------------------------------------------------------- /chapter-04/02-updates/02-stateful-merger-apache-airflow-delta-lake/assets/dag_tasks.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bartosz25/data-engineering-design-patterns-book/be54bbc3d52b96b592604d1b6b5d087a8ee9d6e1/chapter-04/02-updates/02-stateful-merger-apache-airflow-delta-lake/assets/dag_tasks.png -------------------------------------------------------------------------------- /chapter-04/02-updates/02-stateful-merger-apache-airflow-delta-lake/dags/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bartosz25/data-engineering-design-patterns-book/be54bbc3d52b96b592604d1b6b5d087a8ee9d6e1/chapter-04/02-updates/02-stateful-merger-apache-airflow-delta-lake/dags/__init__.py -------------------------------------------------------------------------------- /chapter-04/02-updates/02-stateful-merger-apache-airflow-delta-lake/dags_functions/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bartosz25/data-engineering-design-patterns-book/be54bbc3d52b96b592604d1b6b5d087a8ee9d6e1/chapter-04/02-updates/02-stateful-merger-apache-airflow-delta-lake/dags_functions/__init__.py -------------------------------------------------------------------------------- /chapter-04/02-updates/02-stateful-merger-apache-airflow-delta-lake/requirements.txt: -------------------------------------------------------------------------------- 1 | apache-airflow==2.9.0 2 | apache-airflow-providers-postgres==5.12.0 3 | virtualenv==20.26.6 -------------------------------------------------------------------------------- /chapter-04/02-updates/02-stateful-merger-apache-airflow-delta-lake/start.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env 
bash 2 | airflow db reset 3 | airflow db migrate 4 | airflow users create --username "dedp" --role "Admin" --password "dedp" --email "empty" --firstname "admin" --lastname "admin" 5 | export AIRFLOW__CORE__DAGS_FOLDER=./dags 6 | export AIRFLOW__CORE__LOAD_EXAMPLES=false 7 | export AIRFLOW__WEBSERVER__WORKERS=2 8 | airflow webserver & airflow scheduler -------------------------------------------------------------------------------- /chapter-04/02-updates/02-stateful-merger-apache-airflow-postgresql/assets/after_run_1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bartosz25/data-engineering-design-patterns-book/be54bbc3d52b96b592604d1b6b5d087a8ee9d6e1/chapter-04/02-updates/02-stateful-merger-apache-airflow-postgresql/assets/after_run_1.png -------------------------------------------------------------------------------- /chapter-04/02-updates/02-stateful-merger-apache-airflow-postgresql/assets/after_run_2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bartosz25/data-engineering-design-patterns-book/be54bbc3d52b96b592604d1b6b5d087a8ee9d6e1/chapter-04/02-updates/02-stateful-merger-apache-airflow-postgresql/assets/after_run_2.png -------------------------------------------------------------------------------- /chapter-04/02-updates/02-stateful-merger-apache-airflow-postgresql/assets/clear_retry.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bartosz25/data-engineering-design-patterns-book/be54bbc3d52b96b592604d1b6b5d087a8ee9d6e1/chapter-04/02-updates/02-stateful-merger-apache-airflow-postgresql/assets/clear_retry.png -------------------------------------------------------------------------------- /chapter-04/02-updates/02-stateful-merger-apache-airflow-postgresql/dags/__init__.py: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/bartosz25/data-engineering-design-patterns-book/be54bbc3d52b96b592604d1b6b5d087a8ee9d6e1/chapter-04/02-updates/02-stateful-merger-apache-airflow-postgresql/dags/__init__.py -------------------------------------------------------------------------------- /chapter-04/02-updates/02-stateful-merger-apache-airflow-postgresql/requirements.txt: -------------------------------------------------------------------------------- 1 | apache-airflow==2.9.0 2 | apache-airflow-providers-postgres==5.14.0 -------------------------------------------------------------------------------- /chapter-04/02-updates/02-stateful-merger-apache-airflow-postgresql/sql/clean_table_before_restore.sql: -------------------------------------------------------------------------------- 1 | DELETE FROM dedp.devices_history WHERE execution_time >= '{{ ts }}' -------------------------------------------------------------------------------- /chapter-04/02-updates/02-stateful-merger-apache-airflow-postgresql/sql/define_merge_mode.sql: -------------------------------------------------------------------------------- 1 | SELECT 2 | CASE 3 | WHEN COUNT(*) > 0 THEN true 4 | ELSE false 5 | END 6 | FROM dedp.devices_history WHERE execution_time > '{{ ts }}' -------------------------------------------------------------------------------- /chapter-04/02-updates/02-stateful-merger-apache-airflow-postgresql/sql/restore_table.sql: -------------------------------------------------------------------------------- 1 | TRUNCATE TABLE dedp.devices; 2 | 3 | INSERT INTO dedp.devices (id, brand_name, full_name, processor_brand) 4 | SELECT id, brand_name, full_name, processor_brand FROM ( 5 | SELECT 6 | id, brand_name, full_name, processor_brand, 7 | ROW_NUMBER() OVER (PARTITION BY id ORDER BY execution_time DESC) AS position 8 | FROM dedp.devices_history 9 | ) AS to_load 10 | WHERE to_load.position = 1; 
-------------------------------------------------------------------------------- /chapter-04/03-database/01-keyed-idempotency-airflow-files/assets/clear_dag.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bartosz25/data-engineering-design-patterns-book/be54bbc3d52b96b592604d1b6b5d087a8ee9d6e1/chapter-04/03-database/01-keyed-idempotency-airflow-files/assets/clear_dag.png -------------------------------------------------------------------------------- /chapter-04/03-database/01-keyed-idempotency-airflow-files/assets/dag_tasks.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bartosz25/data-engineering-design-patterns-book/be54bbc3d52b96b592604d1b6b5d087a8ee9d6e1/chapter-04/03-database/01-keyed-idempotency-airflow-files/assets/dag_tasks.png -------------------------------------------------------------------------------- /chapter-04/03-database/01-keyed-idempotency-airflow-files/dags/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bartosz25/data-engineering-design-patterns-book/be54bbc3d52b96b592604d1b6b5d087a8ee9d6e1/chapter-04/03-database/01-keyed-idempotency-airflow-files/dags/__init__.py -------------------------------------------------------------------------------- /chapter-04/03-database/01-keyed-idempotency-airflow-files/requirements.txt: -------------------------------------------------------------------------------- 1 | apache-airflow==2.9.0 2 | apache-airflow-providers-postgres==5.12.0 3 | virtualenv==20.26.6 -------------------------------------------------------------------------------- /chapter-04/03-database/01-keyed-idempotency-airflow-files/start.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | airflow db reset 3 | airflow db migrate 4 | airflow users create --username 
"dedp" --role "Admin" --password "dedp" --email "empty" --firstname "admin" --lastname "admin" 5 | export AIRFLOW__CORE__DAGS_FOLDER=./dags 6 | export AIRFLOW__CORE__LOAD_EXAMPLES=false 7 | export AIRFLOW__WEBSERVER__WORKERS=2 8 | airflow webserver & airflow scheduler -------------------------------------------------------------------------------- /chapter-04/03-database/01-keyed-idempotency-spark-kafka-scylladb/docker/init.cql: -------------------------------------------------------------------------------- 1 | CREATE KEYSPACE dedp WITH REPLICATION = { 'class' : 'SimpleStrategy', 'replication_factor':1 }; 2 | 3 | USE dedp; 4 | 5 | CREATE TABLE sessions ( 6 | session_id BIGINT, 7 | user_id BIGINT, 8 | pages LIST<TEXT>, 9 | ingestion_time TIMESTAMP, 10 | PRIMARY KEY(session_id, user_id)); -------------------------------------------------------------------------------- /chapter-04/03-database/01-keyed-idempotency-spark-kafka-scylladb/requirements.txt: -------------------------------------------------------------------------------- 1 | pyspark==3.5.0 2 | pandas==2.0.1 3 | pyarrow==12.0.0 4 | scylla-driver==3.25.11 -------------------------------------------------------------------------------- /chapter-04/03-database/02-transactional-writer-airflow-postgresql/assets/clear_task_metadata.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bartosz25/data-engineering-design-patterns-book/be54bbc3d52b96b592604d1b6b5d087a8ee9d6e1/chapter-04/03-database/02-transactional-writer-airflow-postgresql/assets/clear_task_metadata.png -------------------------------------------------------------------------------- /chapter-04/03-database/02-transactional-writer-airflow-postgresql/assets/clear_tasks.png: --------------------------------------------------------------------------------
https://raw.githubusercontent.com/bartosz25/data-engineering-design-patterns-book/be54bbc3d52b96b592604d1b6b5d087a8ee9d6e1/chapter-04/03-database/02-transactional-writer-airflow-postgresql/assets/clear_tasks.png -------------------------------------------------------------------------------- /chapter-04/03-database/02-transactional-writer-airflow-postgresql/dags/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bartosz25/data-engineering-design-patterns-book/be54bbc3d52b96b592604d1b6b5d087a8ee9d6e1/chapter-04/03-database/02-transactional-writer-airflow-postgresql/dags/__init__.py -------------------------------------------------------------------------------- /chapter-04/03-database/02-transactional-writer-airflow-postgresql/docker/init.sql: -------------------------------------------------------------------------------- 1 | CREATE SCHEMA dedp; 2 | 3 | CREATE TABLE dedp.devices ( 4 | type VARCHAR(6) NOT NULL, 5 | full_name TEXT NOT NULL, 6 | version VARCHAR(25) NOT NULL, 7 | PRIMARY KEY(type, version) 8 | ); -------------------------------------------------------------------------------- /chapter-04/03-database/02-transactional-writer-airflow-postgresql/requirements.txt: -------------------------------------------------------------------------------- 1 | apache-airflow==2.7.3 2 | apache-airflow-providers-postgres==5.8.0 -------------------------------------------------------------------------------- /chapter-04/03-database/02-transactional-writer-flink-kafka/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bartosz25/data-engineering-design-patterns-book/be54bbc3d52b96b592604d1b6b5d087a8ee9d6e1/chapter-04/03-database/02-transactional-writer-flink-kafka/__init__.py -------------------------------------------------------------------------------- 
/chapter-04/03-database/02-transactional-writer-flink-kafka/flink-connector-base-1.17.0.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bartosz25/data-engineering-design-patterns-book/be54bbc3d52b96b592604d1b6b5d087a8ee9d6e1/chapter-04/03-database/02-transactional-writer-flink-kafka/flink-connector-base-1.17.0.jar -------------------------------------------------------------------------------- /chapter-04/03-database/02-transactional-writer-flink-kafka/flink-connector-kafka-1.17.0.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bartosz25/data-engineering-design-patterns-book/be54bbc3d52b96b592604d1b6b5d087a8ee9d6e1/chapter-04/03-database/02-transactional-writer-flink-kafka/flink-connector-kafka-1.17.0.jar -------------------------------------------------------------------------------- /chapter-04/03-database/02-transactional-writer-flink-kafka/kafka-clients-3.2.3.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bartosz25/data-engineering-design-patterns-book/be54bbc3d52b96b592604d1b6b5d087a8ee9d6e1/chapter-04/03-database/02-transactional-writer-flink-kafka/kafka-clients-3.2.3.jar -------------------------------------------------------------------------------- /chapter-04/03-database/02-transactional-writer-flink-kafka/requirements.txt: -------------------------------------------------------------------------------- 1 | apache-flink==1.17.0 2 | -------------------------------------------------------------------------------- /chapter-04/04-immutable-dataset/01-proxy-postgresql-view/assets/clear_task_metadata.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/bartosz25/data-engineering-design-patterns-book/be54bbc3d52b96b592604d1b6b5d087a8ee9d6e1/chapter-04/04-immutable-dataset/01-proxy-postgresql-view/assets/clear_task_metadata.png -------------------------------------------------------------------------------- /chapter-04/04-immutable-dataset/01-proxy-postgresql-view/assets/clear_tasks.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bartosz25/data-engineering-design-patterns-book/be54bbc3d52b96b592604d1b6b5d087a8ee9d6e1/chapter-04/04-immutable-dataset/01-proxy-postgresql-view/assets/clear_tasks.png -------------------------------------------------------------------------------- /chapter-04/04-immutable-dataset/01-proxy-postgresql-view/dags/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bartosz25/data-engineering-design-patterns-book/be54bbc3d52b96b592604d1b6b5d087a8ee9d6e1/chapter-04/04-immutable-dataset/01-proxy-postgresql-view/dags/__init__.py -------------------------------------------------------------------------------- /chapter-04/04-immutable-dataset/01-proxy-postgresql-view/docker/generation_configuration_json.yaml: -------------------------------------------------------------------------------- 1 | dataset: 2 | rows: 50 3 | composition_percentage: 4 | duplicates: 0 5 | missing_fields: 0 6 | unprocessable_rows: 0 7 | data_blocker: 8 | type: 'no' 9 | entity: 10 | type: device 11 | generator: 12 | type: one-shot 13 | writer: 14 | type: csv 15 | configuration: 16 | output_path: '/home/data_generator_user/data_generator_output/' 17 | clean_path: true 18 | -------------------------------------------------------------------------------- /chapter-04/04-immutable-dataset/01-proxy-postgresql-view/docker/init.sql: -------------------------------------------------------------------------------- 1 | CREATE SCHEMA dedp; 
-------------------------------------------------------------------------------- /chapter-04/04-immutable-dataset/01-proxy-postgresql-view/plugins/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bartosz25/data-engineering-design-patterns-book/be54bbc3d52b96b592604d1b6b5d087a8ee9d6e1/chapter-04/04-immutable-dataset/01-proxy-postgresql-view/plugins/__init__.py -------------------------------------------------------------------------------- /chapter-04/04-immutable-dataset/01-proxy-postgresql-view/plugins/operators/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bartosz25/data-engineering-design-patterns-book/be54bbc3d52b96b592604d1b6b5d087a8ee9d6e1/chapter-04/04-immutable-dataset/01-proxy-postgresql-view/plugins/operators/__init__.py -------------------------------------------------------------------------------- /chapter-04/04-immutable-dataset/01-proxy-postgresql-view/requirements.txt: -------------------------------------------------------------------------------- 1 | apache-airflow==2.7.3 2 | apache-airflow-providers-postgres==5.8.0 -------------------------------------------------------------------------------- /chapter-04/04-immutable-dataset/01-proxy-postgresql-view/sql/load_visits_to_weekly_table.sql: -------------------------------------------------------------------------------- 1 | {% set devices_internal_table = get_devices_table_name() %} 2 | 3 | CREATE TABLE {{ devices_internal_table }} ( 4 | type VARCHAR(10) NOT NULL, 5 | full_name TEXT NOT NULL, 6 | version VARCHAR(25) NOT NULL, 7 | PRIMARY KEY(type, full_name, version) 8 | ); 9 | 10 | COPY {{ devices_internal_table }} FROM '{{ get_input_csv_to_load() }}' CSV DELIMITER ';' HEADER; -------------------------------------------------------------------------------- /chapter-04/04-immutable-dataset/01-proxy-postgresql-view/sql/refresh_view.sql: 
-------------------------------------------------------------------------------- 1 | {% set devices_internal_table = get_devices_table_name() %} 2 | 3 | CREATE OR REPLACE VIEW dedp.devices AS SELECT * FROM {{ devices_internal_table }}; -------------------------------------------------------------------------------- /chapter-05/01-data-enrichment/01-static-joiner-api/requirements.txt: -------------------------------------------------------------------------------- 1 | pyspark==3.5.0 2 | pandas==2.0.2 3 | pyarrow==14.0.1 4 | confluent-kafka==2.0.2 5 | flask==2.2.2 6 | Werkzeug==2.3.7 7 | requests==2.26.0 -------------------------------------------------------------------------------- /chapter-05/01-data-enrichment/01-static-joiner-data-in-motion/config.py: -------------------------------------------------------------------------------- 1 | def get_base_dir() -> str: 2 | return '/tmp/dedp/ch05/01-data-enrichment/01-static-joiner-data-in-motion' 3 | 4 | 5 | def get_devices_table_dir() -> str: 6 | return f'{get_base_dir()}/devices' -------------------------------------------------------------------------------- /chapter-05/01-data-enrichment/01-static-joiner-data-in-motion/requirements.txt: -------------------------------------------------------------------------------- 1 | delta-spark==3.0.0 2 | pyspark==3.5.0 3 | -------------------------------------------------------------------------------- /chapter-05/01-data-enrichment/02-dynamic-joiner-flink/assets/flink_watermark_local_time.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bartosz25/data-engineering-design-patterns-book/be54bbc3d52b96b592604d1b6b5d087a8ee9d6e1/chapter-05/01-data-enrichment/02-dynamic-joiner-flink/assets/flink_watermark_local_time.png -------------------------------------------------------------------------------- /chapter-05/01-data-enrichment/02-dynamic-joiner-spark/requirements.txt: 
-------------------------------------------------------------------------------- 1 | pyspark==3.5.0 2 | pandas==2.0.2 3 | pyarrow==14.0.1 4 | 5 | -------------------------------------------------------------------------------- /chapter-05/02-data-decoration/01-wrapper-spark/requirements.txt: -------------------------------------------------------------------------------- 1 | pyspark==3.5.0 2 | -------------------------------------------------------------------------------- /chapter-05/02-data-decoration/01-wrapper-sql/config.py: -------------------------------------------------------------------------------- 1 | class DemoConfiguration: 2 | BASE_DIR: str = '/tmp/dedp/ch05/02-decorator/01-wrapper-sql' 3 | INPUT_PATH: str = f'{BASE_DIR}/input' 4 | TABLE_FLATTENED: str = f'{BASE_DIR}/visits-flattened-table' 5 | TABLE_DECORATED_STRUCT: str = f'{BASE_DIR}/visits-decorated-struct-table' 6 | TABLE_RAW_STRUCT: str = f'{BASE_DIR}/visits-raw-struct-table' 7 | -------------------------------------------------------------------------------- /chapter-05/02-data-decoration/01-wrapper-sql/requirements.txt: -------------------------------------------------------------------------------- 1 | delta-spark==3.0.0 2 | pyspark==3.5.0 -------------------------------------------------------------------------------- /chapter-05/02-data-decoration/02-metadata-airflow-postgresql/dags/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bartosz25/data-engineering-design-patterns-book/be54bbc3d52b96b592604d1b6b5d087a8ee9d6e1/chapter-05/02-data-decoration/02-metadata-airflow-postgresql/dags/__init__.py -------------------------------------------------------------------------------- /chapter-05/02-data-decoration/02-metadata-airflow-postgresql/dags/macros.py: -------------------------------------------------------------------------------- 1 | import pendulum 2 | 3 | from config import get_data_location_base_dir 4 | 5 | 6 
| def get_weekly_table_name(execution_date: pendulum.date) -> str: 7 | ds_nodash = f'week_{execution_date.week_of_year}_{execution_date.year}' 8 | return f'dedp.visits_weekly_{ds_nodash}' 9 | 10 | 11 | def get_input_csv_to_load(ds: str) -> str: 12 | return f'{get_data_location_base_dir(True)}/date={ds}/dataset.csv' 13 | 14 | -------------------------------------------------------------------------------- /chapter-05/02-data-decoration/02-metadata-airflow-postgresql/docker/init.sql: -------------------------------------------------------------------------------- 1 | CREATE SCHEMA dedp; 2 | 3 | CREATE TABLE dedp.visits_context ( 4 | execution_date_time TIMESTAMPTZ NOT NULL, 5 | loading_time TIMESTAMPTZ NOT NULL, 6 | code_version VARCHAR(15) NOT NULL, 7 | loading_attempt SMALLINT NOT NULL, 8 | PRIMARY KEY (execution_date_time) 9 | ) -------------------------------------------------------------------------------- /chapter-05/02-data-decoration/02-metadata-airflow-postgresql/plugins/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bartosz25/data-engineering-design-patterns-book/be54bbc3d52b96b592604d1b6b5d087a8ee9d6e1/chapter-05/02-data-decoration/02-metadata-airflow-postgresql/plugins/__init__.py -------------------------------------------------------------------------------- /chapter-05/02-data-decoration/02-metadata-airflow-postgresql/plugins/operators/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bartosz25/data-engineering-design-patterns-book/be54bbc3d52b96b592604d1b6b5d087a8ee9d6e1/chapter-05/02-data-decoration/02-metadata-airflow-postgresql/plugins/operators/__init__.py -------------------------------------------------------------------------------- /chapter-05/02-data-decoration/02-metadata-airflow-postgresql/requirements.txt: 
-------------------------------------------------------------------------------- 1 | apache-airflow==2.7.3 2 | apache-airflow-providers-postgres==5.8.0 -------------------------------------------------------------------------------- /chapter-05/02-data-decoration/02-metadata-airflow-postgresql/sql/recreate_view.sql: -------------------------------------------------------------------------------- 1 | SELECT table_name FROM information_schema.tables WHERE table_catalog = 'dedp' 2 | AND table_schema = 'dedp' 3 | AND table_type = 'BASE TABLE' 4 | AND table_name LIKE 'visits_weekly_%' -------------------------------------------------------------------------------- /chapter-05/02-data-decoration/02-metadata-spark-kafka/config.py: -------------------------------------------------------------------------------- 1 | def get_base_dir() -> str: 2 | return '/tmp/dedp/ch05/decorator/visits/' 3 | -------------------------------------------------------------------------------- /chapter-05/02-data-decoration/02-metadata-spark-kafka/requirements.txt: -------------------------------------------------------------------------------- 1 | pyspark==3.5.0 2 | -------------------------------------------------------------------------------- /chapter-05/03-data-combination/01-distributed-json-postgresql/dataset/init.sql: -------------------------------------------------------------------------------- 1 | CREATE SCHEMA dedp; 2 | 3 | CREATE TABLE dedp.devices ( 4 | type VARCHAR(10) NOT NULL, 5 | full_name TEXT NOT NULL, 6 | version VARCHAR(25) NOT NULL, 7 | PRIMARY KEY(type, full_name, version) 8 | ); -------------------------------------------------------------------------------- /chapter-05/03-data-combination/01-distributed-json-postgresql/requirements.txt: -------------------------------------------------------------------------------- 1 | pyspark==3.5.0 -------------------------------------------------------------------------------- 
/chapter-05/03-data-combination/02-local-buckets-spark/config.py: -------------------------------------------------------------------------------- 1 | def get_base_dir() -> str: 2 | return '/tmp/dedp/ch05/03-data-combination/02-local-buckets-spark' 3 | -------------------------------------------------------------------------------- /chapter-05/03-data-combination/02-local-buckets-spark/requirements.txt: -------------------------------------------------------------------------------- 1 | pyspark==3.5.0 -------------------------------------------------------------------------------- /chapter-05/03-data-combination/02-local-kafka-spark/requirements.txt: -------------------------------------------------------------------------------- 1 | pyspark==3.5.0 2 | pandas==2.0.2 3 | pyarrow==14.0.1 4 | kafka-python==1.4.4 -------------------------------------------------------------------------------- /chapter-05/04-sessionization/01-incremental-airflow-windows-postgresql/dags/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bartosz25/data-engineering-design-patterns-book/be54bbc3d52b96b592604d1b6b5d087a8ee9d6e1/chapter-05/04-sessionization/01-incremental-airflow-windows-postgresql/dags/__init__.py -------------------------------------------------------------------------------- /chapter-05/04-sessionization/01-incremental-airflow-windows-postgresql/requirements.txt: -------------------------------------------------------------------------------- 1 | apache-airflow==2.7.3 2 | apache-airflow-providers-postgres==5.8.0 -------------------------------------------------------------------------------- /chapter-05/04-sessionization/01-incremental-airflow-windows-postgresql/sql/clean_previous_run_generated_sessions.sql: -------------------------------------------------------------------------------- 1 | DELETE FROM dedp.sessions WHERE execution_time_id >= '{{ ds }}'; 
-------------------------------------------------------------------------------- /chapter-05/04-sessionization/01-incremental-airflow-windows-postgresql/sql/clean_previous_run_pending_sessions.sql: -------------------------------------------------------------------------------- 1 | DELETE FROM dedp.pending_sessions WHERE execution_time_id >= '{{ ds }}'; -------------------------------------------------------------------------------- /chapter-05/04-sessionization/02-stateful-kafka-flink/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bartosz25/data-engineering-design-patterns-book/be54bbc3d52b96b592604d1b6b5d087a8ee9d6e1/chapter-05/04-sessionization/02-stateful-kafka-flink/__init__.py -------------------------------------------------------------------------------- /chapter-05/04-sessionization/02-stateful-kafka-flink/flink-connector-base-1.18.0.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bartosz25/data-engineering-design-patterns-book/be54bbc3d52b96b592604d1b6b5d087a8ee9d6e1/chapter-05/04-sessionization/02-stateful-kafka-flink/flink-connector-base-1.18.0.jar -------------------------------------------------------------------------------- /chapter-05/04-sessionization/02-stateful-kafka-flink/flink-connector-kafka-3.1.0-1.18.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bartosz25/data-engineering-design-patterns-book/be54bbc3d52b96b592604d1b6b5d087a8ee9d6e1/chapter-05/04-sessionization/02-stateful-kafka-flink/flink-connector-kafka-3.1.0-1.18.jar -------------------------------------------------------------------------------- /chapter-05/04-sessionization/02-stateful-kafka-flink/kafka-clients-3.7.0.jar: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/bartosz25/data-engineering-design-patterns-book/be54bbc3d52b96b592604d1b6b5d087a8ee9d6e1/chapter-05/04-sessionization/02-stateful-kafka-flink/kafka-clients-3.7.0.jar -------------------------------------------------------------------------------- /chapter-05/04-sessionization/02-stateful-kafka-flink/requirements.txt: -------------------------------------------------------------------------------- 1 | apache-flink==1.18.0 2 | -------------------------------------------------------------------------------- /chapter-05/04-sessionization/02-stateful-kafka-spark/requirements.txt: -------------------------------------------------------------------------------- 1 | pyspark==3.5.0 2 | pandas==2.0.2 3 | pyarrow==14.0.1 4 | 5 | -------------------------------------------------------------------------------- /chapter-05/05-data-ordering/01-bin-packer-kinesis-spark/requirements.txt: -------------------------------------------------------------------------------- 1 | pyspark==3.5.0 2 | pandas==2.0.2 3 | pyarrow==14.0.1 4 | localstack==3.0.2 5 | awscli-local==0.21.1 6 | awscli==1.31.8 7 | boto3==1.33.8 -------------------------------------------------------------------------------- /chapter-05/05-data-ordering/02-fifo-kafka-spark/config.py: -------------------------------------------------------------------------------- 1 | def get_base_dir() -> str: 2 | return '/tmp/dedp/ch05/05-data-ordering/02-fifo-kafka-spark' 3 | -------------------------------------------------------------------------------- /chapter-05/05-data-ordering/02-fifo-kafka-spark/requirements.txt: -------------------------------------------------------------------------------- 1 | pyspark==3.5.0 2 | pandas==2.0.2 3 | pyarrow==14.0.1 4 | confluent-kafka==2.0.2 -------------------------------------------------------------------------------- /chapter-06/01-sequence/01-local-sequencer-airflow/assets/devices_loader_isolated.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/bartosz25/data-engineering-design-patterns-book/be54bbc3d52b96b592604d1b6b5d087a8ee9d6e1/chapter-06/01-sequence/01-local-sequencer-airflow/assets/devices_loader_isolated.png -------------------------------------------------------------------------------- /chapter-06/01-sequence/01-local-sequencer-airflow/assets/devices_loader_not_isolated.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bartosz25/data-engineering-design-patterns-book/be54bbc3d52b96b592604d1b6b5d087a8ee9d6e1/chapter-06/01-sequence/01-local-sequencer-airflow/assets/devices_loader_not_isolated.png -------------------------------------------------------------------------------- /chapter-06/01-sequence/01-local-sequencer-airflow/dags/macros.py: -------------------------------------------------------------------------------- 1 | def get_table_name(ds_nodash: str) -> str: 2 | return 'devices_'+ds_nodash 3 | 4 | 5 | def get_input_csv_to_load_for_host() -> str: 6 | return '/tmp/dedp/ch06/01-sequence/01-local-sequencer-airflow/input/dataset.csv' 7 | -------------------------------------------------------------------------------- /chapter-06/01-sequence/01-local-sequencer-airflow/docker/dataset/generation_configuration.yaml: -------------------------------------------------------------------------------- 1 | dataset: 2 | rows: 50 3 | composition_percentage: 4 | duplicates: 0 5 | missing_fields: 0 6 | unprocessable_rows: 0 7 | data_blocker: 8 | type: 'no' 9 | entity: 10 | type: device 11 | generator: 12 | type: one-shot 13 | writer: 14 | type: csv 15 | configuration: 16 | output_path: '/home/data_generator_user/data_generator_output/input' 17 | clean_path: true 18 | -------------------------------------------------------------------------------- /chapter-06/01-sequence/01-local-sequencer-airflow/docker/postgresql/init.sql: 
-------------------------------------------------------------------------------- 1 | CREATE SCHEMA dedp_test; 2 | -------------------------------------------------------------------------------- /chapter-06/01-sequence/01-local-sequencer-airflow/requirements.txt: -------------------------------------------------------------------------------- 1 | apache-airflow==2.7.3 2 | apache-airflow-providers-postgres==5.8.0 3 | pendulum==2.1.2 -------------------------------------------------------------------------------- /chapter-06/01-sequence/01-local-sequencer-airflow/sql/expose_new_table.sql: -------------------------------------------------------------------------------- 1 | {% set table_name = get_table_name(ds_nodash) %} 2 | CREATE OR REPLACE VIEW devices AS SELECT * FROM {{ table_name }} -------------------------------------------------------------------------------- /chapter-06/01-sequence/01-local-sequencer-airflow/sql/load_file_to_device_table.sql: -------------------------------------------------------------------------------- 1 | {% set table_name = get_table_name(ds_nodash) %} 2 | 3 | DROP VIEW IF EXISTS devices; 4 | DROP TABLE IF EXISTS {{ table_name }}; 5 | CREATE TABLE {{ table_name }} ( 6 | type VARCHAR(15) NOT NULL, 7 | full_name VARCHAR(50) NOT NULL, 8 | version VARCHAR(40) NOT NULL, 9 | PRIMARY KEY(full_name, version) 10 | ); 11 | 12 | COPY {{ table_name }} FROM '/data_to_load/dataset.csv' CSV DELIMITER ';' HEADER; -------------------------------------------------------------------------------- /chapter-06/01-sequence/01-local-sequencer-spark/config.py: -------------------------------------------------------------------------------- 1 | class DemoConfiguration: 2 | BASE_DIR: str = '/tmp/dedp/ch06/01-sequence/01-local-sequencer-spark' 3 | INPUT_PATH: str = f'{BASE_DIR}/input' 4 | DEVICES_TABLE: str = f'{BASE_DIR}/devices-table' 5 | -------------------------------------------------------------------------------- 
/chapter-06/01-sequence/01-local-sequencer-spark/dataset/generation_configuration.yaml: -------------------------------------------------------------------------------- 1 | dataset: 2 | rows: 50 3 | composition_percentage: 4 | duplicates: 0 5 | missing_fields: 0 6 | unprocessable_rows: 0 7 | data_blocker: 8 | type: 'no' 9 | entity: 10 | type: device 11 | generator: 12 | type: one-shot 13 | writer: 14 | type: json 15 | configuration: 16 | output_path: '/home/data_generator_user/data_generator_output/input' 17 | clean_path: true 18 | -------------------------------------------------------------------------------- /chapter-06/01-sequence/01-local-sequencer-spark/requirements.txt: -------------------------------------------------------------------------------- 1 | delta-spark==3.0.0 2 | pyspark==3.5.0 -------------------------------------------------------------------------------- /chapter-06/01-sequence/02-isolated-sequencer-dataset-dependency/dags/macros.py: -------------------------------------------------------------------------------- 1 | def get_internal_base_location_for_devices_file() -> str: 2 | return '/tmp/dedp/ch06/01-sequence/02-isolated-sequencer-dataset-dependency/input-internal' 3 | -------------------------------------------------------------------------------- /chapter-06/01-sequence/02-isolated-sequencer-dataset-dependency/docker/dataset/generation_configuration.yaml: -------------------------------------------------------------------------------- 1 | dataset: 2 | rows: 50 3 | composition_percentage: 4 | duplicates: 0 5 | missing_fields: 0 6 | unprocessable_rows: 0 7 | data_blocker: 8 | type: 'no' 9 | entity: 10 | type: device 11 | generator: 12 | type: one-shot 13 | writer: 14 | type: csv 15 | configuration: 16 | output_path: '/home/data_generator_user/data_generator_output/input' 17 | clean_path: true 18 | -------------------------------------------------------------------------------- 
/chapter-06/01-sequence/02-isolated-sequencer-dataset-dependency/requirements.txt: -------------------------------------------------------------------------------- 1 | apache-airflow==2.7.3 2 | apache-airflow-providers-postgres==5.8.0 3 | pendulum==2.1.2 -------------------------------------------------------------------------------- /chapter-06/01-sequence/02-isolated-sequencer-external-trigger/assets/backfill_recursive.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bartosz25/data-engineering-design-patterns-book/be54bbc3d52b96b592604d1b6b5d087a8ee9d6e1/chapter-06/01-sequence/02-isolated-sequencer-external-trigger/assets/backfill_recursive.png -------------------------------------------------------------------------------- /chapter-06/01-sequence/02-isolated-sequencer-external-trigger/assets/clean_task_recursive.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bartosz25/data-engineering-design-patterns-book/be54bbc3d52b96b592604d1b6b5d087a8ee9d6e1/chapter-06/01-sequence/02-isolated-sequencer-external-trigger/assets/clean_task_recursive.png -------------------------------------------------------------------------------- /chapter-06/01-sequence/02-isolated-sequencer-external-trigger/dags/macros.py: -------------------------------------------------------------------------------- 1 | def get_table_name(ds_nodash: str) -> str: 2 | return 'devices_' + ds_nodash 3 | 4 | 5 | def get_internal_storage_location_for_devices_file() -> str: 6 | return '/tmp/dedp/ch06/01-sequence/02-isolated-sequencer-external-trigger/input-internal/dataset.csv' 7 | -------------------------------------------------------------------------------- /chapter-06/01-sequence/02-isolated-sequencer-external-trigger/docker/dataset/generation_configuration.yaml: -------------------------------------------------------------------------------- 1 | dataset: 2 | rows: 
50 3 | composition_percentage: 4 | duplicates: 0 5 | missing_fields: 0 6 | unprocessable_rows: 0 7 | data_blocker: 8 | type: 'no' 9 | entity: 10 | type: device 11 | generator: 12 | type: one-shot 13 | writer: 14 | type: csv 15 | configuration: 16 | output_path: '/home/data_generator_user/data_generator_output/input' 17 | clean_path: true 18 | -------------------------------------------------------------------------------- /chapter-06/01-sequence/02-isolated-sequencer-external-trigger/requirements.txt: -------------------------------------------------------------------------------- 1 | apache-airflow==2.7.3 2 | apache-airflow-providers-postgres==5.8.0 3 | pendulum==2.1.2 -------------------------------------------------------------------------------- /chapter-06/02-fan-in/01-aligned-fan-in-airflow-aggregates/dags/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bartosz25/data-engineering-design-patterns-book/be54bbc3d52b96b592604d1b6b5d087a8ee9d6e1/chapter-06/02-fan-in/01-aligned-fan-in-airflow-aggregates/dags/__init__.py -------------------------------------------------------------------------------- /chapter-06/02-fan-in/01-aligned-fan-in-airflow-aggregates/docker/init.sql: -------------------------------------------------------------------------------- 1 | CREATE SCHEMA dedp; 2 | 3 | CREATE TABLE dedp.visits_raw ( 4 | execution_time_id CHAR(13) NOT NULL, 5 | user_id TEXT NOT NULL, 6 | page TEXT NOT NULL, 7 | PRIMARY KEY(user_id, page, execution_time_id) 8 | ); 9 | 10 | CREATE TABLE dedp.visits_cube ( 11 | current_execution_time_id CHAR(10) NOT NULL, 12 | page TEXT, 13 | user_id TEXT, 14 | visits_number INT NOT NULL 15 | ); -------------------------------------------------------------------------------- /chapter-06/02-fan-in/01-aligned-fan-in-airflow-aggregates/requirements.txt: -------------------------------------------------------------------------------- 1 | apache-airflow==2.7.3 2 
| apache-airflow-providers-postgres==5.8.0 -------------------------------------------------------------------------------- /chapter-06/02-fan-in/01-aligned-fan-in-airflow-aggregates/sql/clear_context.sql: -------------------------------------------------------------------------------- 1 | DELETE FROM dedp.visits_cube WHERE current_execution_time_id = '{{ ds }}'; 2 | -------------------------------------------------------------------------------- /chapter-06/02-fan-in/01-aligned-fan-in-airflow-aggregates/sql/generate_visits_cube.sql: -------------------------------------------------------------------------------- 1 | INSERT INTO dedp.visits_cube (current_execution_time_id, page, user_id, visits_number) 2 | SELECT '{{ ds }}', page, user_id, COUNT(*) FROM dedp.visits_raw GROUP BY CUBE(user_id, page); -------------------------------------------------------------------------------- /chapter-06/02-fan-in/01-aligned-fan-in-spark-union/config.py: -------------------------------------------------------------------------------- 1 | class DemoConfiguration: 2 | BASE_DIR: str = '/tmp/dedp/ch06/02-fan-int/01-aligned-fan-in-spark/' 3 | INPUT_PATH_1: str = f'{BASE_DIR}/input-1/input' 4 | INPUT_PATH_2: str = f'{BASE_DIR}/input-2/input' 5 | DEVICES_TABLE: str = f'{BASE_DIR}/devices-table' 6 | -------------------------------------------------------------------------------- /chapter-06/02-fan-in/01-aligned-fan-in-spark-union/dataset/generation_configuration.yaml: -------------------------------------------------------------------------------- 1 | dataset: 2 | rows: 50 3 | composition_percentage: 4 | duplicates: 0 5 | missing_fields: 0 6 | unprocessable_rows: 0 7 | data_blocker: 8 | type: 'no' 9 | entity: 10 | type: device 11 | generator: 12 | type: one-shot 13 | writer: 14 | type: json 15 | configuration: 16 | output_path: '/home/data_generator_user/data_generator_output/input' 17 | clean_path: true 18 | 
-------------------------------------------------------------------------------- /chapter-06/02-fan-in/01-aligned-fan-in-spark-union/requirements.txt: -------------------------------------------------------------------------------- 1 | delta-spark==3.0.0 2 | pyspark==3.5.0 -------------------------------------------------------------------------------- /chapter-06/02-fan-in/02-unaligned-fan-in-airflow-aggregates-partial/dags/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bartosz25/data-engineering-design-patterns-book/be54bbc3d52b96b592604d1b6b5d087a8ee9d6e1/chapter-06/02-fan-in/02-unaligned-fan-in-airflow-aggregates-partial/dags/__init__.py -------------------------------------------------------------------------------- /chapter-06/02-fan-in/02-unaligned-fan-in-airflow-aggregates-partial/requirements.txt: -------------------------------------------------------------------------------- 1 | apache-airflow==2.7.3 2 | apache-airflow-providers-postgres==5.8.0 -------------------------------------------------------------------------------- /chapter-06/02-fan-in/02-unaligned-fan-in-airflow-aggregates-partial/sql/clear_context.sql: -------------------------------------------------------------------------------- 1 | DELETE FROM dedp.visits_cube WHERE current_execution_time_id = '{{ ds }}'; 2 | -------------------------------------------------------------------------------- /chapter-06/02-fan-in/02-unaligned-fan-in-aws-step-functions/lambda-partitions-detector/detect_partitions.py: -------------------------------------------------------------------------------- 1 | def lambda_handler(event, context): 2 | # let's keep it simple for the demo; IRL it could detect new partitions dynamically 3 | return list(range(0, 2)) 4 | -------------------------------------------------------------------------------- 
/chapter-06/02-fan-in/02-unaligned-fan-in-aws-step-functions/lambda-partitions-processor/process_partition.py: -------------------------------------------------------------------------------- 1 | def lambda_handler(event, context): 2 | print(f'event={event}') 3 | if event['PartitionNumber'] == 1: 4 | return False 5 | else: 6 | return True 7 | -------------------------------------------------------------------------------- /chapter-06/02-fan-in/02-unaligned-fan-in-aws-step-functions/lambda-table-creator/create_table_from_processed_partitions.py: -------------------------------------------------------------------------------- 1 | def lambda_handler(event, context): 2 | print(f'event={event}') 3 | if False in event['ProcessorResults']: 4 | print('Marking the job as partially valid') 5 | return True 6 | -------------------------------------------------------------------------------- /chapter-06/02-fan-in/02-unaligned-fan-in-aws-step-functions/requirements.txt: -------------------------------------------------------------------------------- 1 | localstack==3.0.2 2 | awscli-local==0.21.1 3 | awscli==1.31.8 4 | boto3==1.33.8 5 | -------------------------------------------------------------------------------- /chapter-06/03-fan-out/01-parallel-split-airflow-jobs/assets/parallel_split_graph.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bartosz25/data-engineering-design-patterns-book/be54bbc3d52b96b592604d1b6b5d087a8ee9d6e1/chapter-06/03-fan-out/01-parallel-split-airflow-jobs/assets/parallel_split_graph.png -------------------------------------------------------------------------------- /chapter-06/03-fan-out/01-parallel-split-airflow-jobs/dags/__init__.py: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/bartosz25/data-engineering-design-patterns-book/be54bbc3d52b96b592604d1b6b5d087a8ee9d6e1/chapter-06/03-fan-out/01-parallel-split-airflow-jobs/dags/__init__.py -------------------------------------------------------------------------------- /chapter-06/03-fan-out/01-parallel-split-airflow-jobs/requirements.txt: -------------------------------------------------------------------------------- 1 | apache-airflow==2.7.3 2 | apache-airflow-providers-postgres==5.8.0 -------------------------------------------------------------------------------- /chapter-06/03-fan-out/01-parallel-split-airflow-jobs/start.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | airflow db reset 3 | airflow db migrate 4 | airflow users create --username "dedp" --role "Admin" --password "dedp" --email "empty" --firstname "admin" --lastname "admin" 5 | export AIRFLOW__CORE__DAGS_FOLDER=./dags 6 | export AIRFLOW__CORE__LOAD_EXAMPLES=false 7 | export AIRFLOW__WEBSERVER__WORKERS=2 8 | airflow webserver & airflow scheduler -------------------------------------------------------------------------------- /chapter-06/03-fan-out/01-parallel-split-airflow-jobs/visits-loader-job/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM apache/spark:3.5.1 2 | 3 | COPY ./visits_loader.py /tmp -------------------------------------------------------------------------------- /chapter-06/03-fan-out/01-parallel-split-airflow-jobs/visits-loader-job/requirements.txt: -------------------------------------------------------------------------------- 1 | pyspark==3.5.0 2 | delta-spark==3.1.0 3 | -------------------------------------------------------------------------------- /chapter-06/03-fan-out/01-parallel-split-spark-foreachbatch/config.py: -------------------------------------------------------------------------------- 1 | class DemoConfiguration: 2 | BASE_DIR: str = 
'/tmp/dedp/ch06/03-fan-out/01-parallel-split-spark' 3 | INPUT_PATH: str = f'{BASE_DIR}/input' 4 | DEVICES_TABLE: str = f'{BASE_DIR}/devices-table' 5 | DEVICES_TABLE_ENRICHED: str = f'{BASE_DIR}/devices-table-enriched' 6 | -------------------------------------------------------------------------------- /chapter-06/03-fan-out/01-parallel-split-spark-foreachbatch/dataset/generation_configuration.yaml: -------------------------------------------------------------------------------- 1 | dataset: 2 | rows: 50 3 | composition_percentage: 4 | duplicates: 0 5 | missing_fields: 0 6 | unprocessable_rows: 0 7 | data_blocker: 8 | type: 'no' 9 | entity: 10 | type: device 11 | generator: 12 | type: one-shot 13 | writer: 14 | type: json 15 | configuration: 16 | output_path: '/home/data_generator_user/data_generator_output/input' 17 | clean_path: true 18 | -------------------------------------------------------------------------------- /chapter-06/03-fan-out/01-parallel-split-spark-foreachbatch/requirements.txt: -------------------------------------------------------------------------------- 1 | delta-spark==3.0.0 2 | pyspark==3.5.0 -------------------------------------------------------------------------------- /chapter-06/03-fan-out/02-exclusive-choice-airflow-migration/assets/exclusive_choice_migration.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bartosz25/data-engineering-design-patterns-book/be54bbc3d52b96b592604d1b6b5d087a8ee9d6e1/chapter-06/03-fan-out/02-exclusive-choice-airflow-migration/assets/exclusive_choice_migration.png -------------------------------------------------------------------------------- /chapter-06/03-fan-out/02-exclusive-choice-airflow-migration/assets/exclusive_choice_migration_colors.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/bartosz25/data-engineering-design-patterns-book/be54bbc3d52b96b592604d1b6b5d087a8ee9d6e1/chapter-06/03-fan-out/02-exclusive-choice-airflow-migration/assets/exclusive_choice_migration_colors.png -------------------------------------------------------------------------------- /chapter-06/03-fan-out/02-exclusive-choice-airflow-migration/dags/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bartosz25/data-engineering-design-patterns-book/be54bbc3d52b96b592604d1b6b5d087a8ee9d6e1/chapter-06/03-fan-out/02-exclusive-choice-airflow-migration/dags/__init__.py -------------------------------------------------------------------------------- /chapter-06/03-fan-out/02-exclusive-choice-airflow-migration/requirements.txt: -------------------------------------------------------------------------------- 1 | apache-airflow==2.7.3 2 | apache-airflow-providers-postgres==5.8.0 -------------------------------------------------------------------------------- /chapter-06/03-fan-out/02-exclusive-choice-airflow-migration/start.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | airflow db reset 3 | airflow db migrate 4 | airflow users create --username "dedp" --role "Admin" --password "dedp" --email "empty" --firstname "admin" --lastname "admin" 5 | export AIRFLOW__CORE__DAGS_FOLDER=./dags 6 | export AIRFLOW__CORE__LOAD_EXAMPLES=false 7 | export AIRFLOW__WEBSERVER__WORKERS=2 8 | airflow webserver & airflow scheduler -------------------------------------------------------------------------------- /chapter-06/03-fan-out/02-exclusive-choice-airflow-migration/visits-loader-job/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM apache/spark:3.5.1 2 | 3 | COPY ./visits_loader.py /tmp -------------------------------------------------------------------------------- 
/chapter-06/03-fan-out/02-exclusive-choice-airflow-migration/visits-loader-job/requirements.txt: -------------------------------------------------------------------------------- 1 | pyspark==3.5.0 2 | delta-spark==3.1.0 3 | -------------------------------------------------------------------------------- /chapter-06/03-fan-out/02-exclusive-choice-spark-dataset-criteria/config.py: -------------------------------------------------------------------------------- 1 | class DemoConfiguration: 2 | BASE_DIR: str = '/tmp/dedp/ch06/03-fan-out/02-exclusive-choice-spark/' 3 | INPUT_PATH: str = f'{BASE_DIR}/input' 4 | DEVICES_TABLE_LEGACY: str = f'{BASE_DIR}/output/devices-table' 5 | DEVICES_TABLE_SCHEMA_CHANGED: str = f'{BASE_DIR}/output/devices-table-schema-changed' 6 | -------------------------------------------------------------------------------- /chapter-06/03-fan-out/02-exclusive-choice-spark-dataset-criteria/dataset/generation_configuration.yaml: -------------------------------------------------------------------------------- 1 | dataset: 2 | rows: 50 3 | composition_percentage: 4 | duplicates: 0 5 | missing_fields: 0 6 | unprocessable_rows: 0 7 | data_blocker: 8 | type: 'no' 9 | entity: 10 | type: device 11 | generator: 12 | type: one-shot 13 | writer: 14 | type: json 15 | configuration: 16 | output_path: '/home/data_generator_user/data_generator_output/input' 17 | clean_path: true 18 | -------------------------------------------------------------------------------- /chapter-06/03-fan-out/02-exclusive-choice-spark-dataset-criteria/requirements.txt: -------------------------------------------------------------------------------- 1 | delta-spark==3.0.0 2 | pyspark==3.5.0 -------------------------------------------------------------------------------- /chapter-06/04-orchestration/01-singler-runner-airflow-visits-trends/dags/__init__.py: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/bartosz25/data-engineering-design-patterns-book/be54bbc3d52b96b592604d1b6b5d087a8ee9d6e1/chapter-06/04-orchestration/01-singler-runner-airflow-visits-trends/dags/__init__.py -------------------------------------------------------------------------------- /chapter-06/04-orchestration/01-singler-runner-airflow-visits-trends/requirements.txt: -------------------------------------------------------------------------------- 1 | apache-airflow==2.7.3 2 | apache-airflow-providers-postgres==5.8.0 -------------------------------------------------------------------------------- /chapter-06/04-orchestration/02-concurrent-runner-airflow-ingestion/assets/concurrent_runs.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bartosz25/data-engineering-design-patterns-book/be54bbc3d52b96b592604d1b6b5d087a8ee9d6e1/chapter-06/04-orchestration/02-concurrent-runner-airflow-ingestion/assets/concurrent_runs.png -------------------------------------------------------------------------------- /chapter-06/04-orchestration/02-concurrent-runner-airflow-ingestion/dataset/generation_configuration.yaml: -------------------------------------------------------------------------------- 1 | dataset: 2 | rows: 50 3 | composition_percentage: 4 | duplicates: 0 5 | missing_fields: 0 6 | unprocessable_rows: 0 7 | data_blocker: 8 | type: 'no' 9 | entity: 10 | type: device 11 | generator: 12 | type: one-shot 13 | writer: 14 | type: csv 15 | configuration: 16 | output_path: '/home/data_generator_user/data_generator_output/input' 17 | clean_path: true 18 | -------------------------------------------------------------------------------- /chapter-06/04-orchestration/02-concurrent-runner-airflow-ingestion/requirements.txt: -------------------------------------------------------------------------------- 1 | apache-airflow==2.7.3 2 | apache-airflow-providers-postgres==5.8.0 3 | pendulum==2.1.2 
-------------------------------------------------------------------------------- /chapter-07/01-personal-data-removal/01-vertical-partitioner-kafka-spark-delta/config.py: -------------------------------------------------------------------------------- 1 | def get_base_dir() -> str: 2 | return '/tmp/dedp/ch07/01-personal-data-removal/01-vertical-partitioner-kafka-spark-delta' 3 | 4 | 5 | def get_delta_users_table_dir() -> str: 6 | return '/tmp/dedp/ch07/01-personal-data-removal/01-vertical-partitioner-kafka-spark-delta/table/users' 7 | -------------------------------------------------------------------------------- /chapter-07/01-personal-data-removal/01-vertical-partitioner-kafka-spark-delta/requirements.txt: -------------------------------------------------------------------------------- 1 | delta-spark==3.0.0 2 | pyspark==3.5.0 -------------------------------------------------------------------------------- /chapter-07/01-personal-data-removal/02-in-place-overwriter-delta-lake/config.py: -------------------------------------------------------------------------------- 1 | def get_base_dir() -> str: 2 | return '/tmp/dedp/ch07/01-personal-data-removal/02-in-place-overwrite-delta-lake' 3 | 4 | 5 | def get_input_table_dir() -> str: 6 | return f'{get_base_dir()}/input' 7 | 8 | 9 | def get_delta_visits_table_dir() -> str: 10 | return f'{get_base_dir()}/table/visits' 11 | -------------------------------------------------------------------------------- /chapter-07/01-personal-data-removal/02-in-place-overwriter-delta-lake/requirements.txt: -------------------------------------------------------------------------------- 1 | delta-spark==3.0.0 2 | pyspark==3.5.0 -------------------------------------------------------------------------------- /chapter-07/01-personal-data-removal/02-in-place-overwriter-spark-json/config.py: -------------------------------------------------------------------------------- 1 | def get_base_dir() -> str: 2 | return 
'/tmp/dedp/ch07/01-personal-data-removal/02-in-place-overwrite-delta-lake' 3 | 4 | 5 | def get_input_table_dir() -> str: 6 | return f'{get_base_dir()}/input' 7 | 8 | 9 | def get_staging_table_dir() -> str: 10 | return f'{get_base_dir()}/staging' 11 | 12 | 13 | def get_output_table_dir() -> str: 14 | return f'{get_base_dir()}/output' 15 | -------------------------------------------------------------------------------- /chapter-07/01-personal-data-removal/02-in-place-overwriter-spark-json/requirements.txt: -------------------------------------------------------------------------------- 1 | delta-spark==3.0.0 2 | pyspark==3.5.0 -------------------------------------------------------------------------------- /chapter-07/02-access-control/01-fine-grained-accessor-postgresql-rows-view/docker/init.sql: -------------------------------------------------------------------------------- 1 | CREATE SCHEMA dedp; 2 | 3 | CREATE TABLE dedp.users ( 4 | id TEXT NOT NULL, 5 | login VARCHAR(45) NOT NULL, 6 | email VARCHAR(45) NULL, 7 | PRIMARY KEY(id) 8 | ); 9 | 10 | INSERT INTO dedp.users (id, login, email) VALUES 11 | ('id_user_a', 'user_a', 'user_a@email.com'), 12 | ('id_user_b', 'user_b', 'user_b@email.com'); 13 | -------------------------------------------------------------------------------- /chapter-07/02-access-control/01-fine-grained-accessor-postgresql-rows/docker/init.sql: -------------------------------------------------------------------------------- 1 | CREATE SCHEMA dedp; 2 | 3 | CREATE TABLE dedp.users ( 4 | id TEXT NOT NULL, 5 | login VARCHAR(45) NOT NULL, 6 | email VARCHAR(45) NULL, 7 | PRIMARY KEY(id) 8 | ); 9 | 10 | ALTER TABLE dedp.users ENABLE ROW LEVEL SECURITY; 11 | 12 | INSERT INTO dedp.users (id, login, email) VALUES ('id_user_a', 'user_a', 'user_a@email.com'), ('id_user_b', 'user_b', 'user_b@email.com'); 13 | -------------------------------------------------------------------------------- /chapter-07/03-data-protection/01-encryptor-s3/ec2.tf: 
-------------------------------------------------------------------------------- 1 | resource "aws_instance" "s3_dedp_reader_instance" { 2 | ami = "ami-0fb653ca2d3203ac1" 3 | instance_type = "t2.micro" 4 | iam_instance_profile = "${aws_iam_instance_profile.iam_key_reader_profile.name}" 5 | } 6 | -------------------------------------------------------------------------------- /chapter-07/03-data-protection/01-encryptor-s3/encrypted_file_on_s3.txt: -------------------------------------------------------------------------------- 1 | This is the content of an encrypted file. -------------------------------------------------------------------------------- /chapter-07/03-data-protection/01-encryptor-s3/kms.tf: -------------------------------------------------------------------------------- 1 | module "kms" { 2 | source = "terraform-aws-modules/kms/aws" 3 | key_usage = "ENCRYPT_DECRYPT" 4 | deletion_window_in_days = 14 5 | aliases = ["visits-bucket-encryption-key"] 6 | grants = { 7 | ec2_instance_reader = { 8 | grantee_principal = aws_iam_role.iam_key_reader.arn 9 | operations = ["Encrypt", "Decrypt", "GenerateDataKey"] 10 | } 11 | } 12 | } 13 | 14 | -------------------------------------------------------------------------------- /chapter-07/03-data-protection/02-anonymizer-apache-spark/config.py: -------------------------------------------------------------------------------- 1 | def get_base_input_dir() -> str: 2 | return '/tmp/dedp/ch07/03-data-protection/02-anonymizer-apache-spark' 3 | -------------------------------------------------------------------------------- /chapter-07/03-data-protection/02-anonymizer-apache-spark/requirements.txt: -------------------------------------------------------------------------------- 1 | pyspark==3.5.0 2 | pandas==2.0.2 3 | pyarrow==14.0.1 4 | Faker==20.1.0 -------------------------------------------------------------------------------- /chapter-07/03-data-protection/03-pseudo-anonymizer-apache-spark/config.py: 
-------------------------------------------------------------------------------- 1 | def get_base_input_dir() -> str: 2 | return '/tmp/dedp/ch07/03-data-protection/03-pseudo-anonymizer-apache-spark' 3 | -------------------------------------------------------------------------------- /chapter-07/03-data-protection/03-pseudo-anonymizer-apache-spark/requirements.txt: -------------------------------------------------------------------------------- 1 | pyspark==3.5.0 2 | pandas==2.0.2 3 | pyarrow==14.0.1 -------------------------------------------------------------------------------- /chapter-07/04-connectivity/01-secrets-pointer-spark-postgresql/dataset/init.sql: -------------------------------------------------------------------------------- 1 | CREATE SCHEMA dedp; 2 | 3 | CREATE TABLE dedp.devices ( 4 | type VARCHAR(10) NOT NULL, 5 | full_name TEXT NOT NULL, 6 | version VARCHAR(25) NOT NULL, 7 | PRIMARY KEY(type, full_name, version) 8 | ); -------------------------------------------------------------------------------- /chapter-07/04-connectivity/01-secrets-pointer-spark-postgresql/requirements.txt: -------------------------------------------------------------------------------- 1 | pyspark==3.5.0 2 | localstack==3.0.2 3 | awscli-local==0.21.1 4 | awscli==1.31.8 5 | boto3==1.33.8 6 | localstack-client==2.5 -------------------------------------------------------------------------------- /chapter-07/04-connectivity/02-secretless-connector-apache-spark-postgresql/dataset/init.sql: -------------------------------------------------------------------------------- 1 | CREATE SCHEMA dedp; 2 | 3 | CREATE TABLE dedp.devices ( 4 | type VARCHAR(10) NOT NULL, 5 | full_name TEXT NOT NULL, 6 | version VARCHAR(25) NOT NULL, 7 | PRIMARY KEY(type, full_name, version) 8 | ); -------------------------------------------------------------------------------- /chapter-07/04-connectivity/02-secretless-connector-apache-spark-postgresql/requirements.txt: 
-------------------------------------------------------------------------------- 1 | pyspark==3.5.0 -------------------------------------------------------------------------------- /chapter-08/01-partitioning/01-horizontal-partitioner-apache-spark/config.py: -------------------------------------------------------------------------------- 1 | def get_base_dir() -> str: 2 | return '/tmp/dedp/ch08/01-partitioning/01-horizontal-partitioner-apache-spark' 3 | 4 | 5 | def get_delta_table_dir() -> str: 6 | return f'{get_base_dir()}/delta-users' 7 | 8 | 9 | def get_json_table_dir() -> str: 10 | return f'{get_base_dir()}/json-users' 11 | -------------------------------------------------------------------------------- /chapter-08/01-partitioning/01-horizontal-partitioner-apache-spark/reader_json.py: -------------------------------------------------------------------------------- 1 | from pyspark.sql import SparkSession 2 | 3 | from config import get_json_table_dir 4 | 5 | if __name__ == '__main__': 6 | spark_session = SparkSession.builder.master("local[*]").getOrCreate() 7 | 8 | users_from_delta = spark_session.read.format('json').load(get_json_table_dir()) 9 | users_from_delta.show(truncate=False) 10 | -------------------------------------------------------------------------------- /chapter-08/01-partitioning/01-horizontal-partitioner-apache-spark/requirements.txt: -------------------------------------------------------------------------------- 1 | delta-spark==3.0.0 2 | pyspark==3.5.0 -------------------------------------------------------------------------------- /chapter-08/01-partitioning/01-horizontal-partitioner-kafka/requirements.txt: -------------------------------------------------------------------------------- 1 | pyspark==3.5.0 2 | pandas==2.0.2 3 | pyarrow==14.0.1 4 | confluent-kafka==2.0.2 -------------------------------------------------------------------------------- /chapter-08/01-partitioning/02-vertical-partitioner-apache-spark/config.py: 
-------------------------------------------------------------------------------- 1 | def get_base_dir() -> str: 2 | return '/tmp/dedp/ch08/01-partitioning/02-vertical-partitioner-apache-spark/' 3 | 4 | 5 | def get_delta_users_table_dir() -> str: 6 | return f'{get_base_dir()}/table/users' 7 | 8 | 9 | def get_delta_technical_table_dir() -> str: 10 | return f'{get_base_dir()}/table/technical' 11 | 12 | 13 | def get_delta_visits_table_dir() -> str: 14 | return f'{get_base_dir()}/table/visits' 15 | -------------------------------------------------------------------------------- /chapter-08/01-partitioning/02-vertical-partitioner-apache-spark/dataset/generation_configuration.yaml: -------------------------------------------------------------------------------- 1 | dataset: 2 | rows: 500 3 | data_blocker: 4 | type: 'no' 5 | entity: 6 | type: visit 7 | configuration: 8 | start_time: '2023-11-24T00:00:00Z' 9 | generator: 10 | type: one-shot 11 | writer: 12 | type: json 13 | configuration: 14 | output_path: '/home/data_generator_user/data_generator_output' 15 | clean_path: true 16 | partitions: ['partition-1', 'partition-2'] 17 | -------------------------------------------------------------------------------- /chapter-08/01-partitioning/02-vertical-partitioner-apache-spark/requirements.txt: -------------------------------------------------------------------------------- 1 | delta-spark==3.0.0 2 | pyspark==3.5.0 -------------------------------------------------------------------------------- /chapter-08/01-partitioning/02-vertical-partitioner-postgresql/dataset/init.sql: -------------------------------------------------------------------------------- 1 | CREATE SCHEMA dedp; 2 | 3 | CREATE TABLE dedp.visits_all ( 4 | visit_id CHAR(36) NOT NULL, 5 | event_time TIMESTAMP NOT NULL, 6 | user_id TEXT NOT NULL, 7 | page VARCHAR(20) NULL, 8 | context JSONB NOT NULL, 9 | PRIMARY KEY(visit_id, event_time) 10 | ); 11 | 
-------------------------------------------------------------------------------- /chapter-08/02-records-organization/01-bucket-apache-spark/config.py: -------------------------------------------------------------------------------- 1 | def get_base_dir() -> str: 2 | return '/tmp/dedp/ch08/01-records-organization/01-buckets-apache-spark' 3 | -------------------------------------------------------------------------------- /chapter-08/02-records-organization/01-bucket-apache-spark/requirements.txt: -------------------------------------------------------------------------------- 1 | pyspark==3.5.0 -------------------------------------------------------------------------------- /chapter-08/02-records-organization/02-sorter-delta-lake/assets/flat_table.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bartosz25/data-engineering-design-patterns-book/be54bbc3d52b96b592604d1b6b5d087a8ee9d6e1/chapter-08/02-records-organization/02-sorter-delta-lake/assets/flat_table.png -------------------------------------------------------------------------------- /chapter-08/02-records-organization/02-sorter-delta-lake/assets/z_ordered_table.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bartosz25/data-engineering-design-patterns-book/be54bbc3d52b96b592604d1b6b5d087a8ee9d6e1/chapter-08/02-records-organization/02-sorter-delta-lake/assets/z_ordered_table.png -------------------------------------------------------------------------------- /chapter-08/02-records-organization/02-sorter-delta-lake/requirements.txt: -------------------------------------------------------------------------------- 1 | delta-spark==3.0.0 2 | pyspark==3.5.0 -------------------------------------------------------------------------------- /chapter-08/03-read-performance-optimization/01-metadata-enhancer-apache-spark-apache-parquet/assets/json_read.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/bartosz25/data-engineering-design-patterns-book/be54bbc3d52b96b592604d1b6b5d087a8ee9d6e1/chapter-08/03-read-performance-optimization/01-metadata-enhancer-apache-spark-apache-parquet/assets/json_read.png -------------------------------------------------------------------------------- /chapter-08/03-read-performance-optimization/01-metadata-enhancer-apache-spark-apache-parquet/assets/parquet_read.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bartosz25/data-engineering-design-patterns-book/be54bbc3d52b96b592604d1b6b5d087a8ee9d6e1/chapter-08/03-read-performance-optimization/01-metadata-enhancer-apache-spark-apache-parquet/assets/parquet_read.png -------------------------------------------------------------------------------- /chapter-08/03-read-performance-optimization/01-metadata-enhancer-apache-spark-apache-parquet/config.py: -------------------------------------------------------------------------------- 1 | def get_base_dir() -> str: 2 | return '/tmp/dedp/ch08/03-access-optimization/01-metadata-enhancer-apache-spark-apache-parquet' 3 | 4 | 5 | def get_json_dir() -> str: 6 | return f'{get_base_dir()}/output-json' 7 | 8 | 9 | def get_parquet_dir() -> str: 10 | return f'{get_base_dir()}/output-parquet' 11 | -------------------------------------------------------------------------------- /chapter-08/03-read-performance-optimization/01-metadata-enhancer-apache-spark-apache-parquet/dataset/generation_configuration.yaml: -------------------------------------------------------------------------------- 1 | dataset: 2 | rows: 50000 3 | data_blocker: 4 | type: 'no' 5 | entity: 6 | type: user 7 | generator: 8 | type: one-shot 9 | writer: 10 | type: json 11 | configuration: 12 | output_path: '/home/data_generator_user/data_generator_output/' 13 | clean_path: true 
-------------------------------------------------------------------------------- /chapter-08/03-read-performance-optimization/01-metadata-enhancer-apache-spark-apache-parquet/requirements.txt: -------------------------------------------------------------------------------- 1 | pyspark==3.5.0 -------------------------------------------------------------------------------- /chapter-08/03-read-performance-optimization/01-metadata-enhancer-delta-lake/assets/delta_1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bartosz25/data-engineering-design-patterns-book/be54bbc3d52b96b592604d1b6b5d087a8ee9d6e1/chapter-08/03-read-performance-optimization/01-metadata-enhancer-delta-lake/assets/delta_1.png -------------------------------------------------------------------------------- /chapter-08/03-read-performance-optimization/01-metadata-enhancer-delta-lake/assets/delta_2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bartosz25/data-engineering-design-patterns-book/be54bbc3d52b96b592604d1b6b5d087a8ee9d6e1/chapter-08/03-read-performance-optimization/01-metadata-enhancer-delta-lake/assets/delta_2.png -------------------------------------------------------------------------------- /chapter-08/03-read-performance-optimization/01-metadata-enhancer-delta-lake/assets/delta_filter_1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bartosz25/data-engineering-design-patterns-book/be54bbc3d52b96b592604d1b6b5d087a8ee9d6e1/chapter-08/03-read-performance-optimization/01-metadata-enhancer-delta-lake/assets/delta_filter_1.png -------------------------------------------------------------------------------- /chapter-08/03-read-performance-optimization/01-metadata-enhancer-delta-lake/assets/delta_filter_2.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/bartosz25/data-engineering-design-patterns-book/be54bbc3d52b96b592604d1b6b5d087a8ee9d6e1/chapter-08/03-read-performance-optimization/01-metadata-enhancer-delta-lake/assets/delta_filter_2.png -------------------------------------------------------------------------------- /chapter-08/03-read-performance-optimization/01-metadata-enhancer-delta-lake/assets/json.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bartosz25/data-engineering-design-patterns-book/be54bbc3d52b96b592604d1b6b5d087a8ee9d6e1/chapter-08/03-read-performance-optimization/01-metadata-enhancer-delta-lake/assets/json.png -------------------------------------------------------------------------------- /chapter-08/03-read-performance-optimization/01-metadata-enhancer-delta-lake/assets/json_filter.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bartosz25/data-engineering-design-patterns-book/be54bbc3d52b96b592604d1b6b5d087a8ee9d6e1/chapter-08/03-read-performance-optimization/01-metadata-enhancer-delta-lake/assets/json_filter.png -------------------------------------------------------------------------------- /chapter-08/03-read-performance-optimization/01-metadata-enhancer-delta-lake/config.py: -------------------------------------------------------------------------------- 1 | class DemoConfiguration: 2 | BASE_DIR: str = '/tmp/dedp/ch08/03-access-optimization/01-metadata-enhancer-delta-lake' 3 | INPUT_PATH: str = f'{BASE_DIR}/input' 4 | DEVICES_TABLE_DELTA_LAKE: str = f'{BASE_DIR}/devices-table-delta-lake' 5 | DEVICES_TABLE_JSON: str = f'{BASE_DIR}/devices-table-json' 6 | -------------------------------------------------------------------------------- 
/chapter-08/03-read-performance-optimization/01-metadata-enhancer-delta-lake/dataset/generation_configuration.yaml: -------------------------------------------------------------------------------- 1 | dataset: 2 | rows: 6000 3 | composition_percentage: 4 | duplicates: 0 5 | missing_fields: 0 6 | unprocessable_rows: 0 7 | data_blocker: 8 | type: 'no' 9 | entity: 10 | type: device 11 | generator: 12 | type: one-shot 13 | writer: 14 | type: json 15 | configuration: 16 | output_path: '/home/data_generator_user/data_generator_output' 17 | clean_path: true 18 | -------------------------------------------------------------------------------- /chapter-08/03-read-performance-optimization/01-metadata-enhancer-delta-lake/requirements.txt: -------------------------------------------------------------------------------- 1 | delta-spark==3.0.0 2 | pyspark==3.5.0 -------------------------------------------------------------------------------- /chapter-08/03-read-performance-optimization/01-metadata-enhancer-postgresql/docker/init.sql: -------------------------------------------------------------------------------- 1 | CREATE SCHEMA dedp; 2 | 3 | CREATE TABLE dedp.visits ( 4 | visit_id CHAR(36) NOT NULL, 5 | event_time TIMESTAMP NOT NULL, 6 | user_id TEXT NOT NULL, 7 | page VARCHAR(20) NULL, 8 | context JSONB NOT NULL, 9 | PRIMARY KEY(visit_id, event_time) 10 | ); -------------------------------------------------------------------------------- /chapter-08/03-read-performance-optimization/02-dataset-materializer-incremental-table-postgresql/docker/init.sql: -------------------------------------------------------------------------------- 1 | CREATE SCHEMA dedp; 2 | 3 | CREATE TABLE dedp.visits ( 4 | visit_id CHAR(36) NOT NULL, 5 | event_time TIMESTAMP NOT NULL, 6 | user_id TEXT NOT NULL, 7 | page VARCHAR(20) NULL, 8 | insertion_time TIMESTAMP NOT NULL DEFAULT CURRENT_TIMESTAMP, 9 | PRIMARY KEY(visit_id, event_time) 10 | ); 11 | 
-------------------------------------------------------------------------------- /chapter-08/03-read-performance-optimization/02-dataset-materializer-materialized-view-postgresql/dataset/init.sql: -------------------------------------------------------------------------------- 1 | CREATE SCHEMA dedp; 2 | 3 | CREATE TABLE dedp.visits_all ( 4 | visit_id CHAR(36) NOT NULL, 5 | event_time TIMESTAMP NOT NULL, 6 | user_id TEXT NOT NULL, 7 | page VARCHAR(20) NULL, 8 | context JSONB NOT NULL, 9 | PRIMARY KEY(visit_id, event_time) 10 | ); 11 | -------------------------------------------------------------------------------- /chapter-08/03-read-performance-optimization/03-manifest-delta-lake/config.py: -------------------------------------------------------------------------------- 1 | class DemoConfiguration: 2 | BASE_DIR: str = '/tmp/dedp/ch08/03-access-optimization/03-manifest-delta-lake' 3 | INPUT_PATH: str = f'{BASE_DIR}/input' 4 | DEVICES_TABLE: str = f'{BASE_DIR}/devices-table' 5 | -------------------------------------------------------------------------------- /chapter-08/03-read-performance-optimization/03-manifest-delta-lake/dataset/generation_configuration.yaml: -------------------------------------------------------------------------------- 1 | dataset: 2 | rows: 5000 3 | composition_percentage: 4 | duplicates: 0 5 | missing_fields: 0 6 | unprocessable_rows: 0 7 | data_blocker: 8 | type: 'no' 9 | entity: 10 | type: device 11 | generator: 12 | type: one-shot 13 | writer: 14 | type: json 15 | configuration: 16 | output_path: '/home/data_generator_user/data_generator_output' 17 | clean_path: true 18 | -------------------------------------------------------------------------------- /chapter-08/03-read-performance-optimization/03-manifest-delta-lake/requirements.txt: -------------------------------------------------------------------------------- 1 | delta-spark==3.0.0 2 | pyspark==3.5.0 -------------------------------------------------------------------------------- 
/chapter-08/04-data-representation/01-normalizer-apache-spark/dataset/generation_configuration.yaml: -------------------------------------------------------------------------------- 1 | dataset: 2 | rows: 500 3 | data_blocker: 4 | type: 'no' 5 | entity: 6 | type: visit 7 | configuration: 8 | start_time: '2023-11-24T00:00:00Z' 9 | generator: 10 | type: one-shot 11 | writer: 12 | type: json 13 | configuration: 14 | output_path: '/home/data_generator_user/data_generator_output/input' 15 | clean_path: true 16 | partitions: ['partition1', 'partition2'] -------------------------------------------------------------------------------- /chapter-08/04-data-representation/01-normalizer-apache-spark/requirements.txt: -------------------------------------------------------------------------------- 1 | delta-spark==3.0.0 2 | pyspark==3.5.0 -------------------------------------------------------------------------------- /chapter-08/04-data-representation/01-normalizer-normal-forms/dataset/generation_configuration.yaml: -------------------------------------------------------------------------------- 1 | dataset: 2 | rows: 500 3 | data_blocker: 4 | type: 'no' 5 | entity: 6 | type: visit 7 | configuration: 8 | start_time: '2023-11-24T00:00:00Z' 9 | generator: 10 | type: one-shot 11 | writer: 12 | type: json 13 | configuration: 14 | output_path: '/home/data_generator_user/data_generator_output/input' 15 | clean_path: true 16 | partitions: ['partition1', 'partition2'] -------------------------------------------------------------------------------- /chapter-08/04-data-representation/01-normalizer-normal-forms/requirements.txt: -------------------------------------------------------------------------------- 1 | delta-spark==3.0.0 2 | pyspark==3.5.0 -------------------------------------------------------------------------------- /chapter-08/04-data-representation/01-normalizer-snowflake-schema/dataset/generation_configuration.yaml: 
-------------------------------------------------------------------------------- 1 | dataset: 2 | rows: 500 3 | data_blocker: 4 | type: 'no' 5 | entity: 6 | type: visit 7 | configuration: 8 | start_time: '2023-11-24T00:00:00Z' 9 | generator: 10 | type: one-shot 11 | writer: 12 | type: json 13 | configuration: 14 | output_path: '/home/data_generator_user/data_generator_output/input' 15 | clean_path: true 16 | partitions: ['partition1', 'partition2'] -------------------------------------------------------------------------------- /chapter-08/04-data-representation/01-normalizer-snowflake-schema/requirements.txt: -------------------------------------------------------------------------------- 1 | delta-spark==3.0.0 2 | pyspark==3.5.0 -------------------------------------------------------------------------------- /chapter-08/04-data-representation/02-denormalizer-one-big-table/dataset/generation_configuration.yaml: -------------------------------------------------------------------------------- 1 | dataset: 2 | rows: 500 3 | data_blocker: 4 | type: 'no' 5 | entity: 6 | type: visit 7 | configuration: 8 | start_time: '2023-11-24T00:00:00Z' 9 | generator: 10 | type: one-shot 11 | writer: 12 | type: json 13 | configuration: 14 | output_path: '/home/data_generator_user/data_generator_output/input' 15 | clean_path: true 16 | partitions: ['partition1', 'partition2'] -------------------------------------------------------------------------------- /chapter-08/04-data-representation/02-denormalizer-one-big-table/requirements.txt: -------------------------------------------------------------------------------- 1 | delta-spark==3.0.0 2 | pyspark==3.5.0 -------------------------------------------------------------------------------- /chapter-08/04-data-representation/02-denormalizer-star-schema/dataset/generation_configuration.yaml: -------------------------------------------------------------------------------- 1 | dataset: 2 | rows: 500 3 | data_blocker: 4 | type: 'no' 5 | 
entity: 6 | type: visit 7 | configuration: 8 | start_time: '2023-11-24T00:00:00Z' 9 | generator: 10 | type: one-shot 11 | writer: 12 | type: json 13 | configuration: 14 | output_path: '/home/data_generator_user/data_generator_output/input' 15 | clean_path: true 16 | partitions: ['partition1', 'partition2'] -------------------------------------------------------------------------------- /chapter-08/04-data-representation/02-denormalizer-star-schema/requirements.txt: -------------------------------------------------------------------------------- 1 | delta-spark==3.0.0 2 | pyspark==3.5.0 -------------------------------------------------------------------------------- /chapter-09/01-quality-enforcement/01-audit-write-audit-publish-apache-airflow-postgresql/assets/airflow_status.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bartosz25/data-engineering-design-patterns-book/be54bbc3d52b96b592604d1b6b5d087a8ee9d6e1/chapter-09/01-quality-enforcement/01-audit-write-audit-publish-apache-airflow-postgresql/assets/airflow_status.png -------------------------------------------------------------------------------- /chapter-09/01-quality-enforcement/01-audit-write-audit-publish-apache-airflow-postgresql/dags/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bartosz25/data-engineering-design-patterns-book/be54bbc3d52b96b592604d1b6b5d087a8ee9d6e1/chapter-09/01-quality-enforcement/01-audit-write-audit-publish-apache-airflow-postgresql/dags/__init__.py -------------------------------------------------------------------------------- /chapter-09/01-quality-enforcement/01-audit-write-audit-publish-apache-airflow-postgresql/dags/lib/__init__.py: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/bartosz25/data-engineering-design-patterns-book/be54bbc3d52b96b592604d1b6b5d087a8ee9d6e1/chapter-09/01-quality-enforcement/01-audit-write-audit-publish-apache-airflow-postgresql/dags/lib/__init__.py -------------------------------------------------------------------------------- /chapter-09/01-quality-enforcement/01-audit-write-audit-publish-apache-airflow-postgresql/requirements.txt: -------------------------------------------------------------------------------- 1 | apache-airflow==2.7.3 2 | apache-airflow-providers-postgres==5.8.0 3 | -------------------------------------------------------------------------------- /chapter-09/01-quality-enforcement/01-audit-write-audit-publish-apache-airflow-postgresql/sql/load_file_to_visits_table.sql: -------------------------------------------------------------------------------- 1 | DELETE FROM dedp.visits_flattened WHERE execution_time = '{{ execution_date }}'; 2 | 3 | COPY dedp.visits_flattened FROM '/data_to_load/date={{ ds }}/dataset.csv' CSV DELIMITER ';' HEADER; -------------------------------------------------------------------------------- /chapter-09/01-quality-enforcement/01-audit-write-audit-publish-apache-kafka-apache-flink-delta-lake/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bartosz25/data-engineering-design-patterns-book/be54bbc3d52b96b592604d1b6b5d087a8ee9d6e1/chapter-09/01-quality-enforcement/01-audit-write-audit-publish-apache-kafka-apache-flink-delta-lake/__init__.py -------------------------------------------------------------------------------- /chapter-09/01-quality-enforcement/01-audit-write-audit-publish-apache-kafka-apache-flink-delta-lake/flink-connector-base-1.18.0.jar: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/bartosz25/data-engineering-design-patterns-book/be54bbc3d52b96b592604d1b6b5d087a8ee9d6e1/chapter-09/01-quality-enforcement/01-audit-write-audit-publish-apache-kafka-apache-flink-delta-lake/flink-connector-base-1.18.0.jar -------------------------------------------------------------------------------- /chapter-09/01-quality-enforcement/01-audit-write-audit-publish-apache-kafka-apache-flink-delta-lake/flink-connector-kafka-3.1.0-1.18.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bartosz25/data-engineering-design-patterns-book/be54bbc3d52b96b592604d1b6b5d087a8ee9d6e1/chapter-09/01-quality-enforcement/01-audit-write-audit-publish-apache-kafka-apache-flink-delta-lake/flink-connector-kafka-3.1.0-1.18.jar -------------------------------------------------------------------------------- /chapter-09/01-quality-enforcement/01-audit-write-audit-publish-apache-kafka-apache-flink-delta-lake/kafka-clients-3.7.0.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bartosz25/data-engineering-design-patterns-book/be54bbc3d52b96b592604d1b6b5d087a8ee9d6e1/chapter-09/01-quality-enforcement/01-audit-write-audit-publish-apache-kafka-apache-flink-delta-lake/kafka-clients-3.7.0.jar -------------------------------------------------------------------------------- /chapter-09/01-quality-enforcement/01-audit-write-audit-publish-apache-kafka-apache-flink-delta-lake/requirements.txt: -------------------------------------------------------------------------------- 1 | apache-flink==1.18.0 2 | -------------------------------------------------------------------------------- /chapter-09/01-quality-enforcement/01-audit-write-audit-publish-apache-kafka-apache-spark-delta-lake-staging/requirements.txt: -------------------------------------------------------------------------------- 1 | pyspark==3.5.0 2 | 
-------------------------------------------------------------------------------- /chapter-09/01-quality-enforcement/01-audit-write-audit-publish-apache-kafka-apache-spark-delta-lake/requirements.txt: -------------------------------------------------------------------------------- 1 | pyspark==3.5.0 2 | -------------------------------------------------------------------------------- /chapter-09/01-quality-enforcement/02-constraints-apache-kafka-protobuf/buf.gen.yaml: -------------------------------------------------------------------------------- 1 | version: v2 2 | managed: 3 | enabled: true 4 | plugins: 5 | - remote: buf.build/protocolbuffers/python:v27.2 6 | out: protobuf_output/python 7 | -------------------------------------------------------------------------------- /chapter-09/01-quality-enforcement/02-constraints-apache-kafka-protobuf/buf.lock: -------------------------------------------------------------------------------- 1 | # Generated by buf. DO NOT EDIT. 2 | version: v2 3 | deps: 4 | - name: buf.build/bufbuild/protovalidate 5 | commit: 46a4cf4ba1094a34bcd89a6c67163b4b 6 | digest: b5:2076a950fdf4a8047064d55fd1d20ef21e6d745bf56e3edf557071abd4488ed48c9466d60831d8a03489dc1fcc8ceaa073d197411b59ecd873e28b1328034e0b 7 | -------------------------------------------------------------------------------- /chapter-09/01-quality-enforcement/02-constraints-apache-kafka-protobuf/buf.yaml: -------------------------------------------------------------------------------- 1 | version: v2 2 | deps: 3 | - buf.build/bufbuild/protovalidate -------------------------------------------------------------------------------- /chapter-09/01-quality-enforcement/02-constraints-apache-kafka-protobuf/definitons/invalid_visit.proto: -------------------------------------------------------------------------------- 1 | syntax = "proto3"; 2 | 3 | import "definitons/visit.proto"; 4 | 5 | package com.waitingforcode; 6 | 7 | message ValidationError { 8 | string field = 1; 9 | string message = 
2; 10 | } 11 | 12 | message InvalidVisit { 13 | Visit visit = 1; 14 | repeated ValidationError errors = 2; 15 | } -------------------------------------------------------------------------------- /chapter-09/01-quality-enforcement/02-constraints-apache-kafka-protobuf/requirements.txt: -------------------------------------------------------------------------------- 1 | protovalidate==0.3.1 2 | protobuf==5.27.2 3 | confluent-kafka==2.0.2 -------------------------------------------------------------------------------- /chapter-09/01-quality-enforcement/02-constraints-delta-lake/requirements.txt: -------------------------------------------------------------------------------- 1 | delta-spark==3.0.0 2 | pyspark==3.5.0 -------------------------------------------------------------------------------- /chapter-09/02-schema-consistency/01-schema-enforcer-apache-kafka/assets/add_new_schema.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bartosz25/data-engineering-design-patterns-book/be54bbc3d52b96b592604d1b6b5d087a8ee9d6e1/chapter-09/02-schema-consistency/01-schema-enforcer-apache-kafka/assets/add_new_schema.png -------------------------------------------------------------------------------- /chapter-09/02-schema-consistency/01-schema-enforcer-apache-kafka/assets/compatibility_click.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bartosz25/data-engineering-design-patterns-book/be54bbc3d52b96b592604d1b6b5d087a8ee9d6e1/chapter-09/02-schema-consistency/01-schema-enforcer-apache-kafka/assets/compatibility_click.png -------------------------------------------------------------------------------- /chapter-09/02-schema-consistency/01-schema-enforcer-apache-kafka/assets/compatibility_forward.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/bartosz25/data-engineering-design-patterns-book/be54bbc3d52b96b592604d1b6b5d087a8ee9d6e1/chapter-09/02-schema-consistency/01-schema-enforcer-apache-kafka/assets/compatibility_forward.png -------------------------------------------------------------------------------- /chapter-09/02-schema-consistency/01-schema-enforcer-apache-kafka/requirements.txt: -------------------------------------------------------------------------------- 1 | avro==1.11.0 2 | confluent-kafka==2.5.0 3 | requests==2.26.0 4 | fastavro==1.4.7 5 | -------------------------------------------------------------------------------- /chapter-09/02-schema-consistency/01-schema-enforcer-apache-kafka/schemas/v1_visit.avsc: -------------------------------------------------------------------------------- 1 | { 2 | "type": "record", 3 | "namespace": "com.waitingforcode.model", 4 | "name": "Visit", 5 | "fields": [ 6 | {"name": "visit_id", "type": "string"}, 7 | {"name": "event_time", "type": "int", "logicalType": "time"} 8 | ] 9 | } -------------------------------------------------------------------------------- /chapter-09/02-schema-consistency/01-schema-enforcer-apache-kafka/schemas/v2_visit_without_visit_id.avsc: -------------------------------------------------------------------------------- 1 | { 2 | "type": "record", 3 | "namespace": "com.waitingforcode.model", 4 | "name": "Visit", 5 | "fields": [ 6 | {"name": "event_time", "type": "int", "logicalType": "time"} 7 | ] 8 | } -------------------------------------------------------------------------------- /chapter-09/02-schema-consistency/01-schema-enforcer-delta-lake/requirements.txt: -------------------------------------------------------------------------------- 1 | delta-spark==3.0.0 2 | pyspark==3.5.0 -------------------------------------------------------------------------------- /chapter-09/02-schema-consistency/01-schema-enforcer-postgresql/docker/docker-compose.yaml: 
-------------------------------------------------------------------------------- 1 | version: '3.1' 2 | services: 3 | postgresql: 4 | image: postgres:15 5 | container_name: dedp_postgresql 6 | environment: 7 | - POSTGRES_USER=dedp_test 8 | - POSTGRES_PASSWORD=dedp_test 9 | - POSTGRES_DB=dedp 10 | ports: 11 | - "5432:5432" 12 | volumes: 13 | - ./init.sql:/docker-entrypoint-initdb.d/init.sql -------------------------------------------------------------------------------- /chapter-09/02-schema-consistency/02-schema-migrator-apache-spark-delta-lake/requirements.txt: -------------------------------------------------------------------------------- 1 | delta-spark==3.0.0 2 | pyspark==3.5.0 -------------------------------------------------------------------------------- /chapter-09/02-schema-consistency/02-schema-migrator-protobuf-apache-spark/buf.gen.yaml: -------------------------------------------------------------------------------- 1 | version: v2 2 | managed: 3 | enabled: true 4 | plugins: 5 | - remote: buf.build/protocolbuffers/python:v27.2 6 | out: protobuf_output/python 7 | -------------------------------------------------------------------------------- /chapter-09/02-schema-consistency/02-schema-migrator-protobuf-apache-spark/buf.lock: -------------------------------------------------------------------------------- 1 | # Generated by buf. DO NOT EDIT. 
2 | version: v2 3 | deps: 4 | - name: buf.build/bufbuild/protovalidate 5 | commit: 46a4cf4ba1094a34bcd89a6c67163b4b 6 | digest: b5:2076a950fdf4a8047064d55fd1d20ef21e6d745bf56e3edf557071abd4488ed48c9466d60831d8a03489dc1fcc8ceaa073d197411b59ecd873e28b1328034e0b 7 | -------------------------------------------------------------------------------- /chapter-09/02-schema-consistency/02-schema-migrator-protobuf-apache-spark/buf.yaml: -------------------------------------------------------------------------------- 1 | version: v2 2 | deps: 3 | - buf.build/bufbuild/protovalidate -------------------------------------------------------------------------------- /chapter-09/02-schema-consistency/02-schema-migrator-protobuf-apache-spark/definitons/visit.proto: -------------------------------------------------------------------------------- 1 | syntax = "proto3"; 2 | 3 | package com.waitingforcode; 4 | 5 | import "google/protobuf/timestamp.proto"; 6 | 7 | message Visit { 8 | string visit_id = 1; 9 | google.protobuf.Timestamp event_time = 2; 10 | string user_id = 3; 11 | string page = 4; 12 | string ip = 5; 13 | string login = 6; 14 | bool is_connected = 7; 15 | string from_page = 8; 16 | } -------------------------------------------------------------------------------- /chapter-09/02-schema-consistency/02-schema-migrator-protobuf-apache-spark/protobuf_output/visit.bin: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bartosz25/data-engineering-design-patterns-book/be54bbc3d52b96b592604d1b6b5d087a8ee9d6e1/chapter-09/02-schema-consistency/02-schema-migrator-protobuf-apache-spark/protobuf_output/visit.bin -------------------------------------------------------------------------------- /chapter-09/02-schema-consistency/02-schema-migrator-protobuf-apache-spark/requirements.txt: -------------------------------------------------------------------------------- 1 | protovalidate==0.3.1 2 | protobuf==5.27.2 3 | 
confluent-kafka==2.0.2 4 | pyspark==3.5.0 -------------------------------------------------------------------------------- /chapter-09/03-quality-observation/01-offline-observer-airflow-postgresql/assets/alerts_1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bartosz25/data-engineering-design-patterns-book/be54bbc3d52b96b592604d1b6b5d087a8ee9d6e1/chapter-09/03-quality-observation/01-offline-observer-airflow-postgresql/assets/alerts_1.png -------------------------------------------------------------------------------- /chapter-09/03-quality-observation/01-offline-observer-airflow-postgresql/assets/alerts_2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bartosz25/data-engineering-design-patterns-book/be54bbc3d52b96b592604d1b6b5d087a8ee9d6e1/chapter-09/03-quality-observation/01-offline-observer-airflow-postgresql/assets/alerts_2.png -------------------------------------------------------------------------------- /chapter-09/03-quality-observation/01-offline-observer-airflow-postgresql/assets/dash_errors_all_1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bartosz25/data-engineering-design-patterns-book/be54bbc3d52b96b592604d1b6b5d087a8ee9d6e1/chapter-09/03-quality-observation/01-offline-observer-airflow-postgresql/assets/dash_errors_all_1.png -------------------------------------------------------------------------------- /chapter-09/03-quality-observation/01-offline-observer-airflow-postgresql/assets/dash_errors_all_zoom.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bartosz25/data-engineering-design-patterns-book/be54bbc3d52b96b592604d1b6b5d087a8ee9d6e1/chapter-09/03-quality-observation/01-offline-observer-airflow-postgresql/assets/dash_errors_all_zoom.png 
-------------------------------------------------------------------------------- /chapter-09/03-quality-observation/01-offline-observer-airflow-postgresql/assets/dash_errors_distribution.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bartosz25/data-engineering-design-patterns-book/be54bbc3d52b96b592604d1b6b5d087a8ee9d6e1/chapter-09/03-quality-observation/01-offline-observer-airflow-postgresql/assets/dash_errors_distribution.png -------------------------------------------------------------------------------- /chapter-09/03-quality-observation/01-offline-observer-airflow-postgresql/assets/dash_lag.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bartosz25/data-engineering-design-patterns-book/be54bbc3d52b96b592604d1b6b5d087a8ee9d6e1/chapter-09/03-quality-observation/01-offline-observer-airflow-postgresql/assets/dash_lag.png -------------------------------------------------------------------------------- /chapter-09/03-quality-observation/01-offline-observer-airflow-postgresql/assets/postgresql_check.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bartosz25/data-engineering-design-patterns-book/be54bbc3d52b96b592604d1b6b5d087a8ee9d6e1/chapter-09/03-quality-observation/01-offline-observer-airflow-postgresql/assets/postgresql_check.png -------------------------------------------------------------------------------- /chapter-09/03-quality-observation/01-offline-observer-airflow-postgresql/assets/postgresql_config.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bartosz25/data-engineering-design-patterns-book/be54bbc3d52b96b592604d1b6b5d087a8ee9d6e1/chapter-09/03-quality-observation/01-offline-observer-airflow-postgresql/assets/postgresql_config.png 
-------------------------------------------------------------------------------- /chapter-09/03-quality-observation/01-offline-observer-airflow-postgresql/dags/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bartosz25/data-engineering-design-patterns-book/be54bbc3d52b96b592604d1b6b5d087a8ee9d6e1/chapter-09/03-quality-observation/01-offline-observer-airflow-postgresql/dags/__init__.py -------------------------------------------------------------------------------- /chapter-09/03-quality-observation/01-offline-observer-airflow-postgresql/requirements.txt: -------------------------------------------------------------------------------- 1 | apache-airflow==2.7.3 2 | apache-airflow-providers-postgres==5.8.0 -------------------------------------------------------------------------------- /chapter-09/03-quality-observation/01-offline-observer-airflow-postgresql/sql/clean_previously_inserted_visits.sql: -------------------------------------------------------------------------------- 1 | DELETE FROM dedp.visits_output 2 | WHERE id BETWEEN 3 | (SELECT first_row_id FROM dedp.visits_state WHERE execution_time = '{{ execution_date }}') 4 | AND 5 | (SELECT last_row_id FROM dedp.visits_state WHERE execution_time = '{{ execution_date }}'); -------------------------------------------------------------------------------- /chapter-09/03-quality-observation/01-offline-observer-airflow-postgresql/sql/copy_new_visits.sql: -------------------------------------------------------------------------------- 1 | INSERT INTO dedp.visits_output 2 | SELECT * FROM dedp.visits_input 3 | WHERE id BETWEEN 4 | (SELECT first_row_id FROM dedp.visits_state WHERE execution_time = '{{ execution_date }}') 5 | AND 6 | (SELECT last_row_id FROM dedp.visits_state WHERE execution_time = '{{ execution_date }}'); -------------------------------------------------------------------------------- 
/chapter-09/03-quality-observation/01-offline-observer-airflow-postgresql/sql/wait_for_new_data.sql: -------------------------------------------------------------------------------- 1 | SELECT COUNT(*) FROM dedp.visits_input WHERE id > 2 | COALESCE((SELECT last_row_id FROM dedp.visits_state WHERE execution_time = '{{ prev_execution_date }}'), 0) 3 | ; -------------------------------------------------------------------------------- /chapter-09/03-quality-observation/01-offline-observer-airflow-postgresql/sql/wait_for_new_data_to_observe.sql: -------------------------------------------------------------------------------- 1 | SELECT COUNT(*) FROM dedp.visits_output WHERE id > 2 | COALESCE((SELECT last_row_id FROM dedp.visits_monitoring_state WHERE execution_time = '{{ prev_execution_date }}'), 0) 3 | ; -------------------------------------------------------------------------------- /chapter-09/03-quality-observation/01-offline-observer-apache-spark-apache-kafka/assets/errors_minute.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bartosz25/data-engineering-design-patterns-book/be54bbc3d52b96b592604d1b6b5d087a8ee9d6e1/chapter-09/03-quality-observation/01-offline-observer-apache-spark-apache-kafka/assets/errors_minute.png -------------------------------------------------------------------------------- /chapter-09/03-quality-observation/01-offline-observer-apache-spark-apache-kafka/assets/es_config.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bartosz25/data-engineering-design-patterns-book/be54bbc3d52b96b592604d1b6b5d087a8ee9d6e1/chapter-09/03-quality-observation/01-offline-observer-apache-spark-apache-kafka/assets/es_config.png -------------------------------------------------------------------------------- /chapter-09/03-quality-observation/01-offline-observer-apache-spark-apache-kafka/assets/es_config_ok.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/bartosz25/data-engineering-design-patterns-book/be54bbc3d52b96b592604d1b6b5d087a8ee9d6e1/chapter-09/03-quality-observation/01-offline-observer-apache-spark-apache-kafka/assets/es_config_ok.png -------------------------------------------------------------------------------- /chapter-09/03-quality-observation/01-offline-observer-apache-spark-apache-kafka/assets/lag_alert.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bartosz25/data-engineering-design-patterns-book/be54bbc3d52b96b592604d1b6b5d087a8ee9d6e1/chapter-09/03-quality-observation/01-offline-observer-apache-spark-apache-kafka/assets/lag_alert.png -------------------------------------------------------------------------------- /chapter-09/03-quality-observation/01-offline-observer-apache-spark-apache-kafka/assets/offset_lag.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bartosz25/data-engineering-design-patterns-book/be54bbc3d52b96b592604d1b6b5d087a8ee9d6e1/chapter-09/03-quality-observation/01-offline-observer-apache-spark-apache-kafka/assets/offset_lag.png -------------------------------------------------------------------------------- /chapter-09/03-quality-observation/01-offline-observer-apache-spark-apache-kafka/assets/profile.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bartosz25/data-engineering-design-patterns-book/be54bbc3d52b96b592604d1b6b5d087a8ee9d6e1/chapter-09/03-quality-observation/01-offline-observer-apache-spark-apache-kafka/assets/profile.png -------------------------------------------------------------------------------- /chapter-09/03-quality-observation/01-offline-observer-apache-spark-apache-kafka/connection_parameters.py: 
-------------------------------------------------------------------------------- 1 | from elasticsearch import Elasticsearch 2 | 3 | 4 | def get_elasticsearch_client() -> Elasticsearch: 5 | return Elasticsearch('http://localhost:9200', maxsize=5, http_auth=('elastic', 'changeme')) 6 | 7 | 8 | def get_output_path() -> str: 9 | return '/tmp/bde/module4/lesson12/' 10 | 11 | 12 | def get_validated_users_index_name() -> str: 13 | return 'visits_observation_stats' 14 | -------------------------------------------------------------------------------- /chapter-09/03-quality-observation/01-offline-observer-apache-spark-apache-kafka/requirements.txt: -------------------------------------------------------------------------------- 1 | pyspark==3.5.0 2 | pandas==2.0.2 3 | pyarrow==14.0.1 4 | confluent-kafka==2.0.2 5 | elasticsearch==8.6.2 6 | ydata-profiling[pyspark]==4.9.0 7 | -------------------------------------------------------------------------------- /chapter-09/03-quality-observation/02-online-observer-airflow-postgresql/assets/alerts_1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bartosz25/data-engineering-design-patterns-book/be54bbc3d52b96b592604d1b6b5d087a8ee9d6e1/chapter-09/03-quality-observation/02-online-observer-airflow-postgresql/assets/alerts_1.png -------------------------------------------------------------------------------- /chapter-09/03-quality-observation/02-online-observer-airflow-postgresql/assets/alerts_2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bartosz25/data-engineering-design-patterns-book/be54bbc3d52b96b592604d1b6b5d087a8ee9d6e1/chapter-09/03-quality-observation/02-online-observer-airflow-postgresql/assets/alerts_2.png -------------------------------------------------------------------------------- 
/chapter-09/03-quality-observation/02-online-observer-airflow-postgresql/assets/dash_errors_all_1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bartosz25/data-engineering-design-patterns-book/be54bbc3d52b96b592604d1b6b5d087a8ee9d6e1/chapter-09/03-quality-observation/02-online-observer-airflow-postgresql/assets/dash_errors_all_1.png -------------------------------------------------------------------------------- /chapter-09/03-quality-observation/02-online-observer-airflow-postgresql/assets/dash_errors_all_zoom.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bartosz25/data-engineering-design-patterns-book/be54bbc3d52b96b592604d1b6b5d087a8ee9d6e1/chapter-09/03-quality-observation/02-online-observer-airflow-postgresql/assets/dash_errors_all_zoom.png -------------------------------------------------------------------------------- /chapter-09/03-quality-observation/02-online-observer-airflow-postgresql/assets/dash_errors_distribution.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bartosz25/data-engineering-design-patterns-book/be54bbc3d52b96b592604d1b6b5d087a8ee9d6e1/chapter-09/03-quality-observation/02-online-observer-airflow-postgresql/assets/dash_errors_distribution.png -------------------------------------------------------------------------------- /chapter-09/03-quality-observation/02-online-observer-airflow-postgresql/assets/dash_lag.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bartosz25/data-engineering-design-patterns-book/be54bbc3d52b96b592604d1b6b5d087a8ee9d6e1/chapter-09/03-quality-observation/02-online-observer-airflow-postgresql/assets/dash_lag.png -------------------------------------------------------------------------------- 
/chapter-09/03-quality-observation/02-online-observer-airflow-postgresql/assets/postgresql_check.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bartosz25/data-engineering-design-patterns-book/be54bbc3d52b96b592604d1b6b5d087a8ee9d6e1/chapter-09/03-quality-observation/02-online-observer-airflow-postgresql/assets/postgresql_check.png -------------------------------------------------------------------------------- /chapter-09/03-quality-observation/02-online-observer-airflow-postgresql/assets/postgresql_config.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bartosz25/data-engineering-design-patterns-book/be54bbc3d52b96b592604d1b6b5d087a8ee9d6e1/chapter-09/03-quality-observation/02-online-observer-airflow-postgresql/assets/postgresql_config.png -------------------------------------------------------------------------------- /chapter-09/03-quality-observation/02-online-observer-airflow-postgresql/dags/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bartosz25/data-engineering-design-patterns-book/be54bbc3d52b96b592604d1b6b5d087a8ee9d6e1/chapter-09/03-quality-observation/02-online-observer-airflow-postgresql/dags/__init__.py -------------------------------------------------------------------------------- /chapter-09/03-quality-observation/02-online-observer-airflow-postgresql/requirements.txt: -------------------------------------------------------------------------------- 1 | apache-airflow==2.7.3 2 | apache-airflow-providers-postgres==5.8.0 -------------------------------------------------------------------------------- /chapter-09/03-quality-observation/02-online-observer-airflow-postgresql/sql/clean_previously_inserted_visits.sql: -------------------------------------------------------------------------------- 1 | DELETE FROM dedp.visits_output 2 | 
WHERE id BETWEEN 3 | (SELECT first_row_id FROM dedp.visits_state WHERE execution_time = '{{ execution_date }}') 4 | AND 5 | (SELECT last_row_id FROM dedp.visits_state WHERE execution_time = '{{ execution_date }}'); -------------------------------------------------------------------------------- /chapter-09/03-quality-observation/02-online-observer-airflow-postgresql/sql/copy_new_visits.sql: -------------------------------------------------------------------------------- 1 | INSERT INTO dedp.visits_output 2 | SELECT * FROM dedp.visits_input 3 | WHERE id BETWEEN 4 | (SELECT first_row_id FROM dedp.visits_state WHERE execution_time = '{{ execution_date }}') 5 | AND 6 | (SELECT last_row_id FROM dedp.visits_state WHERE execution_time = '{{ execution_date }}'); -------------------------------------------------------------------------------- /chapter-09/03-quality-observation/02-online-observer-airflow-postgresql/sql/wait_for_new_data.sql: -------------------------------------------------------------------------------- 1 | SELECT COUNT(*) FROM dedp.visits_input WHERE id > 2 | COALESCE((SELECT last_row_id FROM dedp.visits_state WHERE execution_time = '{{ prev_execution_date }}'), 0) 3 | ; -------------------------------------------------------------------------------- /chapter-09/03-quality-observation/02-online-observer-apache-spark-apache-kafka/assets/dash_brokers.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bartosz25/data-engineering-design-patterns-book/be54bbc3d52b96b592604d1b6b5d087a8ee9d6e1/chapter-09/03-quality-observation/02-online-observer-apache-spark-apache-kafka/assets/dash_brokers.png -------------------------------------------------------------------------------- /chapter-09/03-quality-observation/02-online-observer-apache-spark-apache-kafka/assets/dash_global_healthcheck.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/bartosz25/data-engineering-design-patterns-book/be54bbc3d52b96b592604d1b6b5d087a8ee9d6e1/chapter-09/03-quality-observation/02-online-observer-apache-spark-apache-kafka/assets/dash_global_healthcheck.png -------------------------------------------------------------------------------- /chapter-09/03-quality-observation/02-online-observer-apache-spark-apache-kafka/assets/dash_visits_topic.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bartosz25/data-engineering-design-patterns-book/be54bbc3d52b96b592604d1b6b5d087a8ee9d6e1/chapter-09/03-quality-observation/02-online-observer-apache-spark-apache-kafka/assets/dash_visits_topic.png -------------------------------------------------------------------------------- /chapter-09/03-quality-observation/02-online-observer-apache-spark-apache-kafka/assets/errors_minute.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bartosz25/data-engineering-design-patterns-book/be54bbc3d52b96b592604d1b6b5d087a8ee9d6e1/chapter-09/03-quality-observation/02-online-observer-apache-spark-apache-kafka/assets/errors_minute.png -------------------------------------------------------------------------------- /chapter-09/03-quality-observation/02-online-observer-apache-spark-apache-kafka/assets/es_config.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bartosz25/data-engineering-design-patterns-book/be54bbc3d52b96b592604d1b6b5d087a8ee9d6e1/chapter-09/03-quality-observation/02-online-observer-apache-spark-apache-kafka/assets/es_config.png -------------------------------------------------------------------------------- /chapter-09/03-quality-observation/02-online-observer-apache-spark-apache-kafka/assets/es_config_ok.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/bartosz25/data-engineering-design-patterns-book/be54bbc3d52b96b592604d1b6b5d087a8ee9d6e1/chapter-09/03-quality-observation/02-online-observer-apache-spark-apache-kafka/assets/es_config_ok.png -------------------------------------------------------------------------------- /chapter-09/03-quality-observation/02-online-observer-apache-spark-apache-kafka/assets/lag_alert.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bartosz25/data-engineering-design-patterns-book/be54bbc3d52b96b592604d1b6b5d087a8ee9d6e1/chapter-09/03-quality-observation/02-online-observer-apache-spark-apache-kafka/assets/lag_alert.png -------------------------------------------------------------------------------- /chapter-09/03-quality-observation/02-online-observer-apache-spark-apache-kafka/assets/offset_lag.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bartosz25/data-engineering-design-patterns-book/be54bbc3d52b96b592604d1b6b5d087a8ee9d6e1/chapter-09/03-quality-observation/02-online-observer-apache-spark-apache-kafka/assets/offset_lag.png -------------------------------------------------------------------------------- /chapter-09/03-quality-observation/02-online-observer-apache-spark-apache-kafka/assets/profile.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bartosz25/data-engineering-design-patterns-book/be54bbc3d52b96b592604d1b6b5d087a8ee9d6e1/chapter-09/03-quality-observation/02-online-observer-apache-spark-apache-kafka/assets/profile.png -------------------------------------------------------------------------------- /chapter-09/03-quality-observation/02-online-observer-apache-spark-apache-kafka/connection_parameters.py: -------------------------------------------------------------------------------- 1 | from elasticsearch import Elasticsearch 2 
| 3 | 4 | def get_elasticsearch_client() -> Elasticsearch: 5 | return Elasticsearch('http://localhost:9200', maxsize=5, http_auth=('elastic', 'changeme')) 6 | 7 | 8 | def get_validated_users_index_name() -> str: 9 | return 'visits_observation_stats' 10 | -------------------------------------------------------------------------------- /chapter-09/03-quality-observation/02-online-observer-apache-spark-apache-kafka/dataset/grafana/provisioning/dashboards/kafka.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: 1 2 | 3 | providers: 4 | - name: 'default' 5 | orgId: 1 6 | folder: '' 7 | type: file 8 | disableDeletion: false 9 | editable: true 10 | updateIntervalSeconds: 3 #how often Grafana will scan for changed dashboards 11 | options: 12 | path: /var/lib/grafana/dashboards 13 | -------------------------------------------------------------------------------- /chapter-09/03-quality-observation/02-online-observer-apache-spark-apache-kafka/dataset/kafka_healthcheck.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | JMX_PORT= 3 | kafka-topics.sh --bootstrap-server=localhost:9092 --list 4 | -------------------------------------------------------------------------------- /chapter-09/03-quality-observation/02-online-observer-apache-spark-apache-kafka/requirements.txt: -------------------------------------------------------------------------------- 1 | pyspark==3.5.0 2 | pandas==2.0.2 3 | pyarrow==14.0.1 4 | confluent-kafka==2.0.2 5 | elasticsearch==8.6.2 6 | ydata-profiling[pyspark]==4.9.0 -------------------------------------------------------------------------------- /chapter-10/01-data-detectors/01-flow-interruption-apache-airflow-postgresql-grafana/assets/alert_config_b_part.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/bartosz25/data-engineering-design-patterns-book/be54bbc3d52b96b592604d1b6b5d087a8ee9d6e1/chapter-10/01-data-detectors/01-flow-interruption-apache-airflow-postgresql-grafana/assets/alert_config_b_part.png -------------------------------------------------------------------------------- /chapter-10/01-data-detectors/01-flow-interruption-apache-airflow-postgresql-grafana/assets/alert_config_c_part.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bartosz25/data-engineering-design-patterns-book/be54bbc3d52b96b592604d1b6b5d087a8ee9d6e1/chapter-10/01-data-detectors/01-flow-interruption-apache-airflow-postgresql-grafana/assets/alert_config_c_part.png -------------------------------------------------------------------------------- /chapter-10/01-data-detectors/01-flow-interruption-apache-airflow-postgresql-grafana/assets/dag_run_success.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bartosz25/data-engineering-design-patterns-book/be54bbc3d52b96b592604d1b6b5d087a8ee9d6e1/chapter-10/01-data-detectors/01-flow-interruption-apache-airflow-postgresql-grafana/assets/dag_run_success.png -------------------------------------------------------------------------------- /chapter-10/01-data-detectors/01-flow-interruption-apache-airflow-postgresql-grafana/assets/state_1_pending.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bartosz25/data-engineering-design-patterns-book/be54bbc3d52b96b592604d1b6b5d087a8ee9d6e1/chapter-10/01-data-detectors/01-flow-interruption-apache-airflow-postgresql-grafana/assets/state_1_pending.png -------------------------------------------------------------------------------- /chapter-10/01-data-detectors/01-flow-interruption-apache-airflow-postgresql-grafana/assets/state_2_firing.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/bartosz25/data-engineering-design-patterns-book/be54bbc3d52b96b592604d1b6b5d087a8ee9d6e1/chapter-10/01-data-detectors/01-flow-interruption-apache-airflow-postgresql-grafana/assets/state_2_firing.png -------------------------------------------------------------------------------- /chapter-10/01-data-detectors/01-flow-interruption-apache-airflow-postgresql-grafana/assets/state_3_normal.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bartosz25/data-engineering-design-patterns-book/be54bbc3d52b96b592604d1b6b5d087a8ee9d6e1/chapter-10/01-data-detectors/01-flow-interruption-apache-airflow-postgresql-grafana/assets/state_3_normal.png -------------------------------------------------------------------------------- /chapter-10/01-data-detectors/01-flow-interruption-apache-airflow-postgresql-grafana/assets/state_history.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bartosz25/data-engineering-design-patterns-book/be54bbc3d52b96b592604d1b6b5d087a8ee9d6e1/chapter-10/01-data-detectors/01-flow-interruption-apache-airflow-postgresql-grafana/assets/state_history.png -------------------------------------------------------------------------------- /chapter-10/01-data-detectors/01-flow-interruption-apache-airflow-postgresql-grafana/dags/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bartosz25/data-engineering-design-patterns-book/be54bbc3d52b96b592604d1b6b5d087a8ee9d6e1/chapter-10/01-data-detectors/01-flow-interruption-apache-airflow-postgresql-grafana/dags/__init__.py -------------------------------------------------------------------------------- 
/chapter-10/01-data-detectors/01-flow-interruption-apache-airflow-postgresql-grafana/dags/lib/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bartosz25/data-engineering-design-patterns-book/be54bbc3d52b96b592604d1b6b5d087a8ee9d6e1/chapter-10/01-data-detectors/01-flow-interruption-apache-airflow-postgresql-grafana/dags/lib/__init__.py -------------------------------------------------------------------------------- /chapter-10/01-data-detectors/01-flow-interruption-apache-airflow-postgresql-grafana/requirements.txt: -------------------------------------------------------------------------------- 1 | apache-airflow==2.7.3 2 | apache-airflow-providers-postgres==5.8.0 3 | -------------------------------------------------------------------------------- /chapter-10/01-data-detectors/01-flow-interruption-apache-airflow-postgresql-grafana/sql/load_file_to_visits_table.sql: -------------------------------------------------------------------------------- 1 | DELETE FROM dedp.visits_flattened WHERE execution_time = '{{ execution_date }}'; 2 | 3 | COPY dedp.visits_flattened FROM '/data_to_load/date={{ ds }}/dataset.csv' CSV DELIMITER ';' HEADER; -------------------------------------------------------------------------------- /chapter-10/01-data-detectors/01-flow-interruption-detector-apache-kafka-grafana/assets/alert_condition.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bartosz25/data-engineering-design-patterns-book/be54bbc3d52b96b592604d1b6b5d087a8ee9d6e1/chapter-10/01-data-detectors/01-flow-interruption-detector-apache-kafka-grafana/assets/alert_condition.png -------------------------------------------------------------------------------- /chapter-10/01-data-detectors/01-flow-interruption-detector-apache-kafka-grafana/assets/alert_setting.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/bartosz25/data-engineering-design-patterns-book/be54bbc3d52b96b592604d1b6b5d087a8ee9d6e1/chapter-10/01-data-detectors/01-flow-interruption-detector-apache-kafka-grafana/assets/alert_setting.png -------------------------------------------------------------------------------- /chapter-10/01-data-detectors/01-flow-interruption-detector-apache-kafka-grafana/assets/alerting_state.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bartosz25/data-engineering-design-patterns-book/be54bbc3d52b96b592604d1b6b5d087a8ee9d6e1/chapter-10/01-data-detectors/01-flow-interruption-detector-apache-kafka-grafana/assets/alerting_state.png -------------------------------------------------------------------------------- /chapter-10/01-data-detectors/01-flow-interruption-detector-apache-kafka-grafana/assets/state_firing.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bartosz25/data-engineering-design-patterns-book/be54bbc3d52b96b592604d1b6b5d087a8ee9d6e1/chapter-10/01-data-detectors/01-flow-interruption-detector-apache-kafka-grafana/assets/state_firing.png -------------------------------------------------------------------------------- /chapter-10/01-data-detectors/01-flow-interruption-detector-apache-kafka-grafana/assets/state_history.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bartosz25/data-engineering-design-patterns-book/be54bbc3d52b96b592604d1b6b5d087a8ee9d6e1/chapter-10/01-data-detectors/01-flow-interruption-detector-apache-kafka-grafana/assets/state_history.png -------------------------------------------------------------------------------- /chapter-10/01-data-detectors/01-flow-interruption-detector-apache-kafka-grafana/assets/state_normal_1.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/bartosz25/data-engineering-design-patterns-book/be54bbc3d52b96b592604d1b6b5d087a8ee9d6e1/chapter-10/01-data-detectors/01-flow-interruption-detector-apache-kafka-grafana/assets/state_normal_1.png -------------------------------------------------------------------------------- /chapter-10/01-data-detectors/01-flow-interruption-detector-apache-kafka-grafana/assets/state_normal_2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bartosz25/data-engineering-design-patterns-book/be54bbc3d52b96b592604d1b6b5d087a8ee9d6e1/chapter-10/01-data-detectors/01-flow-interruption-detector-apache-kafka-grafana/assets/state_normal_2.png -------------------------------------------------------------------------------- /chapter-10/01-data-detectors/01-flow-interruption-detector-apache-kafka-grafana/assets/state_pending.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bartosz25/data-engineering-design-patterns-book/be54bbc3d52b96b592604d1b6b5d087a8ee9d6e1/chapter-10/01-data-detectors/01-flow-interruption-detector-apache-kafka-grafana/assets/state_pending.png -------------------------------------------------------------------------------- /chapter-10/01-data-detectors/01-flow-interruption-detector-apache-kafka-grafana/docker/kafka_healthcheck.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | JMX_PORT= 3 | kafka-topics.sh --bootstrap-server=localhost:9092 --list 4 | -------------------------------------------------------------------------------- /chapter-10/01-data-detectors/01-flow-interruption-detector-apache-kafka-grafana/requirements.txt: -------------------------------------------------------------------------------- 1 | pyspark==3.5.0 2 | pandas==2.0.2 3 | pyarrow==14.0.1 
-------------------------------------------------------------------------------- /chapter-10/01-data-detectors/01-flow-interruption-detector-delta-lake-grafana/assets/alert_b_part.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bartosz25/data-engineering-design-patterns-book/be54bbc3d52b96b592604d1b6b5d087a8ee9d6e1/chapter-10/01-data-detectors/01-flow-interruption-detector-delta-lake-grafana/assets/alert_b_part.png -------------------------------------------------------------------------------- /chapter-10/01-data-detectors/01-flow-interruption-detector-delta-lake-grafana/assets/alert_conditions_part.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bartosz25/data-engineering-design-patterns-book/be54bbc3d52b96b592604d1b6b5d087a8ee9d6e1/chapter-10/01-data-detectors/01-flow-interruption-detector-delta-lake-grafana/assets/alert_conditions_part.png -------------------------------------------------------------------------------- /chapter-10/01-data-detectors/01-flow-interruption-detector-delta-lake-grafana/assets/alert_firing.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bartosz25/data-engineering-design-patterns-book/be54bbc3d52b96b592604d1b6b5d087a8ee9d6e1/chapter-10/01-data-detectors/01-flow-interruption-detector-delta-lake-grafana/assets/alert_firing.png -------------------------------------------------------------------------------- /chapter-10/01-data-detectors/01-flow-interruption-detector-delta-lake-grafana/assets/alert_history.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bartosz25/data-engineering-design-patterns-book/be54bbc3d52b96b592604d1b6b5d087a8ee9d6e1/chapter-10/01-data-detectors/01-flow-interruption-detector-delta-lake-grafana/assets/alert_history.png 
-------------------------------------------------------------------------------- /chapter-10/01-data-detectors/01-flow-interruption-detector-delta-lake-grafana/assets/alert_normal.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bartosz25/data-engineering-design-patterns-book/be54bbc3d52b96b592604d1b6b5d087a8ee9d6e1/chapter-10/01-data-detectors/01-flow-interruption-detector-delta-lake-grafana/assets/alert_normal.png -------------------------------------------------------------------------------- /chapter-10/01-data-detectors/01-flow-interruption-detector-delta-lake-grafana/assets/alert_pending.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bartosz25/data-engineering-design-patterns-book/be54bbc3d52b96b592604d1b6b5d087a8ee9d6e1/chapter-10/01-data-detectors/01-flow-interruption-detector-delta-lake-grafana/assets/alert_pending.png -------------------------------------------------------------------------------- /chapter-10/01-data-detectors/01-flow-interruption-detector-delta-lake-grafana/docker/kafka_healthcheck.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | JMX_PORT= 3 | kafka-topics.sh --bootstrap-server=localhost:9092 --list 4 | -------------------------------------------------------------------------------- /chapter-10/01-data-detectors/01-flow-interruption-detector-delta-lake-grafana/requirements.txt: -------------------------------------------------------------------------------- 1 | pyspark==3.5.0 2 | prometheus-client==0.20.0 -------------------------------------------------------------------------------- /chapter-10/01-data-detectors/02-skew-detector-apache-airflow-postgresql/assets/failed_executions.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/bartosz25/data-engineering-design-patterns-book/be54bbc3d52b96b592604d1b6b5d087a8ee9d6e1/chapter-10/01-data-detectors/02-skew-detector-apache-airflow-postgresql/assets/failed_executions.png -------------------------------------------------------------------------------- /chapter-10/01-data-detectors/02-skew-detector-apache-airflow-postgresql/assets/sensor_clear.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bartosz25/data-engineering-design-patterns-book/be54bbc3d52b96b592604d1b6b5d087a8ee9d6e1/chapter-10/01-data-detectors/02-skew-detector-apache-airflow-postgresql/assets/sensor_clear.png -------------------------------------------------------------------------------- /chapter-10/01-data-detectors/02-skew-detector-apache-airflow-postgresql/dags/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bartosz25/data-engineering-design-patterns-book/be54bbc3d52b96b592604d1b6b5d087a8ee9d6e1/chapter-10/01-data-detectors/02-skew-detector-apache-airflow-postgresql/dags/__init__.py -------------------------------------------------------------------------------- /chapter-10/01-data-detectors/02-skew-detector-apache-airflow-postgresql/dags/lib/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bartosz25/data-engineering-design-patterns-book/be54bbc3d52b96b592604d1b6b5d087a8ee9d6e1/chapter-10/01-data-detectors/02-skew-detector-apache-airflow-postgresql/dags/lib/__init__.py -------------------------------------------------------------------------------- /chapter-10/01-data-detectors/02-skew-detector-apache-airflow-postgresql/requirements.txt: -------------------------------------------------------------------------------- 1 | apache-airflow==2.9.0 2 | apache-airflow-providers-postgres==5.12.0 3 | 
-------------------------------------------------------------------------------- /chapter-10/01-data-detectors/02-skew-detector-apache-airflow-postgresql/sql/load_file_to_visits_table.sql: -------------------------------------------------------------------------------- 1 | DELETE FROM dedp.visits_flattened WHERE execution_time = '{{ execution_date }}'; 2 | 3 | COPY dedp.visits_flattened FROM '/data_to_load/date={{ ds }}/dataset.csv' CSV DELIMITER ';' HEADER; -------------------------------------------------------------------------------- /chapter-10/01-data-detectors/02-skew-detector-apache-kafka-grafana/assets/alert_firing.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bartosz25/data-engineering-design-patterns-book/be54bbc3d52b96b592604d1b6b5d087a8ee9d6e1/chapter-10/01-data-detectors/02-skew-detector-apache-kafka-grafana/assets/alert_firing.png -------------------------------------------------------------------------------- /chapter-10/01-data-detectors/02-skew-detector-apache-kafka-grafana/assets/alert_history.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bartosz25/data-engineering-design-patterns-book/be54bbc3d52b96b592604d1b6b5d087a8ee9d6e1/chapter-10/01-data-detectors/02-skew-detector-apache-kafka-grafana/assets/alert_history.png -------------------------------------------------------------------------------- /chapter-10/01-data-detectors/02-skew-detector-apache-kafka-grafana/assets/alert_normal.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bartosz25/data-engineering-design-patterns-book/be54bbc3d52b96b592604d1b6b5d087a8ee9d6e1/chapter-10/01-data-detectors/02-skew-detector-apache-kafka-grafana/assets/alert_normal.png -------------------------------------------------------------------------------- 
/chapter-10/01-data-detectors/02-skew-detector-apache-kafka-grafana/assets/alert_pending.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bartosz25/data-engineering-design-patterns-book/be54bbc3d52b96b592604d1b6b5d087a8ee9d6e1/chapter-10/01-data-detectors/02-skew-detector-apache-kafka-grafana/assets/alert_pending.png -------------------------------------------------------------------------------- /chapter-10/01-data-detectors/02-skew-detector-apache-kafka-grafana/assets/alert_status.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bartosz25/data-engineering-design-patterns-book/be54bbc3d52b96b592604d1b6b5d087a8ee9d6e1/chapter-10/01-data-detectors/02-skew-detector-apache-kafka-grafana/assets/alert_status.png -------------------------------------------------------------------------------- /chapter-10/01-data-detectors/02-skew-detector-apache-kafka-grafana/assets/config_b.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bartosz25/data-engineering-design-patterns-book/be54bbc3d52b96b592604d1b6b5d087a8ee9d6e1/chapter-10/01-data-detectors/02-skew-detector-apache-kafka-grafana/assets/config_b.png -------------------------------------------------------------------------------- /chapter-10/01-data-detectors/02-skew-detector-apache-kafka-grafana/assets/config_conditions.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bartosz25/data-engineering-design-patterns-book/be54bbc3d52b96b592604d1b6b5d087a8ee9d6e1/chapter-10/01-data-detectors/02-skew-detector-apache-kafka-grafana/assets/config_conditions.png -------------------------------------------------------------------------------- /chapter-10/01-data-detectors/02-skew-detector-apache-kafka-grafana/docker/kafka_healthcheck.sh: 
-------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | JMX_PORT= 3 | kafka-topics.sh --bootstrap-server=localhost:9092 --list 4 | -------------------------------------------------------------------------------- /chapter-10/01-data-detectors/02-skew-detector-apache-kafka-grafana/requirements.txt: -------------------------------------------------------------------------------- 1 | pyspark==3.5.0 2 | pandas==2.0.2 3 | pyarrow==14.0.1 -------------------------------------------------------------------------------- /chapter-10/01-data-detectors/02-skew-detector-postgresql-grafana/assets/alert_config_b_part.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bartosz25/data-engineering-design-patterns-book/be54bbc3d52b96b592604d1b6b5d087a8ee9d6e1/chapter-10/01-data-detectors/02-skew-detector-postgresql-grafana/assets/alert_config_b_part.png -------------------------------------------------------------------------------- /chapter-10/01-data-detectors/02-skew-detector-postgresql-grafana/assets/alert_config_c_part.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bartosz25/data-engineering-design-patterns-book/be54bbc3d52b96b592604d1b6b5d087a8ee9d6e1/chapter-10/01-data-detectors/02-skew-detector-postgresql-grafana/assets/alert_config_c_part.png -------------------------------------------------------------------------------- /chapter-10/01-data-detectors/02-skew-detector-postgresql-grafana/assets/state_1_pending.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bartosz25/data-engineering-design-patterns-book/be54bbc3d52b96b592604d1b6b5d087a8ee9d6e1/chapter-10/01-data-detectors/02-skew-detector-postgresql-grafana/assets/state_1_pending.png -------------------------------------------------------------------------------- 
/chapter-10/01-data-detectors/02-skew-detector-postgresql-grafana/assets/state_2_firing.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bartosz25/data-engineering-design-patterns-book/be54bbc3d52b96b592604d1b6b5d087a8ee9d6e1/chapter-10/01-data-detectors/02-skew-detector-postgresql-grafana/assets/state_2_firing.png -------------------------------------------------------------------------------- /chapter-10/01-data-detectors/02-skew-detector-postgresql-grafana/assets/state_3_normal.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bartosz25/data-engineering-design-patterns-book/be54bbc3d52b96b592604d1b6b5d087a8ee9d6e1/chapter-10/01-data-detectors/02-skew-detector-postgresql-grafana/assets/state_3_normal.png -------------------------------------------------------------------------------- /chapter-10/01-data-detectors/02-skew-detector-postgresql-grafana/assets/state_history.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bartosz25/data-engineering-design-patterns-book/be54bbc3d52b96b592604d1b6b5d087a8ee9d6e1/chapter-10/01-data-detectors/02-skew-detector-postgresql-grafana/assets/state_history.png -------------------------------------------------------------------------------- /chapter-10/01-data-detectors/02-skew-detector-postgresql-grafana/docker/init.sql: -------------------------------------------------------------------------------- 1 | CREATE SCHEMA dedp; 2 | -------------------------------------------------------------------------------- /chapter-10/02-time-detectors/01-lag-detector-apache-spark-apache-kafka-grafana/assets/alert_b_part.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/bartosz25/data-engineering-design-patterns-book/be54bbc3d52b96b592604d1b6b5d087a8ee9d6e1/chapter-10/02-time-detectors/01-lag-detector-apache-spark-apache-kafka-grafana/assets/alert_b_part.png -------------------------------------------------------------------------------- /chapter-10/02-time-detectors/01-lag-detector-apache-spark-apache-kafka-grafana/assets/alert_conditions_part.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bartosz25/data-engineering-design-patterns-book/be54bbc3d52b96b592604d1b6b5d087a8ee9d6e1/chapter-10/02-time-detectors/01-lag-detector-apache-spark-apache-kafka-grafana/assets/alert_conditions_part.png -------------------------------------------------------------------------------- /chapter-10/02-time-detectors/01-lag-detector-apache-spark-apache-kafka-grafana/assets/alert_firing.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bartosz25/data-engineering-design-patterns-book/be54bbc3d52b96b592604d1b6b5d087a8ee9d6e1/chapter-10/02-time-detectors/01-lag-detector-apache-spark-apache-kafka-grafana/assets/alert_firing.png -------------------------------------------------------------------------------- /chapter-10/02-time-detectors/01-lag-detector-apache-spark-apache-kafka-grafana/assets/alert_history.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bartosz25/data-engineering-design-patterns-book/be54bbc3d52b96b592604d1b6b5d087a8ee9d6e1/chapter-10/02-time-detectors/01-lag-detector-apache-spark-apache-kafka-grafana/assets/alert_history.png -------------------------------------------------------------------------------- /chapter-10/02-time-detectors/01-lag-detector-apache-spark-apache-kafka-grafana/assets/alert_normal.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/bartosz25/data-engineering-design-patterns-book/be54bbc3d52b96b592604d1b6b5d087a8ee9d6e1/chapter-10/02-time-detectors/01-lag-detector-apache-spark-apache-kafka-grafana/assets/alert_normal.png -------------------------------------------------------------------------------- /chapter-10/02-time-detectors/01-lag-detector-apache-spark-apache-kafka-grafana/assets/alert_pending.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bartosz25/data-engineering-design-patterns-book/be54bbc3d52b96b592604d1b6b5d087a8ee9d6e1/chapter-10/02-time-detectors/01-lag-detector-apache-spark-apache-kafka-grafana/assets/alert_pending.png -------------------------------------------------------------------------------- /chapter-10/02-time-detectors/01-lag-detector-apache-spark-apache-kafka-grafana/docker/kafka_healthcheck.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | JMX_PORT= 3 | kafka-topics.sh --bootstrap-server=localhost:9092 --list 4 | -------------------------------------------------------------------------------- /chapter-10/02-time-detectors/01-lag-detector-apache-spark-apache-kafka-grafana/requirements.txt: -------------------------------------------------------------------------------- 1 | pyspark==3.5.0 2 | prometheus-client==0.20.0 3 | confluent-kafka==2.0.2 -------------------------------------------------------------------------------- /chapter-10/02-time-detectors/01-lag-detector-delta-lake-apache-spark-grafana/assets/alert_b_part.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bartosz25/data-engineering-design-patterns-book/be54bbc3d52b96b592604d1b6b5d087a8ee9d6e1/chapter-10/02-time-detectors/01-lag-detector-delta-lake-apache-spark-grafana/assets/alert_b_part.png -------------------------------------------------------------------------------- 
/chapter-10/02-time-detectors/01-lag-detector-delta-lake-apache-spark-grafana/assets/alert_conditions_part.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bartosz25/data-engineering-design-patterns-book/be54bbc3d52b96b592604d1b6b5d087a8ee9d6e1/chapter-10/02-time-detectors/01-lag-detector-delta-lake-apache-spark-grafana/assets/alert_conditions_part.png -------------------------------------------------------------------------------- /chapter-10/02-time-detectors/01-lag-detector-delta-lake-apache-spark-grafana/assets/alert_firing.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bartosz25/data-engineering-design-patterns-book/be54bbc3d52b96b592604d1b6b5d087a8ee9d6e1/chapter-10/02-time-detectors/01-lag-detector-delta-lake-apache-spark-grafana/assets/alert_firing.png -------------------------------------------------------------------------------- /chapter-10/02-time-detectors/01-lag-detector-delta-lake-apache-spark-grafana/assets/alert_history.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bartosz25/data-engineering-design-patterns-book/be54bbc3d52b96b592604d1b6b5d087a8ee9d6e1/chapter-10/02-time-detectors/01-lag-detector-delta-lake-apache-spark-grafana/assets/alert_history.png -------------------------------------------------------------------------------- /chapter-10/02-time-detectors/01-lag-detector-delta-lake-apache-spark-grafana/assets/alert_normal.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bartosz25/data-engineering-design-patterns-book/be54bbc3d52b96b592604d1b6b5d087a8ee9d6e1/chapter-10/02-time-detectors/01-lag-detector-delta-lake-apache-spark-grafana/assets/alert_normal.png -------------------------------------------------------------------------------- 
/chapter-10/02-time-detectors/01-lag-detector-delta-lake-apache-spark-grafana/assets/alert_pending.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bartosz25/data-engineering-design-patterns-book/be54bbc3d52b96b592604d1b6b5d087a8ee9d6e1/chapter-10/02-time-detectors/01-lag-detector-delta-lake-apache-spark-grafana/assets/alert_pending.png -------------------------------------------------------------------------------- /chapter-10/02-time-detectors/01-lag-detector-delta-lake-apache-spark-grafana/assets/explore_metrics_reader.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bartosz25/data-engineering-design-patterns-book/be54bbc3d52b96b592604d1b6b5d087a8ee9d6e1/chapter-10/02-time-detectors/01-lag-detector-delta-lake-apache-spark-grafana/assets/explore_metrics_reader.png -------------------------------------------------------------------------------- /chapter-10/02-time-detectors/01-lag-detector-delta-lake-apache-spark-grafana/assets/explore_metrics_writer.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bartosz25/data-engineering-design-patterns-book/be54bbc3d52b96b592604d1b6b5d087a8ee9d6e1/chapter-10/02-time-detectors/01-lag-detector-delta-lake-apache-spark-grafana/assets/explore_metrics_writer.png -------------------------------------------------------------------------------- /chapter-10/02-time-detectors/01-lag-detector-delta-lake-apache-spark-grafana/assets/writer_version_increased.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bartosz25/data-engineering-design-patterns-book/be54bbc3d52b96b592604d1b6b5d087a8ee9d6e1/chapter-10/02-time-detectors/01-lag-detector-delta-lake-apache-spark-grafana/assets/writer_version_increased.png 
-------------------------------------------------------------------------------- /chapter-10/02-time-detectors/01-lag-detector-delta-lake-apache-spark-grafana/requirements.txt: -------------------------------------------------------------------------------- 1 | delta-spark==3.0.0 2 | pyspark==3.5.0 3 | prometheus-client==0.20.0 -------------------------------------------------------------------------------- /chapter-10/02-time-detectors/02-sla-misses-apache-airflow/assets/sla_table.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bartosz25/data-engineering-design-patterns-book/be54bbc3d52b96b592604d1b6b5d087a8ee9d6e1/chapter-10/02-time-detectors/02-sla-misses-apache-airflow/assets/sla_table.png -------------------------------------------------------------------------------- /chapter-10/02-time-detectors/02-sla-misses-apache-airflow/dags/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bartosz25/data-engineering-design-patterns-book/be54bbc3d52b96b592604d1b6b5d087a8ee9d6e1/chapter-10/02-time-detectors/02-sla-misses-apache-airflow/dags/__init__.py -------------------------------------------------------------------------------- /chapter-10/02-time-detectors/02-sla-misses-apache-airflow/requirements.txt: -------------------------------------------------------------------------------- 1 | apache-airflow==2.7.3 2 | apache-airflow-providers-postgres==5.8.0 3 | -------------------------------------------------------------------------------- /chapter-10/02-time-detectors/02-sla-misses-apache-flink/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bartosz25/data-engineering-design-patterns-book/be54bbc3d52b96b592604d1b6b5d087a8ee9d6e1/chapter-10/02-time-detectors/02-sla-misses-apache-flink/__init__.py 
-------------------------------------------------------------------------------- /chapter-10/02-time-detectors/02-sla-misses-apache-flink/assets/alert_a_b_parts.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bartosz25/data-engineering-design-patterns-book/be54bbc3d52b96b592604d1b6b5d087a8ee9d6e1/chapter-10/02-time-detectors/02-sla-misses-apache-flink/assets/alert_a_b_parts.png -------------------------------------------------------------------------------- /chapter-10/02-time-detectors/02-sla-misses-apache-flink/assets/alert_conditions_part.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bartosz25/data-engineering-design-patterns-book/be54bbc3d52b96b592604d1b6b5d087a8ee9d6e1/chapter-10/02-time-detectors/02-sla-misses-apache-flink/assets/alert_conditions_part.png -------------------------------------------------------------------------------- /chapter-10/02-time-detectors/02-sla-misses-apache-flink/assets/alert_firing.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bartosz25/data-engineering-design-patterns-book/be54bbc3d52b96b592604d1b6b5d087a8ee9d6e1/chapter-10/02-time-detectors/02-sla-misses-apache-flink/assets/alert_firing.png -------------------------------------------------------------------------------- /chapter-10/02-time-detectors/02-sla-misses-apache-flink/assets/alert_history.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bartosz25/data-engineering-design-patterns-book/be54bbc3d52b96b592604d1b6b5d087a8ee9d6e1/chapter-10/02-time-detectors/02-sla-misses-apache-flink/assets/alert_history.png -------------------------------------------------------------------------------- /chapter-10/02-time-detectors/02-sla-misses-apache-flink/assets/alert_normal.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/bartosz25/data-engineering-design-patterns-book/be54bbc3d52b96b592604d1b6b5d087a8ee9d6e1/chapter-10/02-time-detectors/02-sla-misses-apache-flink/assets/alert_normal.png -------------------------------------------------------------------------------- /chapter-10/02-time-detectors/02-sla-misses-apache-flink/assets/alert_pending.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bartosz25/data-engineering-design-patterns-book/be54bbc3d52b96b592604d1b6b5d087a8ee9d6e1/chapter-10/02-time-detectors/02-sla-misses-apache-flink/assets/alert_pending.png -------------------------------------------------------------------------------- /chapter-10/02-time-detectors/02-sla-misses-apache-flink/assets/es_config.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bartosz25/data-engineering-design-patterns-book/be54bbc3d52b96b592604d1b6b5d087a8ee9d6e1/chapter-10/02-time-detectors/02-sla-misses-apache-flink/assets/es_config.png -------------------------------------------------------------------------------- /chapter-10/02-time-detectors/02-sla-misses-apache-flink/docker/kafka_healthcheck.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | JMX_PORT= 3 | kafka-topics.sh --bootstrap-server=localhost:9092 --list 4 | -------------------------------------------------------------------------------- /chapter-10/02-time-detectors/02-sla-misses-apache-flink/flink-connector-base-1.18.0.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bartosz25/data-engineering-design-patterns-book/be54bbc3d52b96b592604d1b6b5d087a8ee9d6e1/chapter-10/02-time-detectors/02-sla-misses-apache-flink/flink-connector-base-1.18.0.jar 
-------------------------------------------------------------------------------- /chapter-10/02-time-detectors/02-sla-misses-apache-flink/flink-connector-kafka-3.1.0-1.18.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bartosz25/data-engineering-design-patterns-book/be54bbc3d52b96b592604d1b6b5d087a8ee9d6e1/chapter-10/02-time-detectors/02-sla-misses-apache-flink/flink-connector-kafka-3.1.0-1.18.jar -------------------------------------------------------------------------------- /chapter-10/02-time-detectors/02-sla-misses-apache-flink/flink-metrics-prometheus-1.18.0.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bartosz25/data-engineering-design-patterns-book/be54bbc3d52b96b592604d1b6b5d087a8ee9d6e1/chapter-10/02-time-detectors/02-sla-misses-apache-flink/flink-metrics-prometheus-1.18.0.jar -------------------------------------------------------------------------------- /chapter-10/02-time-detectors/02-sla-misses-apache-flink/flink-sql-connector-kafka-3.1.0-1.18.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bartosz25/data-engineering-design-patterns-book/be54bbc3d52b96b592604d1b6b5d087a8ee9d6e1/chapter-10/02-time-detectors/02-sla-misses-apache-flink/flink-sql-connector-kafka-3.1.0-1.18.jar -------------------------------------------------------------------------------- /chapter-10/02-time-detectors/02-sla-misses-apache-flink/kafka-clients-3.7.0.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bartosz25/data-engineering-design-patterns-book/be54bbc3d52b96b592604d1b6b5d087a8ee9d6e1/chapter-10/02-time-detectors/02-sla-misses-apache-flink/kafka-clients-3.7.0.jar -------------------------------------------------------------------------------- 
/chapter-10/02-time-detectors/02-sla-misses-apache-flink/requirements.txt: -------------------------------------------------------------------------------- 1 | apache-flink==1.18.0 2 | numpy==1.26.4 3 | elasticsearch==8.6.2 -------------------------------------------------------------------------------- /chapter-10/02-time-detectors/02-sla-misses-apache-spark-structured-streaming/assets/alert_b_part.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bartosz25/data-engineering-design-patterns-book/be54bbc3d52b96b592604d1b6b5d087a8ee9d6e1/chapter-10/02-time-detectors/02-sla-misses-apache-spark-structured-streaming/assets/alert_b_part.png -------------------------------------------------------------------------------- /chapter-10/02-time-detectors/02-sla-misses-apache-spark-structured-streaming/assets/alert_conditions_part.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bartosz25/data-engineering-design-patterns-book/be54bbc3d52b96b592604d1b6b5d087a8ee9d6e1/chapter-10/02-time-detectors/02-sla-misses-apache-spark-structured-streaming/assets/alert_conditions_part.png -------------------------------------------------------------------------------- /chapter-10/02-time-detectors/02-sla-misses-apache-spark-structured-streaming/assets/alert_firing.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bartosz25/data-engineering-design-patterns-book/be54bbc3d52b96b592604d1b6b5d087a8ee9d6e1/chapter-10/02-time-detectors/02-sla-misses-apache-spark-structured-streaming/assets/alert_firing.png -------------------------------------------------------------------------------- /chapter-10/02-time-detectors/02-sla-misses-apache-spark-structured-streaming/assets/alert_history.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/bartosz25/data-engineering-design-patterns-book/be54bbc3d52b96b592604d1b6b5d087a8ee9d6e1/chapter-10/02-time-detectors/02-sla-misses-apache-spark-structured-streaming/assets/alert_history.png -------------------------------------------------------------------------------- /chapter-10/02-time-detectors/02-sla-misses-apache-spark-structured-streaming/assets/alert_normal.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bartosz25/data-engineering-design-patterns-book/be54bbc3d52b96b592604d1b6b5d087a8ee9d6e1/chapter-10/02-time-detectors/02-sla-misses-apache-spark-structured-streaming/assets/alert_normal.png -------------------------------------------------------------------------------- /chapter-10/02-time-detectors/02-sla-misses-apache-spark-structured-streaming/assets/alert_pending.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bartosz25/data-engineering-design-patterns-book/be54bbc3d52b96b592604d1b6b5d087a8ee9d6e1/chapter-10/02-time-detectors/02-sla-misses-apache-spark-structured-streaming/assets/alert_pending.png -------------------------------------------------------------------------------- /chapter-10/02-time-detectors/02-sla-misses-apache-spark-structured-streaming/docker/kafka_healthcheck.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | JMX_PORT= 3 | kafka-topics.sh --bootstrap-server=localhost:9092 --list 4 | -------------------------------------------------------------------------------- /chapter-10/02-time-detectors/02-sla-misses-apache-spark-structured-streaming/requirements.txt: -------------------------------------------------------------------------------- 1 | pyspark==3.5.0 2 | prometheus-client==0.20.0 -------------------------------------------------------------------------------- 
/chapter-10/03-data-lineage/01-dataset-tracker-openlineage-apache-airflow-marquez/assets/tables_lineage.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bartosz25/data-engineering-design-patterns-book/be54bbc3d52b96b592604d1b6b5d087a8ee9d6e1/chapter-10/03-data-lineage/01-dataset-tracker-openlineage-apache-airflow-marquez/assets/tables_lineage.png -------------------------------------------------------------------------------- /chapter-10/03-data-lineage/01-dataset-tracker-openlineage-apache-airflow-marquez/dags/macros.py: -------------------------------------------------------------------------------- 1 | def get_table_name(ds_nodash: str) -> str: 2 | return 'devices_' + ds_nodash 3 | 4 | -------------------------------------------------------------------------------- /chapter-10/03-data-lineage/01-dataset-tracker-openlineage-apache-airflow-marquez/dataset/generation_configuration.yaml: -------------------------------------------------------------------------------- 1 | dataset: 2 | rows: 50 3 | composition_percentage: 4 | duplicates: 0 5 | missing_fields: 0 6 | unprocessable_rows: 0 7 | data_blocker: 8 | type: 'no' 9 | entity: 10 | type: device 11 | generator: 12 | type: one-shot 13 | writer: 14 | type: csv 15 | configuration: 16 | output_path: '/home/data_generator_user/data_generator_output/input' 17 | clean_path: true 18 | -------------------------------------------------------------------------------- /chapter-10/03-data-lineage/01-dataset-tracker-openlineage-apache-airflow-marquez/requirements.txt: -------------------------------------------------------------------------------- 1 | apache-airflow==2.9.0 2 | apache-airflow-providers-postgres==5.12.0 3 | apache-airflow-providers-openlineage==1.11.0 -------------------------------------------------------------------------------- /chapter-10/03-data-lineage/01-dataset-tracker-openlineage-apache-airflow-marquez/sql/refresh_aggregates.sql: 
-------------------------------------------------------------------------------- 1 | TRUNCATE TABLE devices_aggregates; 2 | 3 | 4 | INSERT INTO devices_aggregates 5 | SELECT type, full_name, version, COUNT(*) AS all_occurrences FROM devices_raw GROUP BY type, full_name, version; -------------------------------------------------------------------------------- /chapter-10/03-data-lineage/01-dataset-tracker-openlineage-apache-airflow-marquez/sql/refresh_bi_aggregates.sql: -------------------------------------------------------------------------------- 1 | TRUNCATE TABLE devices_aggregates_bi; 2 | 3 | INSERT INTO devices_aggregates_bi SELECT * FROM devices_aggregates; -------------------------------------------------------------------------------- /chapter-10/03-data-lineage/01-dataset-tracker-openlineage-apache-kafka-apache-spark-marquez/assets/topics_lineage.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bartosz25/data-engineering-design-patterns-book/be54bbc3d52b96b592604d1b6b5d087a8ee9d6e1/chapter-10/03-data-lineage/01-dataset-tracker-openlineage-apache-kafka-apache-spark-marquez/assets/topics_lineage.png -------------------------------------------------------------------------------- /chapter-10/03-data-lineage/01-dataset-tracker-openlineage-apache-kafka-apache-spark-marquez/config.py: -------------------------------------------------------------------------------- 1 | def get_base_dir() -> str: 2 | return '/tmp/dedp/ch10/03-data-lineage/01-dataset-tracker-openlineage-apache-kafka-apache-spark-marquez/' 3 | -------------------------------------------------------------------------------- /chapter-10/03-data-lineage/01-dataset-tracker-openlineage-apache-kafka-apache-spark-marquez/requirements.txt: -------------------------------------------------------------------------------- 1 | pyspark==3.5.0 2 | pandas==2.0.2 3 | pyarrow==14.0.1 4 | confluent-kafka==2.0.2 5 | elasticsearch==8.6.2 6 | 
ydata-profiling[pyspark]==4.9.0 -------------------------------------------------------------------------------- /chapter-10/03-data-lineage/02-fine-grained-tracker-apache-spark-apache-kafka/config.py: -------------------------------------------------------------------------------- 1 | def get_base_dir() -> str: 2 | return '/tmp/dedp/ch10/03-data-lineage/02-fine-grained-tracker-apache-spark-apache-kafka/' 3 | -------------------------------------------------------------------------------- /chapter-10/03-data-lineage/02-fine-grained-tracker-apache-spark-apache-kafka/requirements.txt: -------------------------------------------------------------------------------- 1 | pyspark==3.5.0 2 | delta-spark==3.0.0 3 | -------------------------------------------------------------------------------- /chapter-10/03-data-lineage/02-fine-grained-tracker-apache-spark-openlineage-marquez/dataset_b_writer_job.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bartosz25/data-engineering-design-patterns-book/be54bbc3d52b96b592604d1b6b5d087a8ee9d6e1/chapter-10/03-data-lineage/02-fine-grained-tracker-apache-spark-openlineage-marquez/dataset_b_writer_job.py -------------------------------------------------------------------------------- /chapter-10/03-data-lineage/02-fine-grained-tracker-apache-spark-openlineage-marquez/requirements.txt: -------------------------------------------------------------------------------- 1 | delta-spark==3.0.0 2 | pyspark==3.5.0 --------------------------------------------------------------------------------