├── .gitignore ├── 01-docker-terraform ├── 1_terraform_gcp │ ├── 1_terraform_overview.md │ ├── 2_gcp_overview.md │ ├── README.md │ ├── terraform │ │ ├── README.md │ │ ├── terraform_basic │ │ │ └── main.tf │ │ └── terraform_with_variables │ │ │ ├── main.tf │ │ │ └── variables.tf │ └── windows.md ├── 2_docker_sql │ ├── .gitignore │ ├── Dockerfile │ ├── README.md │ ├── data-loading-parquet.ipynb │ ├── data-loading-parquet.py │ ├── docker-compose.yaml │ ├── ingest_data.py │ ├── pg-test-connection.ipynb │ ├── pipeline.py │ └── upload-data.ipynb └── README.md ├── 02-workflow-orchestration ├── README.md ├── docker │ ├── combined │ │ └── docker-compose.yml │ ├── kestra │ │ └── docker-compose.yml │ └── postgres │ │ └── docker-compose.yml ├── flows │ ├── 01_getting_started_data_pipeline.yaml │ ├── 02_postgres_taxi.yaml │ ├── 02_postgres_taxi_scheduled.yaml │ ├── 03_postgres_dbt.yaml │ ├── 04_gcp_kv.yaml │ ├── 05_gcp_setup.yaml │ ├── 06_gcp_taxi.yaml │ ├── 06_gcp_taxi_scheduled.yaml │ └── 07_gcp_dbt.yaml └── images │ └── homework.png ├── 03-data-warehouse ├── README.md ├── big_query.sql ├── big_query_hw.sql ├── big_query_ml.sql ├── extract_model.md └── extras │ ├── README.md │ └── web_to_gcs.py ├── 04-analytics-engineering ├── README.md ├── SQL_refresher.md ├── dbt_cloud_setup.md ├── docker_setup │ ├── Dockerfile │ ├── README.md │ └── docker-compose.yaml └── taxi_rides_ny │ ├── .gitignore │ ├── .gitkeep │ ├── README.md │ ├── analyses │ ├── .gitkeep │ └── hack-load-data.sql │ ├── dbt_project.yml │ ├── macros │ ├── .gitkeep │ ├── get_payment_type_description.sql │ └── macros_properties.yml │ ├── models │ ├── core │ │ ├── dim_zones.sql │ │ ├── dm_monthly_zone_revenue.sql │ │ ├── fact_trips.sql │ │ └── schema.yml │ └── staging │ │ ├── schema.yml │ │ ├── stg_green_tripdata.sql │ │ └── stg_yellow_tripdata.sql │ ├── package-lock.yml │ ├── packages.yml │ ├── seeds │ ├── .gitkeep │ ├── seeds_properties.yml │ └── taxi_zone_lookup.csv │ └── snapshots │ └── .gitkeep ├── 05-batch ├── .gitignore ├── README.md ├── code │ ├── 03_test.ipynb │ ├── 04_pyspark.ipynb │ ├── 05_taxi_schema.ipynb │ ├── 06_spark_sql.ipynb │ ├── 06_spark_sql.py │ ├── 06_spark_sql_big_query.py │ ├── 07_groupby_join.ipynb │ ├── 08_rdds.ipynb │ ├── 09_spark_gcs.ipynb │ ├── cloud.md │ ├── download_data.sh │ └── homework.ipynb └── setup │ ├── config │ ├── core-site.xml │ ├── spark-defaults.conf │ └── spark.dockerfile │ ├── hadoop-yarn.md │ ├── linux.md │ ├── macos.md │ ├── pyspark.md │ └── windows.md ├── 06-streaming ├── .gitignore ├── README.md ├── java │ └── kafka_examples │ │ ├── .gitignore │ │ ├── build.gradle │ │ ├── build │ │ └── generated-main-avro-java │ │ │ └── schemaregistry │ │ │ ├── RideRecord.java │ │ │ ├── RideRecordCompatible.java │ │ │ └── RideRecordNoneCompatible.java │ │ ├── gradle │ │ └── wrapper │ │ │ ├── gradle-wrapper.jar │ │ │ └── gradle-wrapper.properties │ │ ├── gradlew │ │ ├── gradlew.bat │ │ ├── settings.gradle │ │ └── src │ │ ├── main │ │ ├── avro │ │ │ ├── rides.avsc │ │ │ ├── rides_compatible.avsc │ │ │ └── rides_non_compatible.avsc │ │ ├── java │ │ │ └── org │ │ │ │ └── example │ │ │ │ ├── AvroProducer.java │ │ │ │ ├── JsonConsumer.java │ │ │ │ ├── JsonKStream.java │ │ │ │ ├── JsonKStreamJoins.java │ │ │ │ ├── JsonKStreamWindow.java │ │ │ │ ├── JsonProducer.java │ │ │ │ ├── JsonProducerPickupLocation.java │ │ │ │ ├── Secrets.java │ │ │ │ ├── Topics.java │ │ │ │ ├── customserdes │ │ │ │ └── CustomSerdes.java │ │ │ │ └── data │ │ │ │ ├── PickupLocation.java │ │ │ │ ├── Ride.java │ │ │ │ └── VendorInfo.java │ │ └── 
resources │ │ │ └── rides.csv │ │ └── test │ │ └── java │ │ └── org │ │ └── example │ │ ├── JsonKStreamJoinsTest.java │ │ ├── JsonKStreamTest.java │ │ └── helper │ │ └── DataGeneratorHelper.java ├── ksqldb │ └── commands.md ├── pyflink │ ├── .gitignore │ ├── Dockerfile.flink │ ├── LICENSE │ ├── Makefile │ ├── README.md │ ├── docker-compose.yml │ ├── homework.md │ ├── requirements.txt │ └── src │ │ ├── job │ │ ├── aggregation_job.py │ │ ├── start_job.py │ │ └── taxi_job.py │ │ └── producers │ │ ├── load_taxi_data.py │ │ └── producer.py └── python │ ├── README.md │ ├── avro_example │ ├── consumer.py │ ├── producer.py │ ├── ride_record.py │ ├── ride_record_key.py │ └── settings.py │ ├── docker │ ├── README.md │ ├── docker-compose.yml │ ├── kafka │ │ └── docker-compose.yml │ └── spark │ │ ├── build.sh │ │ ├── cluster-base.Dockerfile │ │ ├── docker-compose.yml │ │ ├── jupyterlab.Dockerfile │ │ ├── spark-base.Dockerfile │ │ ├── spark-master.Dockerfile │ │ └── spark-worker.Dockerfile │ ├── json_example │ ├── consumer.py │ ├── producer.py │ ├── ride.py │ └── settings.py │ ├── redpanda_example │ ├── README.md │ ├── consumer.py │ ├── docker-compose.yaml │ ├── producer.py │ ├── ride.py │ └── settings.py │ ├── requirements.txt │ ├── resources │ ├── rides.csv │ └── schemas │ │ ├── taxi_ride_key.avsc │ │ └── taxi_ride_value.avsc │ └── streams-example │ ├── faust │ ├── branch_price.py │ ├── producer_taxi_json.py │ ├── stream.py │ ├── stream_count_vendor_trips.py │ ├── taxi_rides.py │ └── windowing.py │ ├── pyspark │ ├── README.md │ ├── consumer.py │ ├── producer.py │ ├── settings.py │ ├── spark-submit.sh │ ├── streaming-notebook.ipynb │ └── streaming.py │ └── redpanda │ ├── README.md │ ├── consumer.py │ ├── docker-compose.yaml │ ├── producer.py │ ├── settings.py │ ├── spark-submit.sh │ ├── streaming-notebook.ipynb │ └── streaming.py ├── README.md ├── after-sign-up.md ├── asking-questions.md ├── awesome-data-engineering.md ├── certificates.md ├── cohorts ├── 2022 │ ├── README.md │ ├── project.md │ ├── week_1_basics_n_setup │ │ └── homework.md │ ├── week_2_data_ingestion │ │ ├── README.md │ │ ├── airflow │ │ │ ├── .env_example │ │ │ ├── 1_setup_official.md │ │ │ ├── 2_setup_nofrills.md │ │ │ ├── Dockerfile │ │ │ ├── README.md │ │ │ ├── dags │ │ │ │ └── data_ingestion_gcs_dag.py │ │ │ ├── dags_local │ │ │ │ ├── data_ingestion_local.py │ │ │ │ └── ingest_script.py │ │ │ ├── docker-compose-nofrills.yml │ │ │ ├── docker-compose.yaml │ │ │ ├── docker-compose_2.3.4.yaml │ │ │ ├── docs │ │ │ │ ├── 1_concepts.md │ │ │ │ ├── arch-diag-airflow.png │ │ │ │ └── gcs_ingestion_dag.png │ │ │ ├── extras │ │ │ │ ├── data_ingestion_gcs_dag_ex2.py │ │ │ │ └── web_to_gcs.sh │ │ │ ├── requirements.txt │ │ │ └── scripts │ │ │ │ └── entrypoint.sh │ │ ├── homework │ │ │ ├── homework.md │ │ │ └── solution.py │ │ └── transfer_service │ │ │ └── README.md │ ├── week_3_data_warehouse │ │ └── airflow │ │ │ ├── .env_example │ │ │ ├── 1_setup_official.md │ │ │ ├── 2_setup_nofrills.md │ │ │ ├── README.md │ │ │ ├── dags │ │ │ └── gcs_to_bq_dag.py │ │ │ ├── docker-compose-nofrills.yml │ │ │ ├── docker-compose.yaml │ │ │ ├── docs │ │ │ ├── gcs_2_bq_dag_graph_view.png │ │ │ └── gcs_2_bq_dag_tree_view.png │ │ │ └── scripts │ │ │ └── entrypoint.sh │ ├── week_5_batch_processing │ │ └── homework.md │ └── week_6_stream_processing │ │ └── homework.md ├── 2023 │ ├── README.md │ ├── leaderboard.md │ ├── project.md │ ├── week_1_docker_sql │ │ └── homework.md │ ├── week_1_terraform │ │ └── homework.md │ ├── week_2_workflow_orchestration │ │ ├── 
README.md │ │ └── homework.md │ ├── week_3_data_warehouse │ │ └── homework.md │ ├── week_4_analytics_engineering │ │ └── homework.md │ ├── week_5_batch_processing │ │ └── homework.md │ ├── week_6_stream_processing │ │ ├── client.properties │ │ ├── homework.md │ │ ├── producer_confluent.py │ │ ├── settings.py │ │ ├── spark-submit.sh │ │ └── streaming_confluent.py │ └── workshops │ │ └── piperider.md ├── 2024 │ ├── 01-docker-terraform │ │ └── homework.md │ ├── 02-workflow-orchestration │ │ ├── README.md │ │ └── homework.md │ ├── 03-data-warehouse │ │ └── homework.md │ ├── 04-analytics-engineering │ │ └── homework.md │ ├── 05-batch │ │ └── homework.md │ ├── 06-streaming │ │ ├── docker-compose.yml │ │ └── homework.md │ ├── README.md │ ├── leaderboard.md │ ├── project.md │ └── workshops │ │ ├── dlt.md │ │ ├── dlt_resources │ │ ├── data_ingestion_workshop.md │ │ ├── homework_solution.ipynb │ │ ├── homework_starter.ipynb │ │ ├── incremental_loading.png │ │ └── workshop.ipynb │ │ └── rising-wave.md └── 2025 │ ├── 01-docker-terraform │ ├── homework.md │ └── solution.md │ ├── 02-workflow-orchestration │ ├── homework.md │ └── solution.md │ ├── 03-data-warehouse │ ├── DLT_upload_to_GCP.ipynb │ ├── homework.md │ └── load_yellow_taxi_data.py │ ├── 04-analytics-engineering │ ├── homework.md │ └── homework_q2.png │ ├── 05-batch │ ├── homework.md │ └── homework │ │ └── solution.ipynb │ ├── 06-streaming │ ├── homework.md │ └── homework │ │ └── homework.ipynb │ ├── README.md │ ├── project.md │ └── workshops │ ├── dlt │ ├── README.md │ ├── data_ingestion_workshop.md │ ├── dlt_homework.md │ └── img │ │ ├── Rest_API.png │ │ ├── dlt.png │ │ └── pipes.jpg │ └── dynamic_load_dlt.py ├── dataset.md ├── images ├── architecture │ ├── arch_v3_workshops.jpg │ ├── arch_v4_workshops.jpg │ └── photo1700757552.jpeg ├── aws │ └── iam.png ├── dlthub.png ├── kestra.svg ├── mage.svg ├── piperider.png └── rising-wave.png ├── learning-in-public.md └── projects ├── README.md └── datasets.md /.gitignore: -------------------------------------------------------------------------------- 1 | 2 | .DS_Store 3 | .idea 4 | *.tfstate 5 | *.tfstate.* 6 | **.terraform 7 | **.terraform.lock.* 8 | **google_credentials.json 9 | **logs/ 10 | **.env 11 | **__pycache__/ 12 | .history 13 | **/ny_taxi_postgres_data/* 14 | serving_dir 15 | .ipynb_checkpoints/ 16 | !week_6_stream_processing/avro_example/data/rides.csv 17 | -------------------------------------------------------------------------------- /01-docker-terraform/1_terraform_gcp/1_terraform_overview.md: -------------------------------------------------------------------------------- 1 | ## Terraform Overview 2 | 3 | [Video](https://www.youtube.com/watch?v=18jIzE41fJ4&list=PL3MmuxUbc_hJed7dXYoJw8DoCuVHhGEQb&index=2) 4 | 5 | ### Concepts 6 | 7 | #### Introduction 8 | 9 | 1. What is [Terraform](https://www.terraform.io)? 10 | * open-source tool by [HashiCorp](https://www.hashicorp.com), used for provisioning infrastructure resources 11 | * supports DevOps best practices for change management 12 | * Managing configuration files in source control to maintain an ideal provisioning state 13 | for testing and production environments 14 | 2. What is IaC? 15 | * Infrastructure-as-Code 16 | * build, change, and manage your infrastructure in a safe, consistent, and repeatable way 17 | by defining resource configurations that you can version, reuse, and share. 18 | 3. 
Some advantages 19 | * Infrastructure lifecycle management 20 | * Version control commits 21 | * Very useful for stack-based deployments, and with cloud providers such as AWS, GCP and Azure, as well as Kubernetes 22 | * State-based approach to track resource changes throughout deployments 23 | 24 | 25 | #### Files 26 | 27 | * `main.tf` 28 | * `variables.tf` 29 | * Optional: `resources.tf`, `output.tf` 30 | * `.tfstate` 31 | 32 | #### Declarations 33 | * `terraform`: configure basic Terraform settings to provision your infrastructure 34 | * `required_version`: minimum Terraform version to apply to your configuration 35 | * `backend`: stores Terraform's "state" snapshots, to map real-world resources to your configuration. 36 | * `local`: stores the state file locally as `terraform.tfstate` 37 | * `required_providers`: specifies the providers required by the current module 38 | * `provider`: 39 | * adds a set of resource types and/or data sources that Terraform can manage 40 | * The Terraform Registry is the main directory of publicly available providers from most major infrastructure platforms. 41 | * `resource`: 42 | * blocks to define components of your infrastructure 43 | * Project modules/resources: google_storage_bucket, google_bigquery_dataset, google_bigquery_table 44 | * `variable` & `locals` 45 | * runtime arguments and constants 46 | 47 | 48 | #### Execution steps 49 | 1. `terraform init`: 50 | * Initializes & configures the backend, installs plugins/providers, and checks out an existing configuration from version control 51 | 2. `terraform plan`: 52 | * Matches/previews local changes against a remote state, and proposes an Execution Plan. 53 | 3. `terraform apply`: 54 | * Asks for approval of the proposed plan, and applies the changes to the cloud 55 | 4. `terraform destroy` 56 | * Removes your stack from the cloud 57 | 58 | 59 | ### Terraform Workshop to create GCP Infra 60 | Continue [here](./terraform): `01-docker-terraform/1_terraform_gcp/terraform` 61 | 62 | 63 | ### References 64 | https://learn.hashicorp.com/collections/terraform/gcp-get-started 65 | -------------------------------------------------------------------------------- /01-docker-terraform/1_terraform_gcp/2_gcp_overview.md: -------------------------------------------------------------------------------- 1 | ## GCP Overview 2 | 3 | [Video](https://www.youtube.com/watch?v=18jIzE41fJ4&list=PL3MmuxUbc_hJed7dXYoJw8DoCuVHhGEQb&index=2) 4 | 5 | 6 | ### Project infrastructure modules in GCP: 7 | * Google Cloud Storage (GCS): Data Lake 8 | * BigQuery: Data Warehouse 9 | 10 | (Concepts explained in Week 2 - Data Ingestion) 11 | 12 | ### Initial Setup 13 | 14 | For this course, we'll use the free tier (up to EUR 300 in credits). 15 | 16 | 1. Create an account with your Google email ID 17 | 2. Set up your first [project](https://console.cloud.google.com/) if you haven't already 18 | * e.g. "DTC DE Course", and note down the "Project ID" (we'll use this later when deploying infra with TF) 19 | 3. Set up [service account & authentication](https://cloud.google.com/docs/authentication/getting-started) for this project 20 | * Grant the `Viewer` role to begin with. 21 | * Download the service-account keys (.json) for authentication. 22 | 4. Download the [SDK](https://cloud.google.com/sdk/docs/quickstart) for local setup 23 | 5. 
Set environment variable to point to your downloaded GCP keys: 24 | ```shell 25 | export GOOGLE_APPLICATION_CREDENTIALS=".json" 26 | 27 | # Refresh token/session, and verify authentication 28 | gcloud auth application-default login 29 | ``` 30 | 31 | ### Setup for Access 32 | 33 | 1. [IAM Roles](https://cloud.google.com/storage/docs/access-control/iam-roles) for Service account: 34 | * Go to the *IAM* section of *IAM & Admin* https://console.cloud.google.com/iam-admin/iam 35 | * Click the *Edit principal* icon for your service account. 36 | * Add these roles in addition to *Viewer* : **Storage Admin** + **Storage Object Admin** + **BigQuery Admin** 37 | 38 | 2. Enable these APIs for your project: 39 | * https://console.cloud.google.com/apis/library/iam.googleapis.com 40 | * https://console.cloud.google.com/apis/library/iamcredentials.googleapis.com 41 | 42 | 3. Please ensure `GOOGLE_APPLICATION_CREDENTIALS` env-var is set. 43 | ```shell 44 | export GOOGLE_APPLICATION_CREDENTIALS=".json" 45 | ``` 46 | 47 | ### Terraform Workshop to create GCP Infra 48 | Continue [here](./terraform): `week_1_basics_n_setup/1_terraform_gcp/terraform` 49 | -------------------------------------------------------------------------------- /01-docker-terraform/1_terraform_gcp/README.md: -------------------------------------------------------------------------------- 1 | ## Local Setup for Terraform and GCP 2 | 3 | ### Pre-Requisites 4 | 1. Terraform client installation: https://www.terraform.io/downloads 5 | 2. Cloud Provider account: https://console.cloud.google.com/ 6 | 7 | ### Terraform Concepts 8 | [Terraform Overview](1_terraform_overview.md) 9 | 10 | ### GCP setup 11 | 12 | 1. [Setup for First-time](2_gcp_overview.md#initial-setup) 13 | * [Only for Windows](windows.md) - Steps 4 & 5 14 | 2. [IAM / Access specific to this course](2_gcp_overview.md#setup-for-access) 15 | 16 | ### Terraform Workshop for GCP Infra 17 | Your setup is ready! 18 | Now head to the [terraform](terraform) directory, and perform the execution steps to create your infrastructure. 
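Before running the workshop, you may want to confirm that the prerequisites above are in place. The commands below are a minimal sanity check, assuming you have already downloaded a service-account key (the `.json` path is a placeholder):

```shell
# Verify that the Terraform client and the Google Cloud SDK are installed
terraform -version
gcloud --version

# Point to your service-account key (placeholder path), then refresh the auth token
export GOOGLE_APPLICATION_CREDENTIALS="<path/to/your/service-account-key>.json"
gcloud auth application-default login
```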
19 | -------------------------------------------------------------------------------- /01-docker-terraform/1_terraform_gcp/terraform/README.md: -------------------------------------------------------------------------------- 1 | ### Concepts 2 | * [Terraform_overview](../1_terraform_overview.md) 3 | 4 | ### Execution 5 | 6 | ```shell 7 | # Refresh service-account's auth-token for this session 8 | gcloud auth application-default login 9 | 10 | # Initialize state file (.tfstate) 11 | terraform init 12 | 13 | # Check changes to new infra plan 14 | terraform plan -var="project=" 15 | ``` 16 | 17 | ```shell 18 | # Create new infra 19 | terraform apply -var="project=" 20 | ``` 21 | 22 | ```shell 23 | # Delete infra after your work, to avoid costs on any running services 24 | terraform destroy 25 | ``` 26 | -------------------------------------------------------------------------------- /01-docker-terraform/1_terraform_gcp/terraform/terraform_basic/main.tf: -------------------------------------------------------------------------------- 1 | terraform { 2 | required_providers { 3 | google = { 4 | source = "hashicorp/google" 5 | version = "4.51.0" 6 | } 7 | } 8 | } 9 | 10 | provider "google" { 11 | # Credentials only needs to be set if you do not have the GOOGLE_APPLICATION_CREDENTIALS set 12 | # credentials = 13 | project = "" 14 | region = "us-central1" 15 | } 16 | 17 | 18 | 19 | resource "google_storage_bucket" "data-lake-bucket" { 20 | name = "" 21 | location = "US" 22 | 23 | # Optional, but recommended settings: 24 | storage_class = "STANDARD" 25 | uniform_bucket_level_access = true 26 | 27 | versioning { 28 | enabled = true 29 | } 30 | 31 | lifecycle_rule { 32 | action { 33 | type = "Delete" 34 | } 35 | condition { 36 | age = 30 // days 37 | } 38 | } 39 | 40 | force_destroy = true 41 | } 42 | 43 | 44 | resource "google_bigquery_dataset" "dataset" { 45 | dataset_id = "" 46 | project = "" 47 | location = "US" 48 | } -------------------------------------------------------------------------------- /01-docker-terraform/1_terraform_gcp/terraform/terraform_with_variables/main.tf: -------------------------------------------------------------------------------- 1 | terraform { 2 | required_providers { 3 | google = { 4 | source = "hashicorp/google" 5 | version = "5.6.0" 6 | } 7 | } 8 | } 9 | 10 | provider "google" { 11 | credentials = file(var.credentials) 12 | project = var.project 13 | region = var.region 14 | } 15 | 16 | 17 | resource "google_storage_bucket" "demo-bucket" { 18 | name = var.gcs_bucket_name 19 | location = var.location 20 | force_destroy = true 21 | 22 | 23 | lifecycle_rule { 24 | condition { 25 | age = 1 26 | } 27 | action { 28 | type = "AbortIncompleteMultipartUpload" 29 | } 30 | } 31 | } 32 | 33 | 34 | 35 | resource "google_bigquery_dataset" "demo_dataset" { 36 | dataset_id = var.bq_dataset_name 37 | location = var.location 38 | } -------------------------------------------------------------------------------- /01-docker-terraform/1_terraform_gcp/terraform/terraform_with_variables/variables.tf: -------------------------------------------------------------------------------- 1 | variable "credentials" { 2 | description = "My Credentials" 3 | default = "" 4 | #ex: if you have a directory where this file is called keys with your service account json file 5 | #saved there as my-creds.json you could use default = "./keys/my-creds.json" 6 | } 7 | 8 | 9 | variable "project" { 10 | description = "Project" 11 | default = "" 12 | } 13 | 14 | variable "region" { 15 | description = "Region" 16 
| #Update the below to your desired region 17 | default = "us-central1" 18 | } 19 | 20 | variable "location" { 21 | description = "Project Location" 22 | #Update the below to your desired location 23 | default = "US" 24 | } 25 | 26 | variable "bq_dataset_name" { 27 | description = "My BigQuery Dataset Name" 28 | #Update the below to what you want your dataset to be called 29 | default = "demo_dataset" 30 | } 31 | 32 | variable "gcs_bucket_name" { 33 | description = "My Storage Bucket Name" 34 | #Update the below to a unique bucket name 35 | default = "terraform-demo-terra-bucket" 36 | } 37 | 38 | variable "gcs_storage_class" { 39 | description = "Bucket Storage Class" 40 | default = "STANDARD" 41 | } -------------------------------------------------------------------------------- /01-docker-terraform/2_docker_sql/.gitignore: -------------------------------------------------------------------------------- 1 | .ipynb_checkpoints/ 2 | ny_taxi_postgres_data/ 3 | *.csv -------------------------------------------------------------------------------- /01-docker-terraform/2_docker_sql/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM python:3.9.1 2 | 3 | RUN apt-get install wget 4 | RUN pip install pandas sqlalchemy psycopg2 5 | 6 | WORKDIR /app 7 | COPY ingest_data.py ingest_data.py 8 | 9 | ENTRYPOINT [ "python", "ingest_data.py" ] -------------------------------------------------------------------------------- /01-docker-terraform/2_docker_sql/data-loading-parquet.py: -------------------------------------------------------------------------------- 1 | #Cleaned up version of data-loading.ipynb 2 | import argparse, os, sys 3 | from time import time 4 | import pandas as pd 5 | import pyarrow.parquet as pq 6 | from sqlalchemy import create_engine 7 | 8 | 9 | def main(params): 10 | user = params.user 11 | password = params.password 12 | host = params.host 13 | port = params.port 14 | db = params.db 15 | tb = params.tb 16 | url = params.url 17 | 18 | # Get the name of the file from url 19 | file_name = url.rsplit('/', 1)[-1].strip() 20 | print(f'Downloading {file_name} ...') 21 | # Download file from url 22 | os.system(f'curl {url.strip()} -o {file_name}') 23 | print('\n') 24 | 25 | # Create SQL engine 26 | engine = create_engine(f'postgresql://{user}:{password}@{host}:{port}/{db}') 27 | 28 | # Read file based on csv or parquet 29 | if '.csv' in file_name: 30 | df = pd.read_csv(file_name, nrows=10) 31 | df_iter = pd.read_csv(file_name, iterator=True, chunksize=100000) 32 | elif '.parquet' in file_name: 33 | file = pq.ParquetFile(file_name) 34 | df = next(file.iter_batches(batch_size=10)).to_pandas() 35 | df_iter = file.iter_batches(batch_size=100000) 36 | else: 37 | print('Error. Only .csv or .parquet files allowed.') 38 | sys.exit() 39 | 40 | 41 | # Create the table 42 | df.head(0).to_sql(name=tb, con=engine, if_exists='replace') 43 | 44 | 45 | # Insert values 46 | t_start = time() 47 | count = 0 48 | for batch in df_iter: 49 | count+=1 50 | 51 | if '.parquet' in file_name: 52 | batch_df = batch.to_pandas() 53 | else: 54 | batch_df = batch 55 | 56 | print(f'inserting batch {count}...') 57 | 58 | b_start = time() 59 | batch_df.to_sql(name=tb, con=engine, if_exists='append') 60 | b_end = time() 61 | 62 | print(f'inserted! time taken {b_end-b_start:10.3f} seconds.\n') 63 | 64 | t_end = time() 65 | print(f'Completed! 
Total time taken was {t_end-t_start:10.3f} seconds for {count} batches.') 66 | 67 | 68 | 69 | if __name__ == '__main__': 70 | #Parsing arguments 71 | parser = argparse.ArgumentParser(description='Loading data from .paraquet file link to a Postgres datebase.') 72 | 73 | parser.add_argument('--user', help='Username for Postgres.') 74 | parser.add_argument('--password', help='Password to the username for Postgres.') 75 | parser.add_argument('--host', help='Hostname for Postgres.') 76 | parser.add_argument('--port', help='Port for Postgres connection.') 77 | parser.add_argument('--db', help='Databse name for Postgres') 78 | parser.add_argument('--tb', help='Destination table name for Postgres.') 79 | parser.add_argument('--url', help='URL for .paraquet file.') 80 | 81 | args = parser.parse_args() 82 | main(args) 83 | 84 | 85 | 86 | 87 | -------------------------------------------------------------------------------- /01-docker-terraform/2_docker_sql/docker-compose.yaml: -------------------------------------------------------------------------------- 1 | services: 2 | pgdatabase: 3 | image: postgres:13 4 | environment: 5 | - POSTGRES_USER=root 6 | - POSTGRES_PASSWORD=root 7 | - POSTGRES_DB=ny_taxi 8 | volumes: 9 | - "./ny_taxi_postgres_data:/var/lib/postgresql/data:rw" 10 | ports: 11 | - "5432:5432" 12 | pgadmin: 13 | image: dpage/pgadmin4 14 | environment: 15 | - PGADMIN_DEFAULT_EMAIL=admin@admin.com 16 | - PGADMIN_DEFAULT_PASSWORD=root 17 | ports: 18 | - "8080:80" 19 | -------------------------------------------------------------------------------- /01-docker-terraform/2_docker_sql/ingest_data.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # coding: utf-8 3 | 4 | import os 5 | import argparse 6 | 7 | from time import time 8 | 9 | import pandas as pd 10 | from sqlalchemy import create_engine 11 | 12 | 13 | def main(params): 14 | user = params.user 15 | password = params.password 16 | host = params.host 17 | port = params.port 18 | db = params.db 19 | table_name = params.table_name 20 | url = params.url 21 | 22 | # the backup files are gzipped, and it's important to keep the correct extension 23 | # for pandas to be able to open the file 24 | if url.endswith('.csv.gz'): 25 | csv_name = 'output.csv.gz' 26 | else: 27 | csv_name = 'output.csv' 28 | 29 | os.system(f"wget {url} -O {csv_name}") 30 | 31 | engine = create_engine(f'postgresql://{user}:{password}@{host}:{port}/{db}') 32 | 33 | df_iter = pd.read_csv(csv_name, iterator=True, chunksize=100000) 34 | 35 | df = next(df_iter) 36 | 37 | df.tpep_pickup_datetime = pd.to_datetime(df.tpep_pickup_datetime) 38 | df.tpep_dropoff_datetime = pd.to_datetime(df.tpep_dropoff_datetime) 39 | 40 | df.head(n=0).to_sql(name=table_name, con=engine, if_exists='replace') 41 | 42 | df.to_sql(name=table_name, con=engine, if_exists='append') 43 | 44 | 45 | while True: 46 | 47 | try: 48 | t_start = time() 49 | 50 | df = next(df_iter) 51 | 52 | df.tpep_pickup_datetime = pd.to_datetime(df.tpep_pickup_datetime) 53 | df.tpep_dropoff_datetime = pd.to_datetime(df.tpep_dropoff_datetime) 54 | 55 | df.to_sql(name=table_name, con=engine, if_exists='append') 56 | 57 | t_end = time() 58 | 59 | print('inserted another chunk, took %.3f second' % (t_end - t_start)) 60 | 61 | except StopIteration: 62 | print("Finished ingesting data into the postgres database") 63 | break 64 | 65 | if __name__ == '__main__': 66 | parser = argparse.ArgumentParser(description='Ingest CSV data to Postgres') 67 | 68 | 
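    # Example invocation (a sketch; the values below are placeholders that match the
    # docker-compose.yaml in this folder, so adjust them to your own setup):
    #
    #   python ingest_data.py \
    #     --user=root --password=root --host=localhost --port=5432 \
    #     --db=ny_taxi --table_name=yellow_taxi_trips \
    #     --url=<URL of a yellow taxi CSV or .csv.gz file>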
parser.add_argument('--user', required=True, help='user name for postgres') 69 | parser.add_argument('--password', required=True, help='password for postgres') 70 | parser.add_argument('--host', required=True, help='host for postgres') 71 | parser.add_argument('--port', required=True, help='port for postgres') 72 | parser.add_argument('--db', required=True, help='database name for postgres') 73 | parser.add_argument('--table_name', required=True, help='name of the table where we will write the results to') 74 | parser.add_argument('--url', required=True, help='url of the csv file') 75 | 76 | args = parser.parse_args() 77 | 78 | main(args) 79 | -------------------------------------------------------------------------------- /01-docker-terraform/2_docker_sql/pipeline.py: -------------------------------------------------------------------------------- 1 | import sys 2 | 3 | import pandas as pd 4 | 5 | print(sys.argv) 6 | 7 | day = sys.argv[1] 8 | 9 | # some fancy stuff with pandas 10 | 11 | print(f'job finished successfully for day = {day}') -------------------------------------------------------------------------------- /02-workflow-orchestration/docker/combined/docker-compose.yml: -------------------------------------------------------------------------------- 1 | volumes: 2 | postgres-data: 3 | driver: local 4 | kestra-data: 5 | driver: local 6 | zoomcamp-data: 7 | driver: local 8 | 9 | services: 10 | postgres: 11 | image: postgres 12 | volumes: 13 | - postgres-data:/var/lib/postgresql/data 14 | environment: 15 | POSTGRES_DB: kestra 16 | POSTGRES_USER: kestra 17 | POSTGRES_PASSWORD: k3str4 18 | healthcheck: 19 | test: ["CMD-SHELL", "pg_isready -d $${POSTGRES_DB} -U $${POSTGRES_USER}"] 20 | interval: 30s 21 | timeout: 10s 22 | retries: 10 23 | 24 | kestra: 25 | image: kestra/kestra:v0.20.7 26 | pull_policy: always 27 | # Note that this setup with a root user is intended for development purpose. 
28 | # Our base image runs without root, but the Docker Compose implementation needs root to access the Docker socket 29 | # To run Kestra in a rootless mode in production, see: https://kestra.io/docs/installation/podman-compose 30 | user: "root" 31 | command: server standalone 32 | volumes: 33 | - kestra-data:/app/storage 34 | - /var/run/docker.sock:/var/run/docker.sock 35 | - /tmp/kestra-wd:/tmp/kestra-wd 36 | environment: 37 | KESTRA_CONFIGURATION: | 38 | datasources: 39 | postgres: 40 | url: jdbc:postgresql://postgres:5432/kestra 41 | driverClassName: org.postgresql.Driver 42 | username: kestra 43 | password: k3str4 44 | kestra: 45 | server: 46 | basicAuth: 47 | enabled: false 48 | username: "admin@kestra.io" # it must be a valid email address 49 | password: kestra 50 | repository: 51 | type: postgres 52 | storage: 53 | type: local 54 | local: 55 | basePath: "/app/storage" 56 | queue: 57 | type: postgres 58 | tasks: 59 | tmpDir: 60 | path: /tmp/kestra-wd/tmp 61 | url: http://localhost:8080/ 62 | ports: 63 | - "8080:8080" 64 | - "8081:8081" 65 | depends_on: 66 | postgres: 67 | condition: service_started 68 | 69 | postgres_zoomcamp: 70 | image: postgres 71 | environment: 72 | POSTGRES_USER: kestra 73 | POSTGRES_PASSWORD: k3str4 74 | POSTGRES_DB: postgres-zoomcamp 75 | ports: 76 | - "5432:5432" 77 | volumes: 78 | - zoomcamp-data:/var/lib/postgresql/data 79 | depends_on: 80 | kestra: 81 | condition: service_started 82 | 83 | pgadmin: 84 | image: dpage/pgadmin4 85 | environment: 86 | - PGADMIN_DEFAULT_EMAIL=admin@admin.com 87 | - PGADMIN_DEFAULT_PASSWORD=root 88 | ports: 89 | - "8085:80" 90 | depends_on: 91 | postgres_zoomcamp: 92 | condition: service_started 93 | -------------------------------------------------------------------------------- /02-workflow-orchestration/docker/kestra/docker-compose.yml: -------------------------------------------------------------------------------- 1 | volumes: 2 | postgres-data: 3 | driver: local 4 | kestra-data: 5 | driver: local 6 | 7 | services: 8 | postgres: 9 | image: postgres 10 | volumes: 11 | - postgres-data:/var/lib/postgresql/data 12 | environment: 13 | POSTGRES_DB: kestra 14 | POSTGRES_USER: kestra 15 | POSTGRES_PASSWORD: k3str4 16 | healthcheck: 17 | test: ["CMD-SHELL", "pg_isready -d $${POSTGRES_DB} -U $${POSTGRES_USER}"] 18 | interval: 30s 19 | timeout: 10s 20 | retries: 10 21 | 22 | kestra: 23 | image: kestra/kestra:v0.20.7 24 | pull_policy: always 25 | user: "root" 26 | command: server standalone 27 | volumes: 28 | - kestra-data:/app/storage 29 | - /var/run/docker.sock:/var/run/docker.sock 30 | - /tmp/kestra-wd:/tmp/kestra-wd 31 | environment: 32 | KESTRA_CONFIGURATION: | 33 | datasources: 34 | postgres: 35 | url: jdbc:postgresql://postgres:5432/kestra 36 | driverClassName: org.postgresql.Driver 37 | username: kestra 38 | password: k3str4 39 | kestra: 40 | server: 41 | basicAuth: 42 | enabled: false 43 | username: "admin@kestra.io" # it must be a valid email address 44 | password: kestra 45 | repository: 46 | type: postgres 47 | storage: 48 | type: local 49 | local: 50 | basePath: "/app/storage" 51 | queue: 52 | type: postgres 53 | tasks: 54 | tmpDir: 55 | path: /tmp/kestra-wd/tmp 56 | url: http://localhost:8080/ 57 | ports: 58 | - "8080:8080" 59 | - "8081:8081" 60 | depends_on: 61 | postgres: 62 | condition: service_started 63 | -------------------------------------------------------------------------------- /02-workflow-orchestration/docker/postgres/docker-compose.yml: 
-------------------------------------------------------------------------------- 1 | version: "3.8" 2 | services: 3 | postgres: 4 | image: postgres 5 | container_name: postgres-db 6 | environment: 7 | POSTGRES_USER: kestra 8 | POSTGRES_PASSWORD: k3str4 9 | POSTGRES_DB: postgres-zoomcamp 10 | ports: 11 | - "5432:5432" 12 | volumes: 13 | - postgres-data:/var/lib/postgresql/data 14 | volumes: 15 | postgres-data: -------------------------------------------------------------------------------- /02-workflow-orchestration/flows/01_getting_started_data_pipeline.yaml: -------------------------------------------------------------------------------- 1 | id: 01_getting_started_data_pipeline 2 | namespace: zoomcamp 3 | 4 | inputs: 5 | - id: columns_to_keep 6 | type: ARRAY 7 | itemType: STRING 8 | defaults: 9 | - brand 10 | - price 11 | 12 | tasks: 13 | - id: extract 14 | type: io.kestra.plugin.core.http.Download 15 | uri: https://dummyjson.com/products 16 | 17 | - id: transform 18 | type: io.kestra.plugin.scripts.python.Script 19 | containerImage: python:3.11-alpine 20 | inputFiles: 21 | data.json: "{{outputs.extract.uri}}" 22 | outputFiles: 23 | - "*.json" 24 | env: 25 | COLUMNS_TO_KEEP: "{{inputs.columns_to_keep}}" 26 | script: | 27 | import json 28 | import os 29 | 30 | columns_to_keep_str = os.getenv("COLUMNS_TO_KEEP") 31 | columns_to_keep = json.loads(columns_to_keep_str) 32 | 33 | with open("data.json", "r") as file: 34 | data = json.load(file) 35 | 36 | filtered_data = [ 37 | {column: product.get(column, "N/A") for column in columns_to_keep} 38 | for product in data["products"] 39 | ] 40 | 41 | with open("products.json", "w") as file: 42 | json.dump(filtered_data, file, indent=4) 43 | 44 | - id: query 45 | type: io.kestra.plugin.jdbc.duckdb.Query 46 | inputFiles: 47 | products.json: "{{outputs.transform.outputFiles['products.json']}}" 48 | sql: | 49 | INSTALL json; 50 | LOAD json; 51 | SELECT brand, round(avg(price), 2) as avg_price 52 | FROM read_json_auto('{{workingDir}}/products.json') 53 | GROUP BY brand 54 | ORDER BY avg_price DESC; 55 | fetchType: STORE 56 | -------------------------------------------------------------------------------- /02-workflow-orchestration/flows/03_postgres_dbt.yaml: -------------------------------------------------------------------------------- 1 | id: 03_postgres_dbt 2 | namespace: zoomcamp 3 | inputs: 4 | - id: dbt_command 5 | type: SELECT 6 | allowCustomValue: true 7 | defaults: dbt build 8 | values: 9 | - dbt build 10 | - dbt debug # use when running the first time to validate DB connection 11 | tasks: 12 | - id: sync 13 | type: io.kestra.plugin.git.SyncNamespaceFiles 14 | url: https://github.com/DataTalksClub/data-engineering-zoomcamp 15 | branch: main 16 | namespace: "{{ flow.namespace }}" 17 | gitDirectory: 04-analytics-engineering/taxi_rides_ny 18 | dryRun: false 19 | # disabled: true # this Git Sync is needed only when running it the first time, afterwards the task can be disabled 20 | 21 | - id: dbt-build 22 | type: io.kestra.plugin.dbt.cli.DbtCLI 23 | env: 24 | DBT_DATABASE: postgres-zoomcamp 25 | DBT_SCHEMA: public 26 | namespaceFiles: 27 | enabled: true 28 | containerImage: ghcr.io/kestra-io/dbt-postgres:latest 29 | taskRunner: 30 | type: io.kestra.plugin.scripts.runner.docker.Docker 31 | networkMode: host 32 | commands: 33 | - dbt deps 34 | - "{{ inputs.dbt_command }}" 35 | storeManifest: 36 | key: manifest.json 37 | namespace: "{{ flow.namespace }}" 38 | profiles: | 39 | default: 40 | outputs: 41 | dev: 42 | type: postgres 43 | host: 
host.docker.internal 44 | user: kestra 45 | password: k3str4 46 | port: 5432 47 | dbname: postgres-zoomcamp 48 | schema: public 49 | threads: 8 50 | connect_timeout: 10 51 | priority: interactive 52 | target: dev 53 | description: | 54 | Note that you need to adjust the models/staging/schema.yml file to match your database and schema. Select and edit that Namespace File from the UI. Save and run this flow. Once https://github.com/DataTalksClub/data-engineering-zoomcamp/pull/565/files is merged, you can ignore this note as it will be dynamically adjusted based on env variables. 55 | ```yaml 56 | sources: 57 | - name: staging 58 | database: postgres-zoomcamp 59 | schema: public 60 | ``` 61 | -------------------------------------------------------------------------------- /02-workflow-orchestration/flows/04_gcp_kv.yaml: -------------------------------------------------------------------------------- 1 | id: 04_gcp_kv 2 | namespace: zoomcamp 3 | 4 | tasks: 5 | - id: gcp_project_id 6 | type: io.kestra.plugin.core.kv.Set 7 | key: GCP_PROJECT_ID 8 | kvType: STRING 9 | value: kestra-sandbox # TODO replace with your project id 10 | 11 | - id: gcp_location 12 | type: io.kestra.plugin.core.kv.Set 13 | key: GCP_LOCATION 14 | kvType: STRING 15 | value: europe-west2 16 | 17 | - id: gcp_bucket_name 18 | type: io.kestra.plugin.core.kv.Set 19 | key: GCP_BUCKET_NAME 20 | kvType: STRING 21 | value: your-name-kestra # TODO make sure it's globally unique! 22 | 23 | - id: gcp_dataset 24 | type: io.kestra.plugin.core.kv.Set 25 | key: GCP_DATASET 26 | kvType: STRING 27 | value: zoomcamp 28 | -------------------------------------------------------------------------------- /02-workflow-orchestration/flows/05_gcp_setup.yaml: -------------------------------------------------------------------------------- 1 | id: 05_gcp_setup 2 | namespace: zoomcamp 3 | 4 | tasks: 5 | - id: create_gcs_bucket 6 | type: io.kestra.plugin.gcp.gcs.CreateBucket 7 | ifExists: SKIP 8 | storageClass: REGIONAL 9 | name: "{{kv('GCP_BUCKET_NAME')}}" # make sure it's globally unique! 
10 | 11 | - id: create_bq_dataset 12 | type: io.kestra.plugin.gcp.bigquery.CreateDataset 13 | name: "{{kv('GCP_DATASET')}}" 14 | ifExists: SKIP 15 | 16 | pluginDefaults: 17 | - type: io.kestra.plugin.gcp 18 | values: 19 | serviceAccount: "{{kv('GCP_CREDS')}}" 20 | projectId: "{{kv('GCP_PROJECT_ID')}}" 21 | location: "{{kv('GCP_LOCATION')}}" 22 | bucket: "{{kv('GCP_BUCKET_NAME')}}" 23 | -------------------------------------------------------------------------------- /02-workflow-orchestration/flows/07_gcp_dbt.yaml: -------------------------------------------------------------------------------- 1 | id: 07_gcp_dbt 2 | namespace: zoomcamp 3 | inputs: 4 | - id: dbt_command 5 | type: SELECT 6 | allowCustomValue: true 7 | defaults: dbt build 8 | values: 9 | - dbt build 10 | - dbt debug # use when running the first time to validate DB connection 11 | 12 | tasks: 13 | - id: sync 14 | type: io.kestra.plugin.git.SyncNamespaceFiles 15 | url: https://github.com/DataTalksClub/data-engineering-zoomcamp 16 | branch: main 17 | namespace: "{{flow.namespace}}" 18 | gitDirectory: 04-analytics-engineering/taxi_rides_ny 19 | dryRun: false 20 | # disabled: true # this Git Sync is needed only when running it the first time, afterwards the task can be disabled 21 | 22 | - id: dbt-build 23 | type: io.kestra.plugin.dbt.cli.DbtCLI 24 | env: 25 | DBT_DATABASE: "{{kv('GCP_PROJECT_ID')}}" 26 | DBT_SCHEMA: "{{kv('GCP_DATASET')}}" 27 | namespaceFiles: 28 | enabled: true 29 | containerImage: ghcr.io/kestra-io/dbt-bigquery:latest 30 | taskRunner: 31 | type: io.kestra.plugin.scripts.runner.docker.Docker 32 | inputFiles: 33 | sa.json: "{{kv('GCP_CREDS')}}" 34 | commands: 35 | - dbt deps 36 | - "{{ inputs.dbt_command }}" 37 | storeManifest: 38 | key: manifest.json 39 | namespace: "{{ flow.namespace }}" 40 | profiles: | 41 | default: 42 | outputs: 43 | dev: 44 | type: bigquery 45 | dataset: "{{kv('GCP_DATASET')}}" 46 | project: "{{kv('GCP_PROJECT_ID')}}" 47 | location: "{{kv('GCP_LOCATION')}}" 48 | keyfile: sa.json 49 | method: service-account 50 | priority: interactive 51 | threads: 16 52 | timeout_seconds: 300 53 | fixed_retries: 1 54 | target: dev 55 | description: | 56 | Note that you need to adjust the models/staging/schema.yml file to match your database and schema. Select and edit that Namespace File from the UI. Save and run this flow. Once https://github.com/DataTalksClub/data-engineering-zoomcamp/pull/565/files is merged, you can ignore this note as it will be dynamically adjusted based on env variables. 
57 | ```yaml 58 | sources: 59 | - name: staging 60 | database: kestra-sandbox 61 | schema: zoomcamp 62 | ``` 63 | -------------------------------------------------------------------------------- /02-workflow-orchestration/images/homework.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/DataTalksClub/data-engineering-zoomcamp/76fed2419e917176369a0d3a2eeff6c936f87286/02-workflow-orchestration/images/homework.png -------------------------------------------------------------------------------- /03-data-warehouse/big_query.sql: -------------------------------------------------------------------------------- 1 | -- Query public available table 2 | SELECT station_id, name FROM 3 | bigquery-public-data.new_york_citibike.citibike_stations 4 | LIMIT 100; 5 | 6 | 7 | -- Creating external table referring to gcs path 8 | CREATE OR REPLACE EXTERNAL TABLE `taxi-rides-ny.nytaxi.external_yellow_tripdata` 9 | OPTIONS ( 10 | format = 'CSV', 11 | uris = ['gs://nyc-tl-data/trip data/yellow_tripdata_2019-*.csv', 'gs://nyc-tl-data/trip data/yellow_tripdata_2020-*.csv'] 12 | ); 13 | 14 | -- Check yello trip data 15 | SELECT * FROM taxi-rides-ny.nytaxi.external_yellow_tripdata limit 10; 16 | 17 | -- Create a non partitioned table from external table 18 | CREATE OR REPLACE TABLE taxi-rides-ny.nytaxi.yellow_tripdata_non_partitioned AS 19 | SELECT * FROM taxi-rides-ny.nytaxi.external_yellow_tripdata; 20 | 21 | 22 | -- Create a partitioned table from external table 23 | CREATE OR REPLACE TABLE taxi-rides-ny.nytaxi.yellow_tripdata_partitioned 24 | PARTITION BY 25 | DATE(tpep_pickup_datetime) AS 26 | SELECT * FROM taxi-rides-ny.nytaxi.external_yellow_tripdata; 27 | 28 | -- Impact of partition 29 | -- Scanning 1.6GB of data 30 | SELECT DISTINCT(VendorID) 31 | FROM taxi-rides-ny.nytaxi.yellow_tripdata_non_partitioned 32 | WHERE DATE(tpep_pickup_datetime) BETWEEN '2019-06-01' AND '2019-06-30'; 33 | 34 | -- Scanning ~106 MB of DATA 35 | SELECT DISTINCT(VendorID) 36 | FROM taxi-rides-ny.nytaxi.yellow_tripdata_partitioned 37 | WHERE DATE(tpep_pickup_datetime) BETWEEN '2019-06-01' AND '2019-06-30'; 38 | 39 | -- Let's look into the partitions 40 | SELECT table_name, partition_id, total_rows 41 | FROM `nytaxi.INFORMATION_SCHEMA.PARTITIONS` 42 | WHERE table_name = 'yellow_tripdata_partitioned' 43 | ORDER BY total_rows DESC; 44 | 45 | -- Creating a partition and cluster table 46 | CREATE OR REPLACE TABLE taxi-rides-ny.nytaxi.yellow_tripdata_partitioned_clustered 47 | PARTITION BY DATE(tpep_pickup_datetime) 48 | CLUSTER BY VendorID AS 49 | SELECT * FROM taxi-rides-ny.nytaxi.external_yellow_tripdata; 50 | 51 | -- Query scans 1.1 GB 52 | SELECT count(*) as trips 53 | FROM taxi-rides-ny.nytaxi.yellow_tripdata_partitioned 54 | WHERE DATE(tpep_pickup_datetime) BETWEEN '2019-06-01' AND '2020-12-31' 55 | AND VendorID=1; 56 | 57 | -- Query scans 864.5 MB 58 | SELECT count(*) as trips 59 | FROM taxi-rides-ny.nytaxi.yellow_tripdata_partitioned_clustered 60 | WHERE DATE(tpep_pickup_datetime) BETWEEN '2019-06-01' AND '2020-12-31' 61 | AND VendorID=1; 62 | 63 | -------------------------------------------------------------------------------- /03-data-warehouse/big_query_hw.sql: -------------------------------------------------------------------------------- 1 | CREATE OR REPLACE EXTERNAL TABLE `taxi-rides-ny.nytaxi.fhv_tripdata` 2 | OPTIONS ( 3 | format = 'CSV', 4 | uris = ['gs://nyc-tl-data/trip data/fhv_tripdata_2019-*.csv'] 5 | ); 6 | 7 | 8 | SELECT count(*) FROM 
`taxi-rides-ny.nytaxi.fhv_tripdata`; 9 | 10 | 11 | SELECT COUNT(DISTINCT(dispatching_base_num)) FROM `taxi-rides-ny.nytaxi.fhv_tripdata`; 12 | 13 | 14 | CREATE OR REPLACE TABLE `taxi-rides-ny.nytaxi.fhv_nonpartitioned_tripdata` 15 | AS SELECT * FROM `taxi-rides-ny.nytaxi.fhv_tripdata`; 16 | 17 | CREATE OR REPLACE TABLE `taxi-rides-ny.nytaxi.fhv_partitioned_tripdata` 18 | PARTITION BY DATE(dropoff_datetime) 19 | CLUSTER BY dispatching_base_num AS ( 20 | SELECT * FROM `taxi-rides-ny.nytaxi.fhv_tripdata` 21 | ); 22 | 23 | SELECT count(*) FROM `taxi-rides-ny.nytaxi.fhv_nonpartitioned_tripdata` 24 | WHERE DATE(dropoff_datetime) BETWEEN '2019-01-01' AND '2019-03-31' 25 | AND dispatching_base_num IN ('B00987', 'B02279', 'B02060'); 26 | 27 | 28 | SELECT count(*) FROM `taxi-rides-ny.nytaxi.fhv_partitioned_tripdata` 29 | WHERE DATE(dropoff_datetime) BETWEEN '2019-01-01' AND '2019-03-31' 30 | AND dispatching_base_num IN ('B00987', 'B02279', 'B02060'); 31 | -------------------------------------------------------------------------------- /03-data-warehouse/big_query_ml.sql: -------------------------------------------------------------------------------- 1 | -- SELECT THE COLUMNS INTERESTED FOR YOU 2 | SELECT passenger_count, trip_distance, PULocationID, DOLocationID, payment_type, fare_amount, tolls_amount, tip_amount 3 | FROM `taxi-rides-ny.nytaxi.yellow_tripdata_partitioned` WHERE fare_amount != 0; 4 | 5 | -- CREATE A ML TABLE WITH APPROPRIATE TYPE 6 | CREATE OR REPLACE TABLE `taxi-rides-ny.nytaxi.yellow_tripdata_ml` ( 7 | `passenger_count` INTEGER, 8 | `trip_distance` FLOAT64, 9 | `PULocationID` STRING, 10 | `DOLocationID` STRING, 11 | `payment_type` STRING, 12 | `fare_amount` FLOAT64, 13 | `tolls_amount` FLOAT64, 14 | `tip_amount` FLOAT64 15 | ) AS ( 16 | SELECT passenger_count, trip_distance, cast(PULocationID AS STRING), CAST(DOLocationID AS STRING), 17 | CAST(payment_type AS STRING), fare_amount, tolls_amount, tip_amount 18 | FROM `taxi-rides-ny.nytaxi.yellow_tripdata_partitioned` WHERE fare_amount != 0 19 | ); 20 | 21 | -- CREATE MODEL WITH DEFAULT SETTING 22 | CREATE OR REPLACE MODEL `taxi-rides-ny.nytaxi.tip_model` 23 | OPTIONS 24 | (model_type='linear_reg', 25 | input_label_cols=['tip_amount'], 26 | DATA_SPLIT_METHOD='AUTO_SPLIT') AS 27 | SELECT 28 | * 29 | FROM 30 | `taxi-rides-ny.nytaxi.yellow_tripdata_ml` 31 | WHERE 32 | tip_amount IS NOT NULL; 33 | 34 | -- CHECK FEATURES 35 | SELECT * FROM ML.FEATURE_INFO(MODEL `taxi-rides-ny.nytaxi.tip_model`); 36 | 37 | -- EVALUATE THE MODEL 38 | SELECT 39 | * 40 | FROM 41 | ML.EVALUATE(MODEL `taxi-rides-ny.nytaxi.tip_model`, 42 | ( 43 | SELECT 44 | * 45 | FROM 46 | `taxi-rides-ny.nytaxi.yellow_tripdata_ml` 47 | WHERE 48 | tip_amount IS NOT NULL 49 | )); 50 | 51 | -- PREDICT THE MODEL 52 | SELECT 53 | * 54 | FROM 55 | ML.PREDICT(MODEL `taxi-rides-ny.nytaxi.tip_model`, 56 | ( 57 | SELECT 58 | * 59 | FROM 60 | `taxi-rides-ny.nytaxi.yellow_tripdata_ml` 61 | WHERE 62 | tip_amount IS NOT NULL 63 | )); 64 | 65 | -- PREDICT AND EXPLAIN 66 | SELECT 67 | * 68 | FROM 69 | ML.EXPLAIN_PREDICT(MODEL `taxi-rides-ny.nytaxi.tip_model`, 70 | ( 71 | SELECT 72 | * 73 | FROM 74 | `taxi-rides-ny.nytaxi.yellow_tripdata_ml` 75 | WHERE 76 | tip_amount IS NOT NULL 77 | ), STRUCT(3 as top_k_features)); 78 | 79 | -- HYPER PARAM TUNNING 80 | CREATE OR REPLACE MODEL `taxi-rides-ny.nytaxi.tip_hyperparam_model` 81 | OPTIONS 82 | (model_type='linear_reg', 83 | input_label_cols=['tip_amount'], 84 | DATA_SPLIT_METHOD='AUTO_SPLIT', 85 | num_trials=5, 86 | max_parallel_trials=2, 87 | 
l1_reg=hparam_range(0, 20), 88 | l2_reg=hparam_candidates([0, 0.1, 1, 10])) AS 89 | SELECT 90 | * 91 | FROM 92 | `taxi-rides-ny.nytaxi.yellow_tripdata_ml` 93 | WHERE 94 | tip_amount IS NOT NULL; 95 | 96 | -------------------------------------------------------------------------------- /03-data-warehouse/extract_model.md: -------------------------------------------------------------------------------- 1 | ## Model deployment 2 | [Tutorial](https://cloud.google.com/bigquery-ml/docs/export-model-tutorial) 3 | ### Steps 4 | - gcloud auth login 5 | - bq --project_id taxi-rides-ny extract -m nytaxi.tip_model gs://taxi_ml_model/tip_model 6 | - mkdir /tmp/model 7 | - gsutil cp -r gs://taxi_ml_model/tip_model /tmp/model 8 | - mkdir -p serving_dir/tip_model/1 9 | - cp -r /tmp/model/tip_model/* serving_dir/tip_model/1 10 | - docker pull tensorflow/serving 11 | - docker run -p 8501:8501 --mount type=bind,source=`pwd`/serving_dir/tip_model,target= 12 | /models/tip_model -e MODEL_NAME=tip_model -t tensorflow/serving & 13 | - curl -d '{"instances": [{"passenger_count":1, "trip_distance":12.2, "PULocationID":"193", "DOLocationID":"264", "payment_type":"2","fare_amount":20.4,"tolls_amount":0.0}]}' -X POST http://localhost:8501/v1/models/tip_model:predict 14 | - http://localhost:8501/v1/models/tip_model -------------------------------------------------------------------------------- /03-data-warehouse/extras/README.md: -------------------------------------------------------------------------------- 1 | Quick hack to load files directly to GCS, without Airflow. Downloads csv files from https://nyc-tlc.s3.amazonaws.com/trip+data/ and uploads them to your Cloud Storage Account as parquet files. 2 | 3 | 1. Install pre-reqs (more info in `web_to_gcs.py` script) 4 | 2. Run: `python web_to_gcs.py` 5 | -------------------------------------------------------------------------------- /03-data-warehouse/extras/web_to_gcs.py: -------------------------------------------------------------------------------- 1 | import io 2 | import os 3 | import requests 4 | import pandas as pd 5 | from google.cloud import storage 6 | 7 | """ 8 | Pre-reqs: 9 | 1. `pip install pandas pyarrow google-cloud-storage` 10 | 2. Set GOOGLE_APPLICATION_CREDENTIALS to your project/service-account key 11 | 3. Set GCP_GCS_BUCKET as your bucket or change default value of BUCKET 12 | """ 13 | 14 | # services = ['fhv','green','yellow'] 15 | init_url = 'https://github.com/DataTalksClub/nyc-tlc-data/releases/download/' 16 | # switch out the bucketname 17 | BUCKET = os.environ.get("GCP_GCS_BUCKET", "dtc-data-lake-bucketname") 18 | 19 | 20 | def upload_to_gcs(bucket, object_name, local_file): 21 | """ 22 | Ref: https://cloud.google.com/storage/docs/uploading-objects#storage-upload-object-python 23 | """ 24 | # # WORKAROUND to prevent timeout for files > 6 MB on 800 kbps upload speed. 
25 | # # (Ref: https://github.com/googleapis/python-storage/issues/74) 26 | # storage.blob._MAX_MULTIPART_SIZE = 5 * 1024 * 1024 # 5 MB 27 | # storage.blob._DEFAULT_CHUNKSIZE = 5 * 1024 * 1024 # 5 MB 28 | 29 | client = storage.Client() 30 | bucket = client.bucket(bucket) 31 | blob = bucket.blob(object_name) 32 | blob.upload_from_filename(local_file) 33 | 34 | 35 | def web_to_gcs(year, service): 36 | for i in range(12): 37 | 38 | # sets the month part of the file_name string 39 | month = '0'+str(i+1) 40 | month = month[-2:] 41 | 42 | # csv file_name 43 | file_name = f"{service}_tripdata_{year}-{month}.csv.gz" 44 | 45 | # download it using requests via a pandas df 46 | request_url = f"{init_url}{service}/{file_name}" 47 | r = requests.get(request_url) 48 | open(file_name, 'wb').write(r.content) 49 | print(f"Local: {file_name}") 50 | 51 | # read it back into a parquet file 52 | df = pd.read_csv(file_name, compression='gzip') 53 | file_name = file_name.replace('.csv.gz', '.parquet') 54 | df.to_parquet(file_name, engine='pyarrow') 55 | print(f"Parquet: {file_name}") 56 | 57 | # upload it to gcs 58 | upload_to_gcs(BUCKET, f"{service}/{file_name}", file_name) 59 | print(f"GCS: {service}/{file_name}") 60 | 61 | 62 | web_to_gcs('2019', 'green') 63 | web_to_gcs('2020', 'green') 64 | # web_to_gcs('2019', 'yellow') 65 | # web_to_gcs('2020', 'yellow') 66 | 67 | -------------------------------------------------------------------------------- /04-analytics-engineering/docker_setup/docker-compose.yaml: -------------------------------------------------------------------------------- 1 | version: '3' 2 | services: 3 | dbt-bq-dtc: 4 | build: 5 | context: . 6 | target: dbt-bigquery 7 | image: dbt/bigquery 8 | volumes: 9 | - .:/usr/app 10 | - ~/.dbt/:/root/.dbt/ 11 | - ~/.google/credentials/google_credentials.json:/.google/credentials/google_credentials.json 12 | network_mode: host -------------------------------------------------------------------------------- /04-analytics-engineering/taxi_rides_ny/.gitignore: -------------------------------------------------------------------------------- 1 | # you shouldn't commit these into source control 2 | # these are the default directory names, adjust/add to fit your needs 3 | target/ 4 | dbt_packages/ 5 | logs/ 6 | -------------------------------------------------------------------------------- /04-analytics-engineering/taxi_rides_ny/.gitkeep: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/DataTalksClub/data-engineering-zoomcamp/76fed2419e917176369a0d3a2eeff6c936f87286/04-analytics-engineering/taxi_rides_ny/.gitkeep -------------------------------------------------------------------------------- /04-analytics-engineering/taxi_rides_ny/README.md: -------------------------------------------------------------------------------- 1 | Welcome to your new dbt project! 2 | 3 | ### How to run this project 4 | ### About the project 5 | This project is based in [dbt starter project](https://github.com/dbt-labs/dbt-starter-project) (generated by running `dbt init`) 6 | Try running the following commands: 7 | - dbt run 8 | - dbt test 9 | 10 | A project includes the following files: 11 | - dbt_project.yml: file used to configure the dbt project. 
If you are using dbt locally, make sure the profile here matches the one setup during installation in ~/.dbt/profiles.yml 12 | - *.yml files under folders models, data, macros: documentation files 13 | - csv files in the data folder: these will be our sources, files described above 14 | - Files inside folder models: The sql files contain the scripts to run our models, this will cover staging, core and a datamarts models. At the end, these models will follow this structure: 15 | 16 | ![image](https://user-images.githubusercontent.com/4315804/152691312-e71b56a4-53ff-4884-859c-c9090dbd0db8.png) 17 | 18 | 19 | #### Workflow 20 | ![image](https://user-images.githubusercontent.com/4315804/148699280-964c4e0b-e685-4c0f-a266-4f3e097156c9.png) 21 | 22 | #### Execution 23 | After having installed the required tools and cloning this repo, execute the following commands: 24 | 25 | 1. Change into the project's directory from the command line: `$ cd [..]/taxi_rides_ny` 26 | 2. Load the CSVs into the database. This materializes the CSVs as tables in your target schema: `$ dbt seed` 27 | 3. Run the models: `$ dbt run` 28 | 4. Test your data: `$ dbt test` 29 | _Alternative: use `$ dbt build` to execute with one command the 3 steps above together_ 30 | 5. Generate documentation for the project: `$ dbt docs generate` 31 | 6. View the documentation for the project, this step should open the documentation page on a webserver, but it can also be accessed from http://localhost:8080 : `$ dbt docs serve` 32 | 33 | ### dbt resources: 34 | - Learn more about dbt [in the docs](https://docs.getdbt.com/docs/introduction) 35 | - Check out [Discourse](https://discourse.getdbt.com/) for commonly asked questions and answers 36 | - Join the [chat](http://slack.getdbt.com/) on Slack for live discussions and support 37 | - Find [dbt events](https://events.getdbt.com) near you 38 | - Check out [the blog](https://blog.getdbt.com/) for the latest news on dbt's development and best practices -------------------------------------------------------------------------------- /04-analytics-engineering/taxi_rides_ny/analyses/.gitkeep: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/DataTalksClub/data-engineering-zoomcamp/76fed2419e917176369a0d3a2eeff6c936f87286/04-analytics-engineering/taxi_rides_ny/analyses/.gitkeep -------------------------------------------------------------------------------- /04-analytics-engineering/taxi_rides_ny/analyses/hack-load-data.sql: -------------------------------------------------------------------------------- 1 | -- MAKE SURE YOU REPLACE taxi-rides-ny-339813-412521 WITH THE NAME OF YOUR DATASET! 2 | -- When you run the query, only run 5 of the ALTER TABLE statements at one time (by highlighting only 5). 3 | -- Otherwise BigQuery will say too many alterations to the table are being made. 
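-- Optional sanity check (not part of the original script; `<your-project>` is a placeholder):
-- after the CREATE TABLE / INSERT statements below finish, verify the row counts, e.g.
-- SELECT 'green' AS service, COUNT(*) AS row_count FROM `<your-project>.trips_data_all.green_tripdata`
-- UNION ALL
-- SELECT 'yellow', COUNT(*) FROM `<your-project>.trips_data_all.yellow_tripdata`;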
4 | 5 | CREATE TABLE `taxi-rides-ny-339813-412521.trips_data_all.green_tripdata` as 6 | SELECT * FROM `bigquery-public-data.new_york_taxi_trips.tlc_green_trips_2019`; 7 | 8 | 9 | CREATE TABLE `taxi-rides-ny-339813-412521.trips_data_all.yellow_tripdata` as 10 | SELECT * FROM `bigquery-public-data.new_york_taxi_trips.tlc_yellow_trips_2019`; 11 | 12 | insert into `taxi-rides-ny-339813-412521.trips_data_all.green_tripdata` 13 | SELECT * FROM `bigquery-public-data.new_york_taxi_trips.tlc_green_trips_2020` ; 14 | 15 | 16 | insert into `taxi-rides-ny-339813-412521.trips_data_all.yellow_tripdata` 17 | SELECT * FROM `bigquery-public-data.new_york_taxi_trips.tlc_yellow_trips_2020`; 18 | 19 | -- Fixes yellow table schema 20 | ALTER TABLE `taxi-rides-ny-339813-412521.trips_data_all.yellow_tripdata` 21 | RENAME COLUMN vendor_id TO VendorID; 22 | ALTER TABLE `taxi-rides-ny-339813-412521.trips_data_all.yellow_tripdata` 23 | RENAME COLUMN pickup_datetime TO tpep_pickup_datetime; 24 | ALTER TABLE `taxi-rides-ny-339813-412521.trips_data_all.yellow_tripdata` 25 | RENAME COLUMN dropoff_datetime TO tpep_dropoff_datetime; 26 | ALTER TABLE `taxi-rides-ny-339813-412521.trips_data_all.yellow_tripdata` 27 | RENAME COLUMN rate_code TO RatecodeID; 28 | ALTER TABLE `taxi-rides-ny-339813-412521.trips_data_all.yellow_tripdata` 29 | RENAME COLUMN imp_surcharge TO improvement_surcharge; 30 | ALTER TABLE `taxi-rides-ny-339813-412521.trips_data_all.yellow_tripdata` 31 | RENAME COLUMN pickup_location_id TO PULocationID; 32 | ALTER TABLE `taxi-rides-ny-339813-412521.trips_data_all.yellow_tripdata` 33 | RENAME COLUMN dropoff_location_id TO DOLocationID; 34 | 35 | -- Fixes green table schema 36 | ALTER TABLE `taxi-rides-ny-339813-412521.trips_data_all.green_tripdata` 37 | RENAME COLUMN vendor_id TO VendorID; 38 | ALTER TABLE `taxi-rides-ny-339813-412521.trips_data_all.green_tripdata` 39 | RENAME COLUMN pickup_datetime TO lpep_pickup_datetime; 40 | ALTER TABLE `taxi-rides-ny-339813-412521.trips_data_all.green_tripdata` 41 | RENAME COLUMN dropoff_datetime TO lpep_dropoff_datetime; 42 | ALTER TABLE `taxi-rides-ny-339813-412521.trips_data_all.green_tripdata` 43 | RENAME COLUMN rate_code TO RatecodeID; 44 | ALTER TABLE `taxi-rides-ny-339813-412521.trips_data_all.green_tripdata` 45 | RENAME COLUMN imp_surcharge TO improvement_surcharge; 46 | ALTER TABLE `taxi-rides-ny-339813-412521.trips_data_all.green_tripdata` 47 | RENAME COLUMN pickup_location_id TO PULocationID; 48 | ALTER TABLE `taxi-rides-ny-339813-412521.trips_data_all.green_tripdata` 49 | RENAME COLUMN dropoff_location_id TO DOLocationID; 50 | -------------------------------------------------------------------------------- /04-analytics-engineering/taxi_rides_ny/dbt_project.yml: -------------------------------------------------------------------------------- 1 | 2 | # Name your project! Project names should contain only lowercase characters 3 | # and underscores. A good package name should reflect your organization's 4 | # name or the intended use of these models 5 | name: 'taxi_rides_ny' 6 | version: '1.0.0' 7 | config-version: 2 8 | 9 | # This setting configures which "profile" dbt uses for this project. 10 | profile: 'default' 11 | 12 | # These configurations specify where dbt should look for different types of files. 13 | # The `model-paths` config, for example, states that models in this project can be 14 | # found in the "models/" directory. You probably won't need to change these! 
15 | model-paths: ["models"] 16 | analysis-paths: ["analyses"] 17 | test-paths: ["tests"] 18 | seed-paths: ["seeds"] 19 | macro-paths: ["macros"] 20 | snapshot-paths: ["snapshots"] 21 | 22 | target-path: "target" # directory which will store compiled SQL files 23 | clean-targets: # directories to be removed by `dbt clean` 24 | - "target" 25 | - "dbt_packages" 26 | 27 | 28 | # Configuring models 29 | # Full documentation: https://docs.getdbt.com/docs/configuring-models 30 | 31 | # In dbt, the default materialization for a model is a view. This means, when you run 32 | # dbt run or dbt build, all of your models will be built as a view in your data platform. 33 | # The configuration below will override this setting for models in the example folder to 34 | # instead be materialized as tables. Any models you add to the root of the models folder will 35 | # continue to be built as views. These settings can be overridden in the individual model files 36 | # using the `{{ config(...) }}` macro. 37 | 38 | models: 39 | taxi_rides_ny: 40 | # Applies to all files under models/.../ 41 | staging: 42 | materialized: view 43 | core: 44 | materialized: table 45 | vars: 46 | payment_type_values: [1, 2, 3, 4, 5, 6] 47 | 48 | seeds: 49 | taxi_rides_ny: 50 | taxi_zone_lookup: 51 | +column_types: 52 | locationid: numeric -------------------------------------------------------------------------------- /04-analytics-engineering/taxi_rides_ny/macros/.gitkeep: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/DataTalksClub/data-engineering-zoomcamp/76fed2419e917176369a0d3a2eeff6c936f87286/04-analytics-engineering/taxi_rides_ny/macros/.gitkeep -------------------------------------------------------------------------------- /04-analytics-engineering/taxi_rides_ny/macros/get_payment_type_description.sql: -------------------------------------------------------------------------------- 1 | {# 2 | This macro returns the description of the payment_type 3 | #} 4 | 5 | {% macro get_payment_type_description(payment_type) -%} 6 | 7 | case {{ dbt.safe_cast("payment_type", api.Column.translate_type("integer")) }} 8 | when 1 then 'Credit card' 9 | when 2 then 'Cash' 10 | when 3 then 'No charge' 11 | when 4 then 'Dispute' 12 | when 5 then 'Unknown' 13 | when 6 then 'Voided trip' 14 | else 'EMPTY' 15 | end 16 | 17 | {%- endmacro %} -------------------------------------------------------------------------------- /04-analytics-engineering/taxi_rides_ny/macros/macros_properties.yml: -------------------------------------------------------------------------------- 1 | version: 2 2 | 3 | macros: 4 | - name: get_payment_type_description 5 | description: > 6 | This macro receives a payment_type and returns the corresponding description. 7 | arguments: 8 | - name: payment_type 9 | type: int 10 | description: > 11 | payment_type value. 
12 | Must be one of the accepted values, otherwise the macro will return null -------------------------------------------------------------------------------- /04-analytics-engineering/taxi_rides_ny/models/core/dim_zones.sql: -------------------------------------------------------------------------------- 1 | {{ config(materialized='table') }} 2 | 3 | select 4 | locationid, 5 | borough, 6 | zone, 7 | replace(service_zone,'Boro','Green') as service_zone 8 | from {{ ref('taxi_zone_lookup') }} -------------------------------------------------------------------------------- /04-analytics-engineering/taxi_rides_ny/models/core/dm_monthly_zone_revenue.sql: -------------------------------------------------------------------------------- 1 | {{ config(materialized='table') }} 2 | 3 | with trips_data as ( 4 | select * from {{ ref('fact_trips') }} 5 | ) 6 | select 7 | -- Revenue grouping 8 | pickup_zone as revenue_zone, 9 | {{ dbt.date_trunc("month", "pickup_datetime") }} as revenue_month, 10 | 11 | service_type, 12 | 13 | -- Revenue calculation 14 | sum(fare_amount) as revenue_monthly_fare, 15 | sum(extra) as revenue_monthly_extra, 16 | sum(mta_tax) as revenue_monthly_mta_tax, 17 | sum(tip_amount) as revenue_monthly_tip_amount, 18 | sum(tolls_amount) as revenue_monthly_tolls_amount, 19 | sum(ehail_fee) as revenue_monthly_ehail_fee, 20 | sum(improvement_surcharge) as revenue_monthly_improvement_surcharge, 21 | sum(total_amount) as revenue_monthly_total_amount, 22 | 23 | -- Additional calculations 24 | count(tripid) as total_monthly_trips, 25 | avg(passenger_count) as avg_monthly_passenger_count, 26 | avg(trip_distance) as avg_monthly_trip_distance 27 | 28 | from trips_data 29 | group by 1,2,3 -------------------------------------------------------------------------------- /04-analytics-engineering/taxi_rides_ny/models/core/fact_trips.sql: -------------------------------------------------------------------------------- 1 | {{ 2 | config( 3 | materialized='table' 4 | ) 5 | }} 6 | 7 | with green_tripdata as ( 8 | select *, 9 | 'Green' as service_type 10 | from {{ ref('stg_green_tripdata') }} 11 | ), 12 | yellow_tripdata as ( 13 | select *, 14 | 'Yellow' as service_type 15 | from {{ ref('stg_yellow_tripdata') }} 16 | ), 17 | trips_unioned as ( 18 | select * from green_tripdata 19 | union all 20 | select * from yellow_tripdata 21 | ), 22 | dim_zones as ( 23 | select * from {{ ref('dim_zones') }} 24 | where borough != 'Unknown' 25 | ) 26 | select trips_unioned.tripid, 27 | trips_unioned.vendorid, 28 | trips_unioned.service_type, 29 | trips_unioned.ratecodeid, 30 | trips_unioned.pickup_locationid, 31 | pickup_zone.borough as pickup_borough, 32 | pickup_zone.zone as pickup_zone, 33 | trips_unioned.dropoff_locationid, 34 | dropoff_zone.borough as dropoff_borough, 35 | dropoff_zone.zone as dropoff_zone, 36 | trips_unioned.pickup_datetime, 37 | trips_unioned.dropoff_datetime, 38 | trips_unioned.store_and_fwd_flag, 39 | trips_unioned.passenger_count, 40 | trips_unioned.trip_distance, 41 | trips_unioned.trip_type, 42 | trips_unioned.fare_amount, 43 | trips_unioned.extra, 44 | trips_unioned.mta_tax, 45 | trips_unioned.tip_amount, 46 | trips_unioned.tolls_amount, 47 | trips_unioned.ehail_fee, 48 | trips_unioned.improvement_surcharge, 49 | trips_unioned.total_amount, 50 | trips_unioned.payment_type, 51 | trips_unioned.payment_type_description 52 | from trips_unioned 53 | inner join dim_zones as pickup_zone 54 | on trips_unioned.pickup_locationid = pickup_zone.locationid 55 | inner join dim_zones as dropoff_zone 56 | 
on trips_unioned.dropoff_locationid = dropoff_zone.locationid -------------------------------------------------------------------------------- /04-analytics-engineering/taxi_rides_ny/models/staging/stg_green_tripdata.sql: -------------------------------------------------------------------------------- 1 | {{ 2 | config( 3 | materialized='view' 4 | ) 5 | }} 6 | 7 | with tripdata as 8 | ( 9 | select *, 10 | row_number() over(partition by vendorid, lpep_pickup_datetime) as rn 11 | from {{ source('staging','green_tripdata') }} 12 | where vendorid is not null 13 | ) 14 | select 15 | -- identifiers 16 | {{ dbt_utils.generate_surrogate_key(['vendorid', 'lpep_pickup_datetime']) }} as tripid, 17 | {{ dbt.safe_cast("vendorid", api.Column.translate_type("integer")) }} as vendorid, 18 | {{ dbt.safe_cast("ratecodeid", api.Column.translate_type("integer")) }} as ratecodeid, 19 | {{ dbt.safe_cast("pulocationid", api.Column.translate_type("integer")) }} as pickup_locationid, 20 | {{ dbt.safe_cast("dolocationid", api.Column.translate_type("integer")) }} as dropoff_locationid, 21 | 22 | -- timestamps 23 | cast(lpep_pickup_datetime as timestamp) as pickup_datetime, 24 | cast(lpep_dropoff_datetime as timestamp) as dropoff_datetime, 25 | 26 | -- trip info 27 | store_and_fwd_flag, 28 | {{ dbt.safe_cast("passenger_count", api.Column.translate_type("integer")) }} as passenger_count, 29 | cast(trip_distance as numeric) as trip_distance, 30 | {{ dbt.safe_cast("trip_type", api.Column.translate_type("integer")) }} as trip_type, 31 | 32 | -- payment info 33 | cast(fare_amount as numeric) as fare_amount, 34 | cast(extra as numeric) as extra, 35 | cast(mta_tax as numeric) as mta_tax, 36 | cast(tip_amount as numeric) as tip_amount, 37 | cast(tolls_amount as numeric) as tolls_amount, 38 | cast(ehail_fee as numeric) as ehail_fee, 39 | cast(improvement_surcharge as numeric) as improvement_surcharge, 40 | cast(total_amount as numeric) as total_amount, 41 | coalesce({{ dbt.safe_cast("payment_type", api.Column.translate_type("integer")) }},0) as payment_type, 42 | {{ get_payment_type_description("payment_type") }} as payment_type_description 43 | from tripdata 44 | where rn = 1 45 | 46 | 47 | -- dbt build --select --vars '{'is_test_run': 'false'}' 48 | {% if var('is_test_run', default=true) %} 49 | 50 | limit 100 51 | 52 | {% endif %} -------------------------------------------------------------------------------- /04-analytics-engineering/taxi_rides_ny/models/staging/stg_yellow_tripdata.sql: -------------------------------------------------------------------------------- 1 | {{ config(materialized='view') }} 2 | 3 | with tripdata as 4 | ( 5 | select *, 6 | row_number() over(partition by vendorid, tpep_pickup_datetime) as rn 7 | from {{ source('staging','yellow_tripdata') }} 8 | where vendorid is not null 9 | ) 10 | select 11 | -- identifiers 12 | {{ dbt_utils.generate_surrogate_key(['vendorid', 'tpep_pickup_datetime']) }} as tripid, 13 | {{ dbt.safe_cast("vendorid", api.Column.translate_type("integer")) }} as vendorid, 14 | {{ dbt.safe_cast("ratecodeid", api.Column.translate_type("integer")) }} as ratecodeid, 15 | {{ dbt.safe_cast("pulocationid", api.Column.translate_type("integer")) }} as pickup_locationid, 16 | {{ dbt.safe_cast("dolocationid", api.Column.translate_type("integer")) }} as dropoff_locationid, 17 | 18 | -- timestamps 19 | cast(tpep_pickup_datetime as timestamp) as pickup_datetime, 20 | cast(tpep_dropoff_datetime as timestamp) as dropoff_datetime, 21 | 22 | -- trip info 23 | store_and_fwd_flag, 24 | {{ 
dbt.safe_cast("passenger_count", api.Column.translate_type("integer")) }} as passenger_count, 25 | cast(trip_distance as numeric) as trip_distance, 26 | -- yellow cabs are always street-hail 27 | 1 as trip_type, 28 | 29 | -- payment info 30 | cast(fare_amount as numeric) as fare_amount, 31 | cast(extra as numeric) as extra, 32 | cast(mta_tax as numeric) as mta_tax, 33 | cast(tip_amount as numeric) as tip_amount, 34 | cast(tolls_amount as numeric) as tolls_amount, 35 | cast(0 as numeric) as ehail_fee, 36 | cast(improvement_surcharge as numeric) as improvement_surcharge, 37 | cast(total_amount as numeric) as total_amount, 38 | coalesce({{ dbt.safe_cast("payment_type", api.Column.translate_type("integer")) }},0) as payment_type, 39 | {{ get_payment_type_description('payment_type') }} as payment_type_description 40 | from tripdata 41 | where rn = 1 42 | 43 | -- dbt build --select --vars '{'is_test_run: false}' 44 | {% if var('is_test_run', default=true) %} 45 | 46 | limit 100 47 | 48 | {% endif %} -------------------------------------------------------------------------------- /04-analytics-engineering/taxi_rides_ny/package-lock.yml: -------------------------------------------------------------------------------- 1 | packages: 2 | - package: dbt-labs/dbt_utils 3 | version: 1.1.1 4 | - package: dbt-labs/codegen 5 | version: 0.12.1 6 | sha1_hash: d974113b0f072cce35300077208f38581075ab40 7 | -------------------------------------------------------------------------------- /04-analytics-engineering/taxi_rides_ny/packages.yml: -------------------------------------------------------------------------------- 1 | packages: 2 | - package: dbt-labs/dbt_utils 3 | version: 1.1.1 4 | - package: dbt-labs/codegen 5 | version: 0.12.1 -------------------------------------------------------------------------------- /04-analytics-engineering/taxi_rides_ny/seeds/.gitkeep: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/DataTalksClub/data-engineering-zoomcamp/76fed2419e917176369a0d3a2eeff6c936f87286/04-analytics-engineering/taxi_rides_ny/seeds/.gitkeep -------------------------------------------------------------------------------- /04-analytics-engineering/taxi_rides_ny/seeds/seeds_properties.yml: -------------------------------------------------------------------------------- 1 | version: 2 2 | 3 | seeds: 4 | - name: taxi_zone_lookup 5 | description: > 6 | Taxi Zones roughly based on NYC Department of City Planning's Neighborhood 7 | Tabulation Areas (NTAs) and are meant to approximate neighborhoods, so you can see which 8 | neighborhood a passenger was picked up in, and which neighborhood they were dropped off in. 
9 | Includes associated service_zone (EWR, Boro Zone, Yellow Zone) -------------------------------------------------------------------------------- /04-analytics-engineering/taxi_rides_ny/snapshots/.gitkeep: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/DataTalksClub/data-engineering-zoomcamp/76fed2419e917176369a0d3a2eeff6c936f87286/04-analytics-engineering/taxi_rides_ny/snapshots/.gitkeep -------------------------------------------------------------------------------- /05-batch/.gitignore: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/DataTalksClub/data-engineering-zoomcamp/76fed2419e917176369a0d3a2eeff6c936f87286/05-batch/.gitignore -------------------------------------------------------------------------------- /05-batch/code/download_data.sh: -------------------------------------------------------------------------------- 1 | 2 | set -e 3 | 4 | TAXI_TYPE=$1 # "yellow" 5 | YEAR=$2 # 2020 6 | 7 | URL_PREFIX="https://github.com/DataTalksClub/nyc-tlc-data/releases/download" 8 | 9 | for MONTH in {1..12}; do 10 | FMONTH=`printf "%02d" ${MONTH}` 11 | 12 | URL="${URL_PREFIX}/${TAXI_TYPE}/${TAXI_TYPE}_tripdata_${YEAR}-${FMONTH}.csv.gz" 13 | 14 | LOCAL_PREFIX="data/raw/${TAXI_TYPE}/${YEAR}/${FMONTH}" 15 | LOCAL_FILE="${TAXI_TYPE}_tripdata_${YEAR}_${FMONTH}.csv.gz" 16 | LOCAL_PATH="${LOCAL_PREFIX}/${LOCAL_FILE}" 17 | 18 | echo "downloading ${URL} to ${LOCAL_PATH}" 19 | mkdir -p ${LOCAL_PREFIX} 20 | wget ${URL} -O ${LOCAL_PATH} 21 | 22 | done 23 | -------------------------------------------------------------------------------- /05-batch/setup/config/core-site.xml: -------------------------------------------------------------------------------- 1 | <?xml version="1.0" encoding="UTF-8"?> 2 | <?xml-stylesheet type="text/xsl" href="configuration.xsl"?> 3 | 4 | <configuration> 5 | <property> 6 | <name>fs.AbstractFileSystem.gs.impl</name> 7 | <value>com.google.cloud.hadoop.fs.gcs.GoogleHadoopFS</value> 8 | </property> 9 | <property> 10 | <name>fs.gs.impl</name> 11 | <value>com.google.cloud.hadoop.fs.gcs.GoogleHadoopFileSystem</value> 12 | </property> 13 | <property> 14 | <name>fs.gs.auth.service.account.json.keyfile</name> 15 | <value>/home/alexey/.google/credentials/google_credentials.json</value> 16 | </property> 17 | <property> 18 | <name>fs.gs.auth.service.account.enable</name> 19 | <value>true</value> 20 | </property> 21 | </configuration> -------------------------------------------------------------------------------- /05-batch/setup/config/spark-defaults.conf: -------------------------------------------------------------------------------- 1 | spark.master yarn 2 | spark.hadoop.google.cloud.auth.service.account.enable true 3 | spark.hadoop.google.cloud.auth.service.account.json.keyfile /home/alexey 4 | -------------------------------------------------------------------------------- /05-batch/setup/config/spark.dockerfile: -------------------------------------------------------------------------------- 1 | FROM library/openjdk:11 -------------------------------------------------------------------------------- /05-batch/setup/linux.md: -------------------------------------------------------------------------------- 1 | 2 | ## Linux 3 | 4 | Here we'll show you how to install Spark 3.3.2 for Linux. 5 | We tested it on Ubuntu 20.04 (also WSL), but it should work 6 | for other Linux distros as well 7 | 8 | 9 | ### Installing Java 10 | 11 | Download OpenJDK 11 or Oracle JDK 11 (it's important that the version is 11 - Spark requires 8 or 11) 12 | 13 | We'll use [OpenJDK](https://jdk.java.net/archive/) 14 | 15 | Download it (e.g.
to `~/spark`): 16 | 17 | ``` 18 | wget https://download.java.net/java/GA/jdk11/9/GPL/openjdk-11.0.2_linux-x64_bin.tar.gz 19 | ``` 20 | 21 | Unpack it: 22 | 23 | ```bash 24 | tar xzfv openjdk-11.0.2_linux-x64_bin.tar.gz 25 | ``` 26 | 27 | define `JAVA_HOME` and add it to `PATH`: 28 | 29 | ```bash 30 | export JAVA_HOME="${HOME}/spark/jdk-11.0.2" 31 | export PATH="${JAVA_HOME}/bin:${PATH}" 32 | ``` 33 | 34 | check that it works: 35 | 36 | ```bash 37 | java --version 38 | ``` 39 | 40 | Output: 41 | 42 | ``` 43 | openjdk 11.0.2 2019-01-15 44 | OpenJDK Runtime Environment 18.9 (build 11.0.2+9) 45 | OpenJDK 64-Bit Server VM 18.9 (build 11.0.2+9, mixed mode) 46 | ``` 47 | 48 | Remove the archive: 49 | 50 | ```bash 51 | rm openjdk-11.0.2_linux-x64_bin.tar.gz 52 | ``` 53 | 54 | ### Installing Spark 55 | 56 | 57 | Download Spark. Use 3.3.2 version: 58 | 59 | ```bash 60 | wget https://archive.apache.org/dist/spark/spark-3.3.2/spark-3.3.2-bin-hadoop3.tgz 61 | ``` 62 | 63 | Unpack: 64 | 65 | ```bash 66 | tar xzfv spark-3.3.2-bin-hadoop3.tgz 67 | ``` 68 | 69 | Remove the archive: 70 | 71 | ```bash 72 | rm spark-3.3.2-bin-hadoop3.tgz 73 | ``` 74 | 75 | Add it to `PATH`: 76 | 77 | ```bash 78 | export SPARK_HOME="${HOME}/spark/spark-3.3.2-bin-hadoop3" 79 | export PATH="${SPARK_HOME}/bin:${PATH}" 80 | ``` 81 | 82 | ### Testing Spark 83 | 84 | Execute `spark-shell` and run the following: 85 | 86 | ```scala 87 | val data = 1 to 10000 88 | val distData = sc.parallelize(data) 89 | distData.filter(_ < 10).collect() 90 | ``` 91 | 92 | ### PySpark 93 | 94 | It's the same for all platforms. Go to [pyspark.md](pyspark.md). 95 | -------------------------------------------------------------------------------- /05-batch/setup/pyspark.md: -------------------------------------------------------------------------------- 1 | 2 | ## PySpark 3 | 4 | This document assumes you already have python. 5 | 6 | To run PySpark, we first need to add it to `PYTHONPATH`: 7 | 8 | ```bash 9 | export PYTHONPATH="${SPARK_HOME}/python/:$PYTHONPATH" 10 | export PYTHONPATH="${SPARK_HOME}/python/lib/py4j-0.10.9.5-src.zip:$PYTHONPATH" 11 | ``` 12 | 13 | Make sure that the version under `${SPARK_HOME}/python/lib/` matches the filename of py4j or you will 14 | encounter `ModuleNotFoundError: No module named 'py4j'` while executing `import pyspark`. 15 | 16 | For example, if the file under `${SPARK_HOME}/python/lib/` is `py4j-0.10.9.3-src.zip`, then the 17 | `export PYTHONPATH` statement above should be changed to 18 | 19 | ```bash 20 | export PYTHONPATH="${SPARK_HOME}/python/lib/py4j-0.10.9.3-src.zip:$PYTHONPATH" 21 | ``` 22 | 23 | On Windows, you may have to do path conversion from unix-style to windows-style: 24 | 25 | ```bash 26 | SPARK_WIN=`cygpath -w ${SPARK_HOME}` 27 | 28 | export PYTHONPATH="${SPARK_WIN}\\python\\" 29 | export PYTHONPATH="${SPARK_WIN}\\python\\lib\\py4j-0.10.9-src.zip;$PYTHONPATH" 30 | ``` 31 | 32 | Now you can run Jupyter or IPython to test if things work. Go to some other directory, e.g. `~/tmp`. 
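Before running the test below, it can help to confirm which py4j version your Spark distribution actually ships with (a quick check, assuming `SPARK_HOME` is already exported as described above):

```bash
ls "${SPARK_HOME}/python/lib/" | grep py4j
```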
33 | 34 | Download a CSV file that we'll use for testing: 35 | 36 | ```bash 37 | wget https://d37ci6vzurychx.cloudfront.net/misc/taxi_zone_lookup.csv 38 | ``` 39 | 40 | Now let's run `ipython` (or `jupyter notebook`) and execute: 41 | 42 | ```python 43 | import pyspark 44 | from pyspark.sql import SparkSession 45 | 46 | spark = SparkSession.builder \ 47 | .master("local[*]") \ 48 | .appName('test') \ 49 | .getOrCreate() 50 | 51 | df = spark.read \ 52 | .option("header", "true") \ 53 | .csv('taxi_zone_lookup.csv') 54 | 55 | df.show() 56 | ``` 57 | 58 | Test that writing works as well: 59 | 60 | ```python 61 | df.write.parquet('zones') 62 | ``` 63 | -------------------------------------------------------------------------------- /06-streaming/.gitignore: -------------------------------------------------------------------------------- 1 | week6_venv -------------------------------------------------------------------------------- /06-streaming/java/kafka_examples/.gitignore: -------------------------------------------------------------------------------- 1 | .gradle 2 | bin 3 | !src/main/resources/rides.csv 4 | 5 | build/classes 6 | build/generated 7 | build/libs 8 | build/reports 9 | build/resources 10 | build/test-results 11 | build/tmp 12 | -------------------------------------------------------------------------------- /06-streaming/java/kafka_examples/build.gradle: -------------------------------------------------------------------------------- 1 | plugins { 2 | id 'java' 3 | id "com.github.davidmc24.gradle.plugin.avro" version "1.5.0" 4 | } 5 | 6 | 7 | group 'org.example' 8 | version '1.0-SNAPSHOT' 9 | 10 | repositories { 11 | mavenCentral() 12 | maven { 13 | url "https://packages.confluent.io/maven" 14 | } 15 | } 16 | 17 | dependencies { 18 | implementation 'org.apache.kafka:kafka-clients:3.3.1' 19 | implementation 'com.opencsv:opencsv:5.7.1' 20 | implementation 'io.confluent:kafka-json-serializer:7.3.1' 21 | implementation 'org.apache.kafka:kafka-streams:3.3.1' 22 | implementation 'io.confluent:kafka-avro-serializer:7.3.1' 23 | implementation 'io.confluent:kafka-schema-registry-client:7.3.1' 24 | implementation 'io.confluent:kafka-streams-avro-serde:7.3.1' 25 | implementation "org.apache.avro:avro:1.11.0" 26 | testImplementation 'org.junit.jupiter:junit-jupiter-api:5.8.1' 27 | testRuntimeOnly 'org.junit.jupiter:junit-jupiter-engine:5.8.1' 28 | testImplementation 'org.apache.kafka:kafka-streams-test-utils:3.3.1' 29 | } 30 | 31 | sourceSets.main.java.srcDirs = ['build/generated-main-avro-java','src/main/java'] 32 | 33 | test { 34 | useJUnitPlatform() 35 | } 36 | 37 | -------------------------------------------------------------------------------- /06-streaming/java/kafka_examples/gradle/wrapper/gradle-wrapper.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/DataTalksClub/data-engineering-zoomcamp/76fed2419e917176369a0d3a2eeff6c936f87286/06-streaming/java/kafka_examples/gradle/wrapper/gradle-wrapper.jar -------------------------------------------------------------------------------- /06-streaming/java/kafka_examples/gradle/wrapper/gradle-wrapper.properties: -------------------------------------------------------------------------------- 1 | distributionBase=GRADLE_USER_HOME 2 | distributionPath=wrapper/dists 3 | distributionUrl=https\://services.gradle.org/distributions/gradle-7.5.1-bin.zip 4 | zipStoreBase=GRADLE_USER_HOME 5 | zipStorePath=wrapper/dists 6 | 
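With the wrapper configured as above, the examples can be built without a local Gradle installation. A typical invocation might look like the following (standard `java` plugin tasks; the Avro classes should be generated automatically as part of the build):

```bash
./gradlew build   # compiles the sources and generates the Avro classes under build/generated-main-avro-java
./gradlew test    # runs the Kafka Streams topology tests
```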
-------------------------------------------------------------------------------- /06-streaming/java/kafka_examples/settings.gradle: -------------------------------------------------------------------------------- 1 | pluginManagement { 2 | repositories { 3 | gradlePluginPortal() 4 | mavenCentral() 5 | } 6 | } 7 | rootProject.name = 'kafka_examples' -------------------------------------------------------------------------------- /06-streaming/java/kafka_examples/src/main/avro/rides.avsc: -------------------------------------------------------------------------------- 1 | { 2 | "type": "record", 3 | "name":"RideRecord", 4 | "namespace": "schemaregistry", 5 | "fields":[ 6 | {"name":"vendor_id","type":"string"}, 7 | {"name":"passenger_count","type":"int"}, 8 | {"name":"trip_distance","type":"double"} 9 | ] 10 | } -------------------------------------------------------------------------------- /06-streaming/java/kafka_examples/src/main/avro/rides_compatible.avsc: -------------------------------------------------------------------------------- 1 | { 2 | "type": "record", 3 | "name":"RideRecordCompatible", 4 | "namespace": "schemaregistry", 5 | "fields":[ 6 | {"name":"vendorId","type":"string"}, 7 | {"name":"passenger_count","type":"int"}, 8 | {"name":"trip_distance","type":"double"}, 9 | {"name":"pu_location_id", "type": [ "null", "long" ], "default": null} 10 | ] 11 | } -------------------------------------------------------------------------------- /06-streaming/java/kafka_examples/src/main/avro/rides_non_compatible.avsc: -------------------------------------------------------------------------------- 1 | { 2 | "type": "record", 3 | "name":"RideRecordNoneCompatible", 4 | "namespace": "schemaregistry", 5 | "fields":[ 6 | {"name":"vendorId","type":"int"}, 7 | {"name":"passenger_count","type":"int"}, 8 | {"name":"trip_distance","type":"double"} 9 | ] 10 | } -------------------------------------------------------------------------------- /06-streaming/java/kafka_examples/src/main/java/org/example/JsonConsumer.java: -------------------------------------------------------------------------------- 1 | package org.example; 2 | 3 | import org.apache.kafka.clients.consumer.ConsumerConfig; 4 | import org.apache.kafka.clients.consumer.ConsumerRecord; 5 | import org.apache.kafka.clients.consumer.KafkaConsumer; 6 | import org.apache.kafka.clients.producer.ProducerConfig; 7 | import org.example.data.Ride; 8 | 9 | import java.time.Duration; 10 | import java.time.temporal.ChronoUnit; 11 | import java.time.temporal.TemporalUnit; 12 | import java.util.List; 13 | import java.util.Properties; 14 | import io.confluent.kafka.serializers.KafkaJsonDeserializerConfig; 15 | public class JsonConsumer { 16 | 17 | private Properties props = new Properties(); 18 | private KafkaConsumer consumer; 19 | public JsonConsumer() { 20 | props.put(ConsumerConfig.BOOTSTRAP_SERVERS_CONFIG, "pkc-75m1o.europe-west3.gcp.confluent.cloud:9092"); 21 | props.put("security.protocol", "SASL_SSL"); 22 | props.put("sasl.jaas.config", "org.apache.kafka.common.security.plain.PlainLoginModule required username='"+Secrets.KAFKA_CLUSTER_KEY+"' password='"+Secrets.KAFKA_CLUSTER_SECRET+"';"); 23 | props.put("sasl.mechanism", "PLAIN"); 24 | props.put("client.dns.lookup", "use_all_dns_ips"); 25 | props.put("session.timeout.ms", "45000"); 26 | props.put(ConsumerConfig.KEY_DESERIALIZER_CLASS_CONFIG, "org.apache.kafka.common.serialization.StringDeserializer"); 27 | props.put(ConsumerConfig.VALUE_DESERIALIZER_CLASS_CONFIG, 
"io.confluent.kafka.serializers.KafkaJsonDeserializer"); 28 | props.put(ConsumerConfig.GROUP_ID_CONFIG, "kafka_tutorial_example.jsonconsumer.v2"); 29 | props.put(ConsumerConfig.AUTO_OFFSET_RESET_CONFIG, "earliest"); 30 | props.put(KafkaJsonDeserializerConfig.JSON_VALUE_TYPE, Ride.class); 31 | consumer = new KafkaConsumer(props); 32 | consumer.subscribe(List.of("rides")); 33 | 34 | } 35 | 36 | public void consumeFromKafka() { 37 | System.out.println("Consuming form kafka started"); 38 | var results = consumer.poll(Duration.of(1, ChronoUnit.SECONDS)); 39 | var i = 0; 40 | do { 41 | 42 | for(ConsumerRecord result: results) { 43 | System.out.println(result.value().DOLocationID); 44 | } 45 | results = consumer.poll(Duration.of(1, ChronoUnit.SECONDS)); 46 | System.out.println("RESULTS:::" + results.count()); 47 | i++; 48 | } 49 | while(!results.isEmpty() || i < 10); 50 | } 51 | 52 | public static void main(String[] args) { 53 | JsonConsumer jsonConsumer = new JsonConsumer(); 54 | jsonConsumer.consumeFromKafka(); 55 | } 56 | } 57 | -------------------------------------------------------------------------------- /06-streaming/java/kafka_examples/src/main/java/org/example/JsonKStream.java: -------------------------------------------------------------------------------- 1 | package org.example; 2 | 3 | import org.apache.kafka.clients.consumer.ConsumerConfig; 4 | import org.apache.kafka.common.serialization.Serdes; 5 | import org.apache.kafka.streams.KafkaStreams; 6 | import org.apache.kafka.streams.StreamsBuilder; 7 | import org.apache.kafka.streams.StreamsConfig; 8 | import org.apache.kafka.streams.Topology; 9 | import org.apache.kafka.streams.kstream.Consumed; 10 | import org.apache.kafka.streams.kstream.Produced; 11 | import org.example.customserdes.CustomSerdes; 12 | import org.example.data.Ride; 13 | 14 | import java.util.Properties; 15 | 16 | public class JsonKStream { 17 | private Properties props = new Properties(); 18 | 19 | public JsonKStream() { 20 | props.put(StreamsConfig.BOOTSTRAP_SERVERS_CONFIG, "pkc-75m1o.europe-west3.gcp.confluent.cloud:9092"); 21 | props.put("security.protocol", "SASL_SSL"); 22 | props.put("sasl.jaas.config", "org.apache.kafka.common.security.plain.PlainLoginModule required username='"+Secrets.KAFKA_CLUSTER_KEY+"' password='"+Secrets.KAFKA_CLUSTER_SECRET+"';"); 23 | props.put("sasl.mechanism", "PLAIN"); 24 | props.put("client.dns.lookup", "use_all_dns_ips"); 25 | props.put("session.timeout.ms", "45000"); 26 | props.put(StreamsConfig.APPLICATION_ID_CONFIG, "kafka_tutorial.kstream.count.plocation.v1"); 27 | props.put(ConsumerConfig.AUTO_OFFSET_RESET_CONFIG, "latest"); 28 | props.put(StreamsConfig.CACHE_MAX_BYTES_BUFFERING_CONFIG, 0); 29 | 30 | } 31 | 32 | public Topology createTopology() { 33 | StreamsBuilder streamsBuilder = new StreamsBuilder(); 34 | var ridesStream = streamsBuilder.stream("rides", Consumed.with(Serdes.String(), CustomSerdes.getSerde(Ride.class))); 35 | var puLocationCount = ridesStream.groupByKey().count().toStream(); 36 | puLocationCount.to("rides-pulocation-count", Produced.with(Serdes.String(), Serdes.Long())); 37 | return streamsBuilder.build(); 38 | } 39 | 40 | public void countPLocation() throws InterruptedException { 41 | var topology = createTopology(); 42 | var kStreams = new KafkaStreams(topology, props); 43 | kStreams.start(); 44 | while (kStreams.state() != KafkaStreams.State.RUNNING) { 45 | System.out.println(kStreams.state()); 46 | Thread.sleep(1000); 47 | } 48 | System.out.println(kStreams.state()); 49 | 
Runtime.getRuntime().addShutdownHook(new Thread(kStreams::close)); 50 | } 51 | 52 | public static void main(String[] args) throws InterruptedException { 53 | var object = new JsonKStream(); 54 | object.countPLocation(); 55 | } 56 | } 57 | -------------------------------------------------------------------------------- /06-streaming/java/kafka_examples/src/main/java/org/example/JsonKStreamWindow.java: -------------------------------------------------------------------------------- 1 | package org.example; 2 | 3 | import org.apache.kafka.clients.consumer.ConsumerConfig; 4 | import org.apache.kafka.common.serialization.Serdes; 5 | import org.apache.kafka.streams.KafkaStreams; 6 | import org.apache.kafka.streams.StreamsBuilder; 7 | import org.apache.kafka.streams.StreamsConfig; 8 | import org.apache.kafka.streams.Topology; 9 | import org.apache.kafka.streams.kstream.Consumed; 10 | import org.apache.kafka.streams.kstream.Produced; 11 | import org.apache.kafka.streams.kstream.TimeWindows; 12 | import org.apache.kafka.streams.kstream.WindowedSerdes; 13 | import org.example.customserdes.CustomSerdes; 14 | import org.example.data.Ride; 15 | 16 | import java.time.Duration; 17 | import java.time.temporal.ChronoUnit; 18 | import java.util.Properties; 19 | 20 | public class JsonKStreamWindow { 21 | private Properties props = new Properties(); 22 | 23 | public JsonKStreamWindow() { 24 | props.put(StreamsConfig.BOOTSTRAP_SERVERS_CONFIG, "pkc-75m1o.europe-west3.gcp.confluent.cloud:9092"); 25 | props.put("security.protocol", "SASL_SSL"); 26 | props.put("sasl.jaas.config", "org.apache.kafka.common.security.plain.PlainLoginModule required username='"+Secrets.KAFKA_CLUSTER_KEY+"' password='"+Secrets.KAFKA_CLUSTER_SECRET+"';"); 27 | props.put("sasl.mechanism", "PLAIN"); 28 | props.put("client.dns.lookup", "use_all_dns_ips"); 29 | props.put("session.timeout.ms", "45000"); 30 | props.put(StreamsConfig.APPLICATION_ID_CONFIG, "kafka_tutorial.kstream.count.plocation.v1"); 31 | props.put(ConsumerConfig.AUTO_OFFSET_RESET_CONFIG, "latest"); 32 | props.put(StreamsConfig.CACHE_MAX_BYTES_BUFFERING_CONFIG, 0); 33 | 34 | } 35 | 36 | public Topology createTopology() { 37 | StreamsBuilder streamsBuilder = new StreamsBuilder(); 38 | var ridesStream = streamsBuilder.stream("rides", Consumed.with(Serdes.String(), CustomSerdes.getSerde(Ride.class))); 39 | var puLocationCount = ridesStream.groupByKey() 40 | .windowedBy(TimeWindows.ofSizeAndGrace(Duration.ofSeconds(10), Duration.ofSeconds(5))) 41 | .count().toStream(); 42 | var windowSerde = WindowedSerdes.timeWindowedSerdeFrom(String.class, 10*1000); 43 | 44 | puLocationCount.to("rides-pulocation-window-count", Produced.with(windowSerde, Serdes.Long())); 45 | return streamsBuilder.build(); 46 | } 47 | 48 | public void countPLocationWindowed() { 49 | var topology = createTopology(); 50 | var kStreams = new KafkaStreams(topology, props); 51 | kStreams.start(); 52 | 53 | Runtime.getRuntime().addShutdownHook(new Thread(kStreams::close)); 54 | } 55 | 56 | public static void main(String[] args) { 57 | var object = new JsonKStreamWindow(); 58 | object.countPLocationWindowed(); 59 | } 60 | } 61 | -------------------------------------------------------------------------------- /06-streaming/java/kafka_examples/src/main/java/org/example/JsonProducerPickupLocation.java: -------------------------------------------------------------------------------- 1 | package org.example; 2 | 3 | import com.opencsv.exceptions.CsvException; 4 | import org.apache.kafka.clients.producer.KafkaProducer; 5 | 
import org.apache.kafka.clients.producer.ProducerConfig; 6 | import org.apache.kafka.clients.producer.ProducerRecord; 7 | import org.example.data.PickupLocation; 8 | 9 | import java.io.IOException; 10 | import java.time.LocalDateTime; 11 | import java.util.Properties; 12 | import java.util.concurrent.ExecutionException; 13 | 14 | public class JsonProducerPickupLocation { 15 | private Properties props = new Properties(); 16 | 17 | public JsonProducerPickupLocation() { 18 | props.put(ProducerConfig.BOOTSTRAP_SERVERS_CONFIG, "pkc-75m1o.europe-west3.gcp.confluent.cloud:9092"); 19 | props.put("security.protocol", "SASL_SSL"); 20 | props.put("sasl.jaas.config", "org.apache.kafka.common.security.plain.PlainLoginModule required username='"+Secrets.KAFKA_CLUSTER_KEY+"' password='"+Secrets.KAFKA_CLUSTER_SECRET+"';"); 21 | props.put("sasl.mechanism", "PLAIN"); 22 | props.put("client.dns.lookup", "use_all_dns_ips"); 23 | props.put("session.timeout.ms", "45000"); 24 | props.put(ProducerConfig.ACKS_CONFIG, "all"); 25 | props.put(ProducerConfig.KEY_SERIALIZER_CLASS_CONFIG, "org.apache.kafka.common.serialization.StringSerializer"); 26 | props.put(ProducerConfig.VALUE_SERIALIZER_CLASS_CONFIG, "io.confluent.kafka.serializers.KafkaJsonSerializer"); 27 | } 28 | 29 | public void publish(PickupLocation pickupLocation) throws ExecutionException, InterruptedException { 30 | KafkaProducer kafkaProducer = new KafkaProducer(props); 31 | var record = kafkaProducer.send(new ProducerRecord<>("rides_location", String.valueOf(pickupLocation.PULocationID), pickupLocation), (metadata, exception) -> { 32 | if (exception != null) { 33 | System.out.println(exception.getMessage()); 34 | } 35 | }); 36 | System.out.println(record.get().offset()); 37 | } 38 | 39 | 40 | public static void main(String[] args) throws IOException, CsvException, ExecutionException, InterruptedException { 41 | var producer = new JsonProducerPickupLocation(); 42 | producer.publish(new PickupLocation(186, LocalDateTime.now())); 43 | } 44 | } 45 | -------------------------------------------------------------------------------- /06-streaming/java/kafka_examples/src/main/java/org/example/Secrets.java: -------------------------------------------------------------------------------- 1 | package org.example; 2 | 3 | public class Secrets { 4 | public static final String KAFKA_CLUSTER_KEY = "REPLACE_WITH_YOUR_KAFKA_CLUSTER_KEY"; 5 | public static final String KAFKA_CLUSTER_SECRET = "REPLACE_WITH_YOUR_KAFKA_CLUSTER_SECRET"; 6 | 7 | public static final String SCHEMA_REGISTRY_KEY = "REPLACE_WITH_SCHEMA_REGISTRY_KEY"; 8 | public static final String SCHEMA_REGISTRY_SECRET = "REPLACE_WITH_SCHEMA_REGISTRY_SECRET"; 9 | 10 | } 11 | -------------------------------------------------------------------------------- /06-streaming/java/kafka_examples/src/main/java/org/example/Topics.java: -------------------------------------------------------------------------------- 1 | package org.example; 2 | 3 | public class Topics { 4 | public static final String INPUT_RIDE_TOPIC = "rides"; 5 | public static final String INPUT_RIDE_LOCATION_TOPIC = "rides_location"; 6 | public static final String OUTPUT_TOPIC = "vendor_info"; 7 | } 8 | -------------------------------------------------------------------------------- /06-streaming/java/kafka_examples/src/main/java/org/example/customserdes/CustomSerdes.java: -------------------------------------------------------------------------------- 1 | package org.example.customserdes; 2 | 3 | import 
io.confluent.kafka.serializers.AbstractKafkaAvroSerDeConfig; 4 | import io.confluent.kafka.serializers.KafkaJsonDeserializer; 5 | import io.confluent.kafka.serializers.KafkaJsonSerializer; 6 | import io.confluent.kafka.streams.serdes.avro.SpecificAvroSerde; 7 | import org.apache.avro.specific.SpecificRecordBase; 8 | import org.apache.kafka.common.serialization.Deserializer; 9 | import org.apache.kafka.common.serialization.Serde; 10 | import org.apache.kafka.common.serialization.Serdes; 11 | import org.apache.kafka.common.serialization.Serializer; 12 | import org.example.data.PickupLocation; 13 | import org.example.data.Ride; 14 | import org.example.data.VendorInfo; 15 | 16 | import java.util.HashMap; 17 | import java.util.Map; 18 | 19 | public class CustomSerdes { 20 | 21 | public static Serde getSerde(Class classOf) { 22 | Map serdeProps = new HashMap<>(); 23 | serdeProps.put("json.value.type", classOf); 24 | final Serializer mySerializer = new KafkaJsonSerializer<>(); 25 | mySerializer.configure(serdeProps, false); 26 | 27 | final Deserializer myDeserializer = new KafkaJsonDeserializer<>(); 28 | myDeserializer.configure(serdeProps, false); 29 | return Serdes.serdeFrom(mySerializer, myDeserializer); 30 | } 31 | 32 | public static SpecificAvroSerde getAvroSerde(boolean isKey, String schemaRegistryUrl) { 33 | var serde = new SpecificAvroSerde(); 34 | 35 | Map serdeProps = new HashMap<>(); 36 | serdeProps.put(AbstractKafkaAvroSerDeConfig.SCHEMA_REGISTRY_URL_CONFIG, schemaRegistryUrl); 37 | serde.configure(serdeProps, isKey); 38 | return serde; 39 | } 40 | 41 | 42 | } 43 | -------------------------------------------------------------------------------- /06-streaming/java/kafka_examples/src/main/java/org/example/data/PickupLocation.java: -------------------------------------------------------------------------------- 1 | package org.example.data; 2 | 3 | import java.time.LocalDateTime; 4 | 5 | public class PickupLocation { 6 | public PickupLocation(long PULocationID, LocalDateTime tpep_pickup_datetime) { 7 | this.PULocationID = PULocationID; 8 | this.tpep_pickup_datetime = tpep_pickup_datetime; 9 | } 10 | 11 | public PickupLocation() { 12 | } 13 | 14 | public long PULocationID; 15 | public LocalDateTime tpep_pickup_datetime; 16 | } 17 | -------------------------------------------------------------------------------- /06-streaming/java/kafka_examples/src/main/java/org/example/data/Ride.java: -------------------------------------------------------------------------------- 1 | package org.example.data; 2 | 3 | import java.nio.DoubleBuffer; 4 | import java.time.LocalDate; 5 | import java.time.LocalDateTime; 6 | import java.time.format.DateTimeFormatter; 7 | 8 | public class Ride { 9 | public Ride(String[] arr) { 10 | VendorID = arr[0]; 11 | tpep_pickup_datetime = LocalDateTime.parse(arr[1], DateTimeFormatter.ofPattern("yyyy-MM-dd HH:mm:ss")); 12 | tpep_dropoff_datetime = LocalDateTime.parse(arr[2], DateTimeFormatter.ofPattern("yyyy-MM-dd HH:mm:ss")); 13 | passenger_count = Integer.parseInt(arr[3]); 14 | trip_distance = Double.parseDouble(arr[4]); 15 | RatecodeID = Long.parseLong(arr[5]); 16 | store_and_fwd_flag = arr[6]; 17 | PULocationID = Long.parseLong(arr[7]); 18 | DOLocationID = Long.parseLong(arr[8]); 19 | payment_type = arr[9]; 20 | fare_amount = Double.parseDouble(arr[10]); 21 | extra = Double.parseDouble(arr[11]); 22 | mta_tax = Double.parseDouble(arr[12]); 23 | tip_amount = Double.parseDouble(arr[13]); 24 | tolls_amount = Double.parseDouble(arr[14]); 25 | improvement_surcharge = 
Double.parseDouble(arr[15]); 26 | total_amount = Double.parseDouble(arr[16]); 27 | congestion_surcharge = Double.parseDouble(arr[17]); 28 | } 29 | public Ride(){} 30 | public String VendorID; 31 | public LocalDateTime tpep_pickup_datetime; 32 | public LocalDateTime tpep_dropoff_datetime; 33 | public int passenger_count; 34 | public double trip_distance; 35 | public long RatecodeID; 36 | public String store_and_fwd_flag; 37 | public long PULocationID; 38 | public long DOLocationID; 39 | public String payment_type; 40 | public double fare_amount; 41 | public double extra; 42 | public double mta_tax; 43 | public double tip_amount; 44 | public double tolls_amount; 45 | public double improvement_surcharge; 46 | public double total_amount; 47 | public double congestion_surcharge; 48 | 49 | } 50 | -------------------------------------------------------------------------------- /06-streaming/java/kafka_examples/src/main/java/org/example/data/VendorInfo.java: -------------------------------------------------------------------------------- 1 | package org.example.data; 2 | 3 | import java.time.LocalDateTime; 4 | 5 | public class VendorInfo { 6 | 7 | public VendorInfo(String vendorID, long PULocationID, LocalDateTime pickupTime, LocalDateTime lastDropoffTime) { 8 | VendorID = vendorID; 9 | this.PULocationID = PULocationID; 10 | this.pickupTime = pickupTime; 11 | this.lastDropoffTime = lastDropoffTime; 12 | } 13 | 14 | public VendorInfo() { 15 | } 16 | 17 | public String VendorID; 18 | public long PULocationID; 19 | public LocalDateTime pickupTime; 20 | public LocalDateTime lastDropoffTime; 21 | } 22 | -------------------------------------------------------------------------------- /06-streaming/java/kafka_examples/src/test/java/org/example/helper/DataGeneratorHelper.java: -------------------------------------------------------------------------------- 1 | package org.example.helper; 2 | 3 | import org.example.data.PickupLocation; 4 | import org.example.data.Ride; 5 | import org.example.data.VendorInfo; 6 | 7 | import java.time.LocalDateTime; 8 | import java.time.format.DateTimeFormatter; 9 | import java.util.List; 10 | 11 | public class DataGeneratorHelper { 12 | public static Ride generateRide() { 13 | var arrivalTime = LocalDateTime.now().format(DateTimeFormatter.ofPattern("yyyy-MM-dd HH:mm:ss")); 14 | var departureTime = LocalDateTime.now().minusMinutes(30).format(DateTimeFormatter.ofPattern("yyyy-MM-dd HH:mm:ss")); 15 | return new Ride(new String[]{"1", departureTime, arrivalTime,"1","1.50","1","N","238","75","2","8","0.5","0.5","0","0","0.3","9.3","0"}); 16 | } 17 | 18 | public static PickupLocation generatePickUpLocation(long pickupLocationId) { 19 | return new PickupLocation(pickupLocationId, LocalDateTime.now()); 20 | } 21 | } 22 | -------------------------------------------------------------------------------- /06-streaming/ksqldb/commands.md: -------------------------------------------------------------------------------- 1 | ## KSQL DB Examples 2 | ### Create streams 3 | ```sql 4 | CREATE STREAM ride_streams ( 5 | VendorId varchar, 6 | trip_distance double, 7 | payment_type varchar 8 | ) WITH (KAFKA_TOPIC='rides', 9 | VALUE_FORMAT='JSON'); 10 | ``` 11 | 12 | ### Query stream 13 | ```sql 14 | select * from RIDE_STREAMS 15 | EMIT CHANGES; 16 | ``` 17 | 18 | ### Query stream count 19 | ```sql 20 | SELECT VENDORID, count(*) FROM RIDE_STREAMS 21 | GROUP BY VENDORID 22 | EMIT CHANGES; 23 | ``` 24 | 25 | ### Query stream with filters 26 | ```sql 27 | SELECT payment_type, count(*) FROM 
RIDE_STREAMS 28 | WHERE payment_type IN ('1', '2') 29 | GROUP BY payment_type 30 | EMIT CHANGES; 31 | ``` 32 | 33 | ### Query stream with window functions 34 | ```sql 35 | CREATE TABLE payment_type_sessions AS 36 | SELECT payment_type, 37 | count(*) 38 | FROM RIDE_STREAMS 39 | WINDOW SESSION (60 SECONDS) 40 | GROUP BY payment_type 41 | EMIT CHANGES; 42 | ``` 43 | 44 | ## KSQL documentation for details 45 | [KSQL DB Documentation](https://docs.ksqldb.io/en/latest/developer-guide/ksqldb-reference/quick-reference/) 46 | 47 | [KSQL DB Java client](https://docs.ksqldb.io/en/latest/developer-guide/ksqldb-clients/java-client/) -------------------------------------------------------------------------------- /06-streaming/pyflink/.gitignore: -------------------------------------------------------------------------------- 1 | data/ 2 | postgres-data 3 | # Byte-compiled / optimized / DLL files 4 | __pycache__/ 5 | *.py[cod] 6 | *$py.class 7 | 8 | # C extensions 9 | *.so 10 | 11 | # Distribution / packaging 12 | .Python 13 | build/ 14 | develop-eggs/ 15 | dist/ 16 | downloads/ 17 | eggs/ 18 | .eggs/ 19 | lib/ 20 | lib64/ 21 | parts/ 22 | sdist/ 23 | var/ 24 | wheels/ 25 | pip-wheel-metadata/ 26 | share/python-wheels/ 27 | *.egg-info/ 28 | .installed.cfg 29 | *.egg 30 | MANIFEST 31 | 32 | # PyInstaller 33 | # Usually these files are written by a python script from a template 34 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 35 | *.manifest 36 | *.spec 37 | 38 | # Installer logs 39 | pip-log.txt 40 | pip-delete-this-directory.txt 41 | 42 | # Unit test / coverage reports 43 | htmlcov/ 44 | .tox/ 45 | .nox/ 46 | .coverage 47 | .coverage.* 48 | .cache 49 | nosetests.xml 50 | coverage.xml 51 | *.cover 52 | *.py,cover 53 | .hypothesis/ 54 | .pytest_cache/ 55 | 56 | # Translations 57 | *.mo 58 | *.pot 59 | 60 | # Django stuff: 61 | *.log 62 | local_settings.py 63 | db.sqlite3 64 | db.sqlite3-journal 65 | 66 | # Flask stuff: 67 | instance/ 68 | .webassets-cache 69 | 70 | # Scrapy stuff: 71 | .scrapy 72 | 73 | # Sphinx documentation 74 | docs/_build/ 75 | 76 | # PyBuilder 77 | target/ 78 | 79 | # Jupyter Notebook 80 | .ipynb_checkpoints 81 | 82 | # IPython 83 | profile_default/ 84 | ipython_config.py 85 | 86 | # pyenv 87 | .python-version 88 | 89 | # pipenv 90 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 91 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 92 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 93 | # install all needed dependencies. 94 | #Pipfile.lock 95 | 96 | # PEP 582; used by e.g. 
github.com/David-OConnor/pyflow 97 | __pypackages__/ 98 | 99 | # Celery stuff 100 | celerybeat-schedule 101 | celerybeat.pid 102 | 103 | # SageMath parsed files 104 | *.sage.py 105 | 106 | # Environments 107 | .env 108 | .venv 109 | env/ 110 | venv/ 111 | ENV/ 112 | env.bak/ 113 | venv.bak/ 114 | 115 | # Spyder project settings 116 | .spyderproject 117 | .spyproject 118 | 119 | # Rope project settings 120 | .ropeproject 121 | 122 | # mkdocs documentation 123 | /site 124 | 125 | # mypy 126 | .mypy_cache/ 127 | .dmypy.json 128 | dmypy.json 129 | 130 | # Pyre type checker 131 | .pyre/ 132 | 133 | dump.sql 134 | 135 | # Personal workspace files 136 | .idea/* 137 | .vscode/* -------------------------------------------------------------------------------- /06-streaming/pyflink/Dockerfile.flink: -------------------------------------------------------------------------------- 1 | FROM --platform=linux/amd64 flink:1.16.0-scala_2.12-java8 2 | 3 | # install python3: it has updated Python to 3.9 in Debian 11 and so install Python 3.7 from source 4 | # it currently only supports Python 3.6, 3.7 and 3.8 in PyFlink officially. 5 | 6 | # ref: https://nightlies.apache.org/flink/flink-docs-release-1.16/docs/deployment/resource-providers/standalone/docker/#using-flink-python-on-docker 7 | 8 | RUN apt-get update -y && \ 9 | apt-get install -y build-essential libssl-dev zlib1g-dev libbz2-dev libffi-dev liblzma-dev && \ 10 | wget https://www.python.org/ftp/python/3.7.9/Python-3.7.9.tgz && \ 11 | tar -xvf Python-3.7.9.tgz && \ 12 | cd Python-3.7.9 && \ 13 | ./configure --without-tests --enable-shared && \ 14 | make -j6 && \ 15 | make install && \ 16 | ldconfig /usr/local/lib && \ 17 | cd .. && rm -f Python-3.7.9.tgz && rm -rf Python-3.7.9 && \ 18 | ln -s /usr/local/bin/python3 /usr/local/bin/python && \ 19 | apt-get clean && \ 20 | rm -rf /var/lib/apt/lists/* 21 | 22 | # install PyFlink 23 | COPY requirements.txt . 
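# requirements.txt pins apache-flink==1.16.0 so the Python API matches the Flink 1.16.0 base image above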
24 | RUN python -m pip install --upgrade pip; \ 25 | pip3 install --upgrade google-api-python-client; \ 26 | pip3 install -r requirements.txt --no-cache-dir; 27 | 28 | # Download connector libraries 29 | RUN wget -P /opt/flink/lib/ https://repo.maven.apache.org/maven2/org/apache/flink/flink-json/1.16.0/flink-json-1.16.0.jar; \ 30 | wget -P /opt/flink/lib/ https://repo.maven.apache.org/maven2/org/apache/flink/flink-sql-connector-kafka/1.16.0/flink-sql-connector-kafka-1.16.0.jar; \ 31 | wget -P /opt/flink/lib/ https://repo.maven.apache.org/maven2/org/apache/flink/flink-connector-jdbc/1.16.0/flink-connector-jdbc-1.16.0.jar; \ 32 | wget -P /opt/flink/lib/ https://repo1.maven.org/maven2/org/postgresql/postgresql/42.2.24/postgresql-42.2.24.jar; 33 | 34 | RUN echo "taskmanager.memory.jvm-metaspace.size: 512m" >> /opt/flink/conf/flink-conf.yaml; 35 | 36 | WORKDIR /opt/flink 37 | -------------------------------------------------------------------------------- /06-streaming/pyflink/LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2025 Sreela Das, Julie Scherer, Zach Wilson 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /06-streaming/pyflink/Makefile: -------------------------------------------------------------------------------- 1 | PLATFORM ?= linux/amd64 2 | 3 | # COLORS 4 | GREEN := $(shell tput -Txterm setaf 2) 5 | YELLOW := $(shell tput -Txterm setaf 3) 6 | WHITE := $(shell tput -Txterm setaf 7) 7 | RESET := $(shell tput -Txterm sgr0) 8 | 9 | 10 | TARGET_MAX_CHAR_NUM=20 11 | 12 | ## Show help with `make help` 13 | help: 14 | @echo '' 15 | @echo 'Usage:' 16 | @echo ' ${YELLOW}make${RESET} ${GREEN}${RESET}' 17 | @echo '' 18 | @echo 'Targets:' 19 | @awk '/^[a-zA-Z\-\_0-9]+:/ { \ 20 | helpMessage = match(lastLine, /^## (.*)/); \ 21 | if (helpMessage) { \ 22 | helpCommand = substr($$1, 0, index($$1, ":")-1); \ 23 | helpMessage = substr(lastLine, RSTART + 3, RLENGTH); \ 24 | printf " ${YELLOW}%-$(TARGET_MAX_CHAR_NUM)s${RESET} ${GREEN}%s${RESET}\n", helpCommand, helpMessage; \ 25 | } \ 26 | } \ 27 | { lastLine = $$0 }' $(MAKEFILE_LIST) 28 | 29 | .PHONY: build 30 | ## Builds the Flink base image with pyFlink and connectors installed 31 | build: 32 | docker build . 
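# Typical workflow with these targets: `make up` to start the cluster, `make job` to submit the PyFlink job, `make down` to stop everything.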
33 | 34 | .PHONY: up 35 | ## Builds the base Docker image and starts Flink cluster 36 | up: 37 | docker compose up --build --remove-orphans -d 38 | 39 | .PHONY: down 40 | ## Shuts down the Flink cluster 41 | down: 42 | docker compose down --remove-orphans 43 | 44 | .PHONY: job 45 | ## Submit the Flink job 46 | job: 47 | docker compose exec jobmanager ./bin/flink run -py /opt/src/job/start_job.py --pyFiles /opt/src -d 48 | 49 | aggregation_job: 50 | docker compose exec jobmanager ./bin/flink run -py /opt/src/job/aggregation_job.py --pyFiles /opt/src -d 51 | 52 | .PHONY: stop 53 | ## Stops all services in Docker compose 54 | stop: 55 | docker compose stop 56 | 57 | .PHONY: start 58 | ## Starts all services in Docker compose 59 | start: 60 | docker compose start 61 | -------------------------------------------------------------------------------- /06-streaming/pyflink/requirements.txt: -------------------------------------------------------------------------------- 1 | apache-flink==1.16.0 2 | psycopg2-binary==2.9.1 3 | requests 4 | kafka-python -------------------------------------------------------------------------------- /06-streaming/pyflink/src/job/start_job.py: -------------------------------------------------------------------------------- 1 | from pyflink.datastream import StreamExecutionEnvironment 2 | from pyflink.table import EnvironmentSettings, DataTypes, TableEnvironment, StreamTableEnvironment 3 | 4 | 5 | def create_processed_events_sink_postgres(t_env): 6 | table_name = 'processed_events' 7 | sink_ddl = f""" 8 | CREATE TABLE {table_name} ( 9 | test_data INTEGER, 10 | event_timestamp TIMESTAMP 11 | ) WITH ( 12 | 'connector' = 'jdbc', 13 | 'url' = 'jdbc:postgresql://postgres:5432/postgres', 14 | 'table-name' = '{table_name}', 15 | 'username' = 'postgres', 16 | 'password' = 'postgres', 17 | 'driver' = 'org.postgresql.Driver' 18 | ); 19 | """ 20 | t_env.execute_sql(sink_ddl) 21 | return table_name 22 | 23 | 24 | def create_events_source_kafka(t_env): 25 | table_name = "events" 26 | pattern = "yyyy-MM-dd HH:mm:ss.SSS" 27 | source_ddl = f""" 28 | CREATE TABLE {table_name} ( 29 | test_data INTEGER, 30 | event_timestamp BIGINT, 31 | event_watermark AS TO_TIMESTAMP_LTZ(event_timestamp, 3), 32 | WATERMARK for event_watermark as event_watermark - INTERVAL '5' SECOND 33 | ) WITH ( 34 | 'connector' = 'kafka', 35 | 'properties.bootstrap.servers' = 'redpanda-1:29092', 36 | 'topic' = 'test-topic', 37 | 'scan.startup.mode' = 'latest-offset', 38 | 'properties.auto.offset.reset' = 'latest', 39 | 'format' = 'json' 40 | ); 41 | """ 42 | t_env.execute_sql(source_ddl) 43 | return table_name 44 | 45 | def log_processing(): 46 | # Set up the execution environment 47 | env = StreamExecutionEnvironment.get_execution_environment() 48 | env.enable_checkpointing(10 * 1000) 49 | # env.set_parallelism(1) 50 | 51 | # Set up the table environment 52 | settings = EnvironmentSettings.new_instance().in_streaming_mode().build() 53 | t_env = StreamTableEnvironment.create(env, environment_settings=settings) 54 | try: 55 | # Create Kafka table 56 | source_table = create_events_source_kafka(t_env) 57 | postgres_sink = create_processed_events_sink_postgres(t_env) 58 | # write records to postgres too! 
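# the INSERT below reads from the Kafka-backed `events` table and converts the epoch-millis BIGINT into a TIMESTAMP for the JDBC sink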
59 | t_env.execute_sql( 60 | f""" 61 | INSERT INTO {postgres_sink} 62 | SELECT 63 | test_data, 64 | TO_TIMESTAMP_LTZ(event_timestamp, 3) as event_timestamp 65 | FROM {source_table} 66 | """ 67 | ).wait() 68 | 69 | except Exception as e: 70 | print("Writing records from Kafka to JDBC failed:", str(e)) 71 | 72 | 73 | if __name__ == '__main__': 74 | log_processing() 75 | -------------------------------------------------------------------------------- /06-streaming/pyflink/src/producers/load_taxi_data.py: -------------------------------------------------------------------------------- 1 | import csv 2 | import json 3 | from kafka import KafkaProducer 4 | 5 | def main(): 6 | # Create a Kafka producer 7 | producer = KafkaProducer( 8 | bootstrap_servers='localhost:9092', 9 | value_serializer=lambda v: json.dumps(v).encode('utf-8') 10 | ) 11 | 12 | csv_file = 'data/green_tripdata_2019-10.csv' # change to your CSV file path if needed 13 | 14 | with open(csv_file, 'r', newline='', encoding='utf-8') as file: 15 | reader = csv.DictReader(file) 16 | 17 | for row in reader: 18 | # Each row will be a dictionary keyed by the CSV headers 19 | # Send data to Kafka topic "green-data" 20 | producer.send('green-data', value=row) 21 | 22 | # Make sure any remaining messages are delivered 23 | producer.flush() 24 | producer.close() 25 | 26 | 27 | if __name__ == "__main__": 28 | main() -------------------------------------------------------------------------------- /06-streaming/pyflink/src/producers/producer.py: -------------------------------------------------------------------------------- 1 | import json 2 | import time 3 | from kafka import KafkaProducer 4 | 5 | def json_serializer(data): 6 | return json.dumps(data).encode('utf-8') 7 | 8 | server = 'localhost:9092' 9 | 10 | producer = KafkaProducer( 11 | bootstrap_servers=[server], 12 | value_serializer=json_serializer 13 | ) 14 | t0 = time.time() 15 | 16 | topic_name = 'test-topic' 17 | 18 | for i in range(10, 1000): 19 | message = {'test_data': i, 'event_timestamp': time.time() * 1000} 20 | producer.send(topic_name, value=message) 21 | print(f"Sent: {message}") 22 | time.sleep(0.05) 23 | 24 | producer.flush() 25 | 26 | t1 = time.time() 27 | print(f'took {(t1 - t0):.2f} seconds') -------------------------------------------------------------------------------- /06-streaming/python/README.md: -------------------------------------------------------------------------------- 1 | ### Stream-Processing with Python 2 | 3 | In this document, you will find information about stream processing 4 | using different Python libraries (`kafka-python`,`confluent-kafka`,`pyspark`, `faust`). 5 | 6 | This Python module can be separated into the following submodules. 7 | 8 | #### 1. Docker 9 | The Docker module includes Dockerfiles and docker-compose definitions 10 | to run Kafka and Spark in Docker containers. Setting up the required services is 11 | the prerequisite step for running the following modules. 12 | 13 | #### 2.
Kafka Producer - Consumer Examples 14 | - [Json Producer-Consumer Example](json_example) using the `kafka-python` library 15 | - [Avro Producer-Consumer Example](avro_example) using the `confluent-kafka` library 16 | 17 | Both of these examples require up-and-running Kafka services, so please first complete the 18 | steps in the [docker-README](docker/README.md) 19 | 20 | To run the producer-consumer examples, run the following commands in the respective example folder: 21 | ```bash 22 | # Start producer script 23 | python3 producer.py 24 | # Start consumer script 25 | python3 consumer.py 26 | ``` 27 | 28 | 29 | 30 | 31 | 32 | -------------------------------------------------------------------------------- /06-streaming/python/avro_example/ride_record.py: -------------------------------------------------------------------------------- 1 | from typing import List, Dict 2 | 3 | 4 | class RideRecord: 5 | 6 | def __init__(self, arr: List[str]): 7 | self.vendor_id = int(arr[0]) 8 | self.passenger_count = int(arr[1]) 9 | self.trip_distance = float(arr[2]) 10 | self.payment_type = int(arr[3]) 11 | self.total_amount = float(arr[4]) 12 | 13 | @classmethod 14 | def from_dict(cls, d: Dict): 15 | return cls(arr=[ 16 | d['vendor_id'], 17 | d['passenger_count'], 18 | d['trip_distance'], 19 | d['payment_type'], 20 | d['total_amount'] 21 | ] 22 | ) 23 | 24 | def __repr__(self): 25 | return f'{self.__class__.__name__}: {self.__dict__}' 26 | 27 | 28 | def dict_to_ride_record(obj, ctx): 29 | if obj is None: 30 | return None 31 | 32 | return RideRecord.from_dict(obj) 33 | 34 | 35 | def ride_record_to_dict(ride_record: RideRecord, ctx): 36 | return ride_record.__dict__ 37 | -------------------------------------------------------------------------------- /06-streaming/python/avro_example/ride_record_key.py: -------------------------------------------------------------------------------- 1 | from typing import Dict 2 | 3 | 4 | class RideRecordKey: 5 | def __init__(self, vendor_id): 6 | self.vendor_id = vendor_id 7 | 8 | @classmethod 9 | def from_dict(cls, d: Dict): 10 | return cls(vendor_id=d['vendor_id']) 11 | 12 | def __repr__(self): 13 | return f'{self.__class__.__name__}: {self.__dict__}' 14 | 15 | 16 | def dict_to_ride_record_key(obj, ctx): 17 | if obj is None: 18 | return None 19 | 20 | return RideRecordKey.from_dict(obj) 21 | 22 | 23 | def ride_record_key_to_dict(ride_record_key: RideRecordKey, ctx): 24 | return ride_record_key.__dict__ 25 | -------------------------------------------------------------------------------- /06-streaming/python/avro_example/settings.py: -------------------------------------------------------------------------------- 1 | INPUT_DATA_PATH = '../resources/rides.csv' 2 | 3 | RIDE_KEY_SCHEMA_PATH = '../resources/schemas/taxi_ride_key.avsc' 4 | RIDE_VALUE_SCHEMA_PATH = '../resources/schemas/taxi_ride_value.avsc' 5 | 6 | SCHEMA_REGISTRY_URL = 'http://localhost:8081' 7 | BOOTSTRAP_SERVERS = 'localhost:9092' 8 | KAFKA_TOPIC = 'rides_avro' 9 | -------------------------------------------------------------------------------- /06-streaming/python/docker/README.md: -------------------------------------------------------------------------------- 1 | 2 | # Running Spark and Kafka Clusters on Docker 3 | 4 | ### 1.
Build Required Images for running Spark 5 | 6 | The details of how the Spark images are built in different layers can be read in 7 | the blog post written by André Perez on [Medium blog - Towards Data Science](https://towardsdatascience.com/apache-spark-cluster-on-docker-ft-a-juyterlab-interface-418383c95445) 8 | 9 | ```bash 10 | # Build Spark Images 11 | ./build.sh 12 | ``` 13 | 14 | ### 2. Create Docker Network & Volume 15 | 16 | ```bash 17 | # Create Network 18 | docker network create kafka-spark-network 19 | 20 | # Create Volume 21 | docker volume create --name=hadoop-distributed-file-system 22 | ``` 23 | 24 | ### 3. Run Services on Docker 25 | ```bash 26 | # Start Docker-Compose (within the kafka and spark folders) 27 | docker compose up -d 28 | ``` 29 | In-depth explanation of [Kafka Listeners](https://www.confluent.io/blog/kafka-listeners-explained/) 30 | 31 | 32 | 33 | ### 4. Stop Services on Docker 34 | ```bash 35 | # Stop Docker-Compose (within the kafka and spark folders) 36 | docker compose down 37 | ``` 38 | 39 | ### 5. Helpful Commands 40 | ```bash 41 | # Delete all Containers 42 | docker rm -f $(docker ps -a -q) 43 | 44 | # Delete all volumes 45 | docker volume rm $(docker volume ls -q) 46 | ``` 47 | 48 | -------------------------------------------------------------------------------- /06-streaming/python/docker/spark/build.sh: -------------------------------------------------------------------------------- 1 | # -- Software Stack Version 2 | 3 | SPARK_VERSION="3.3.1" 4 | HADOOP_VERSION="3" 5 | JUPYTERLAB_VERSION="3.6.1" 6 | 7 | # -- Building the Images 8 | 9 | docker build \ 10 | -f cluster-base.Dockerfile \ 11 | -t cluster-base . 12 | 13 | docker build \ 14 | --build-arg spark_version="${SPARK_VERSION}" \ 15 | --build-arg hadoop_version="${HADOOP_VERSION}" \ 16 | -f spark-base.Dockerfile \ 17 | -t spark-base . 18 | 19 | docker build \ 20 | -f spark-master.Dockerfile \ 21 | -t spark-master . 22 | 23 | docker build \ 24 | -f spark-worker.Dockerfile \ 25 | -t spark-worker . 26 | 27 | docker build \ 28 | --build-arg spark_version="${SPARK_VERSION}" \ 29 | --build-arg jupyterlab_version="${JUPYTERLAB_VERSION}" \ 30 | -f jupyterlab.Dockerfile \ 31 | -t jupyterlab .
32 | -------------------------------------------------------------------------------- /06-streaming/python/docker/spark/cluster-base.Dockerfile: -------------------------------------------------------------------------------- 1 | # Reference from offical Apache Spark repository Dockerfile for Kubernetes 2 | # https://github.com/apache/spark/blob/master/resource-managers/kubernetes/docker/src/main/dockerfiles/spark/Dockerfile 3 | ARG java_image_tag=17-jre 4 | FROM eclipse-temurin:${java_image_tag} 5 | 6 | # -- Layer: OS + Python 7 | 8 | ARG shared_workspace=/opt/workspace 9 | 10 | RUN mkdir -p ${shared_workspace} && \ 11 | apt-get update -y && \ 12 | apt-get install -y python3 && \ 13 | ln -s /usr/bin/python3 /usr/bin/python && \ 14 | rm -rf /var/lib/apt/lists/* 15 | 16 | ENV SHARED_WORKSPACE=${shared_workspace} 17 | 18 | # -- Runtime 19 | 20 | VOLUME ${shared_workspace} 21 | CMD ["bash"] -------------------------------------------------------------------------------- /06-streaming/python/docker/spark/docker-compose.yml: -------------------------------------------------------------------------------- 1 | version: "3.6" 2 | volumes: 3 | shared-workspace: 4 | name: "hadoop-distributed-file-system" 5 | driver: local 6 | networks: 7 | default: 8 | name: kafka-spark-network 9 | external: true 10 | 11 | services: 12 | jupyterlab: 13 | image: jupyterlab 14 | container_name: jupyterlab 15 | ports: 16 | - 8888:8888 17 | volumes: 18 | - shared-workspace:/opt/workspace 19 | spark-master: 20 | image: spark-master 21 | container_name: spark-master 22 | environment: 23 | SPARK_LOCAL_IP: 'spark-master' 24 | ports: 25 | - 8080:8080 26 | - 7077:7077 27 | volumes: 28 | - shared-workspace:/opt/workspace 29 | spark-worker-1: 30 | image: spark-worker 31 | container_name: spark-worker-1 32 | environment: 33 | - SPARK_WORKER_CORES=1 34 | - SPARK_WORKER_MEMORY=4g 35 | ports: 36 | - 8083:8081 37 | volumes: 38 | - shared-workspace:/opt/workspace 39 | depends_on: 40 | - spark-master 41 | spark-worker-2: 42 | image: spark-worker 43 | container_name: spark-worker-2 44 | environment: 45 | - SPARK_WORKER_CORES=1 46 | - SPARK_WORKER_MEMORY=4g 47 | ports: 48 | - 8084:8081 49 | volumes: 50 | - shared-workspace:/opt/workspace 51 | depends_on: 52 | - spark-master 53 | -------------------------------------------------------------------------------- /06-streaming/python/docker/spark/jupyterlab.Dockerfile: -------------------------------------------------------------------------------- 1 | FROM cluster-base 2 | 3 | # -- Layer: JupyterLab 4 | 5 | ARG spark_version=3.3.1 6 | ARG jupyterlab_version=3.6.1 7 | 8 | RUN apt-get update -y && \ 9 | apt-get install -y python3-pip && \ 10 | pip3 install wget pyspark==${spark_version} jupyterlab==${jupyterlab_version} 11 | 12 | # -- Runtime 13 | 14 | EXPOSE 8888 15 | WORKDIR ${SHARED_WORKSPACE} 16 | CMD jupyter lab --ip=0.0.0.0 --port=8888 --no-browser --allow-root --NotebookApp.token= 17 | -------------------------------------------------------------------------------- /06-streaming/python/docker/spark/spark-base.Dockerfile: -------------------------------------------------------------------------------- 1 | FROM cluster-base 2 | 3 | # -- Layer: Apache Spark 4 | 5 | ARG spark_version=3.3.1 6 | ARG hadoop_version=3 7 | 8 | RUN apt-get update -y && \ 9 | apt-get install -y curl && \ 10 | curl https://archive.apache.org/dist/spark/spark-${spark_version}/spark-${spark_version}-bin-hadoop${hadoop_version}.tgz -o spark.tgz && \ 11 | tar -xf spark.tgz && \ 12 | mv 
spark-${spark_version}-bin-hadoop${hadoop_version} /usr/bin/ && \ 13 | mkdir /usr/bin/spark-${spark_version}-bin-hadoop${hadoop_version}/logs && \ 14 | rm spark.tgz 15 | 16 | ENV SPARK_HOME /usr/bin/spark-${spark_version}-bin-hadoop${hadoop_version} 17 | ENV SPARK_MASTER_HOST spark-master 18 | ENV SPARK_MASTER_PORT 7077 19 | ENV PYSPARK_PYTHON python3 20 | 21 | # -- Runtime 22 | 23 | WORKDIR ${SPARK_HOME} -------------------------------------------------------------------------------- /06-streaming/python/docker/spark/spark-master.Dockerfile: -------------------------------------------------------------------------------- 1 | FROM spark-base 2 | 3 | # -- Runtime 4 | 5 | ARG spark_master_web_ui=8080 6 | 7 | EXPOSE ${spark_master_web_ui} ${SPARK_MASTER_PORT} 8 | CMD bin/spark-class org.apache.spark.deploy.master.Master >> logs/spark-master.out -------------------------------------------------------------------------------- /06-streaming/python/docker/spark/spark-worker.Dockerfile: -------------------------------------------------------------------------------- 1 | FROM spark-base 2 | 3 | # -- Runtime 4 | 5 | ARG spark_worker_web_ui=8081 6 | 7 | EXPOSE ${spark_worker_web_ui} 8 | CMD bin/spark-class org.apache.spark.deploy.worker.Worker spark://${SPARK_MASTER_HOST}:${SPARK_MASTER_PORT} >> logs/spark-worker.out 9 | -------------------------------------------------------------------------------- /06-streaming/python/json_example/consumer.py: -------------------------------------------------------------------------------- 1 | from typing import Dict, List 2 | from json import loads 3 | from kafka import KafkaConsumer 4 | 5 | from ride import Ride 6 | from settings import BOOTSTRAP_SERVERS, KAFKA_TOPIC 7 | 8 | 9 | class JsonConsumer: 10 | def __init__(self, props: Dict): 11 | self.consumer = KafkaConsumer(**props) 12 | 13 | def consume_from_kafka(self, topics: List[str]): 14 | self.consumer.subscribe(topics) 15 | print('Consuming from Kafka started') 16 | print('Available topics to consume: ', self.consumer.subscription()) 17 | while True: 18 | try: 19 | # SIGINT can't be handled when polling, limit timeout to 1 second. 
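# (note) kafka-python's poll() returns a dict of {TopicPartition: [ConsumerRecord, ...]}, which is why the nested loops below iterate over partitions first and then over their records.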
20 | message = self.consumer.poll(1.0) 21 | if message is None or message == {}: 22 | continue 23 | for message_key, message_value in message.items(): 24 | for msg_val in message_value: 25 | print(msg_val.key, msg_val.value) 26 | except KeyboardInterrupt: 27 | break 28 | 29 | self.consumer.close() 30 | 31 | 32 | if __name__ == '__main__': 33 | config = { 34 | 'bootstrap_servers': BOOTSTRAP_SERVERS, 35 | 'auto_offset_reset': 'earliest', 36 | 'enable_auto_commit': True, 37 | 'key_deserializer': lambda key: int(key.decode('utf-8')), 38 | 'value_deserializer': lambda x: loads(x.decode('utf-8'), object_hook=lambda d: Ride.from_dict(d)), 39 | 'group_id': 'consumer.group.id.json-example.1', 40 | } 41 | 42 | json_consumer = JsonConsumer(props=config) 43 | json_consumer.consume_from_kafka(topics=[KAFKA_TOPIC]) 44 | -------------------------------------------------------------------------------- /06-streaming/python/json_example/producer.py: -------------------------------------------------------------------------------- 1 | import csv 2 | import json 3 | from typing import List, Dict 4 | from kafka import KafkaProducer 5 | from kafka.errors import KafkaTimeoutError 6 | 7 | from ride import Ride 8 | from settings import BOOTSTRAP_SERVERS, INPUT_DATA_PATH, KAFKA_TOPIC 9 | 10 | 11 | class JsonProducer(KafkaProducer): 12 | def __init__(self, props: Dict): 13 | self.producer = KafkaProducer(**props) 14 | 15 | @staticmethod 16 | def read_records(resource_path: str): 17 | records = [] 18 | with open(resource_path, 'r') as f: 19 | reader = csv.reader(f) 20 | header = next(reader) # skip the header row 21 | for row in reader: 22 | records.append(Ride(arr=row)) 23 | return records 24 | 25 | def publish_rides(self, topic: str, messages: List[Ride]): 26 | for ride in messages: 27 | try: 28 | record = self.producer.send(topic=topic, key=ride.pu_location_id, value=ride) 29 | print('Record {} successfully produced at offset {}'.format(ride.pu_location_id, record.get().offset)) 30 | except KafkaTimeoutError as e: 31 | print(e.__str__()) 32 | 33 | 34 | if __name__ == '__main__': 35 | # Config Should match with the KafkaProducer expectation 36 | config = { 37 | 'bootstrap_servers': BOOTSTRAP_SERVERS, 38 | 'key_serializer': lambda key: str(key).encode(), 39 | 'value_serializer': lambda x: json.dumps(x.__dict__, default=str).encode('utf-8') 40 | } 41 | producer = JsonProducer(props=config) 42 | rides = producer.read_records(resource_path=INPUT_DATA_PATH) 43 | producer.publish_rides(topic=KAFKA_TOPIC, messages=rides) 44 | -------------------------------------------------------------------------------- /06-streaming/python/json_example/ride.py: -------------------------------------------------------------------------------- 1 | from typing import List, Dict 2 | from decimal import Decimal 3 | from datetime import datetime 4 | 5 | 6 | class Ride: 7 | def __init__(self, arr: List[str]): 8 | self.vendor_id = arr[0] 9 | self.tpep_pickup_datetime = datetime.strptime(arr[1], "%Y-%m-%d %H:%M:%S"), 10 | self.tpep_dropoff_datetime = datetime.strptime(arr[2], "%Y-%m-%d %H:%M:%S"), 11 | self.passenger_count = int(arr[3]) 12 | self.trip_distance = Decimal(arr[4]) 13 | self.rate_code_id = int(arr[5]) 14 | self.store_and_fwd_flag = arr[6] 15 | self.pu_location_id = int(arr[7]) 16 | self.do_location_id = int(arr[8]) 17 | self.payment_type = arr[9] 18 | self.fare_amount = Decimal(arr[10]) 19 | self.extra = Decimal(arr[11]) 20 | self.mta_tax = Decimal(arr[12]) 21 | self.tip_amount = Decimal(arr[13]) 22 | self.tolls_amount = 
Decimal(arr[14]) 23 | self.improvement_surcharge = Decimal(arr[15]) 24 | self.total_amount = Decimal(arr[16]) 25 | self.congestion_surcharge = Decimal(arr[17]) 26 | 27 | @classmethod 28 | def from_dict(cls, d: Dict): 29 | return cls(arr=[ 30 | d['vendor_id'], 31 | d['tpep_pickup_datetime'][0], 32 | d['tpep_dropoff_datetime'][0], 33 | d['passenger_count'], 34 | d['trip_distance'], 35 | d['rate_code_id'], 36 | d['store_and_fwd_flag'], 37 | d['pu_location_id'], 38 | d['do_location_id'], 39 | d['payment_type'], 40 | d['fare_amount'], 41 | d['extra'], 42 | d['mta_tax'], 43 | d['tip_amount'], 44 | d['tolls_amount'], 45 | d['improvement_surcharge'], 46 | d['total_amount'], 47 | d['congestion_surcharge'], 48 | ] 49 | ) 50 | 51 | def __repr__(self): 52 | return f'{self.__class__.__name__}: {self.__dict__}' 53 | -------------------------------------------------------------------------------- /06-streaming/python/json_example/settings.py: -------------------------------------------------------------------------------- 1 | INPUT_DATA_PATH = '../resources/rides.csv' 2 | 3 | BOOTSTRAP_SERVERS = ['localhost:9092'] 4 | KAFKA_TOPIC = 'rides_json' 5 | -------------------------------------------------------------------------------- /06-streaming/python/redpanda_example/consumer.py: -------------------------------------------------------------------------------- 1 | import os 2 | from typing import Dict, List 3 | from json import loads 4 | from kafka import KafkaConsumer 5 | 6 | from ride import Ride 7 | from settings import BOOTSTRAP_SERVERS, KAFKA_TOPIC 8 | 9 | 10 | class JsonConsumer: 11 | def __init__(self, props: Dict): 12 | self.consumer = KafkaConsumer(**props) 13 | 14 | def consume_from_kafka(self, topics: List[str]): 15 | self.consumer.subscribe(topics) 16 | print('Consuming from Kafka started') 17 | print('Available topics to consume: ', self.consumer.subscription()) 18 | while True: 19 | try: 20 | # SIGINT can't be handled when polling, limit timeout to 1 second. 21 | message = self.consumer.poll(1.0) 22 | if message is None or message == {}: 23 | continue 24 | for message_key, message_value in message.items(): 25 | for msg_val in message_value: 26 | print(msg_val.key, msg_val.value) 27 | except KeyboardInterrupt: 28 | break 29 | 30 | self.consumer.close() 31 | 32 | 33 | if __name__ == '__main__': 34 | config = { 35 | 'bootstrap_servers': BOOTSTRAP_SERVERS, 36 | 'auto_offset_reset': 'earliest', 37 | 'enable_auto_commit': True, 38 | 'key_deserializer': lambda key: int(key.decode('utf-8')), 39 | 'value_deserializer': lambda x: loads(x.decode('utf-8'), object_hook=lambda d: Ride.from_dict(d)), 40 | 'group_id': 'consumer.group.id.json-example.1', 41 | } 42 | 43 | json_consumer = JsonConsumer(props=config) 44 | json_consumer.consume_from_kafka(topics=[KAFKA_TOPIC]) 45 | 46 | 47 | # There's no schema in JSON, so if the schema changes (a column is removed, a new one is added, or a data type changes), the Ride class would still work and producing and consuming messages would still run without a hitch. 48 | # But the issue surfaces in the downstream analytics: the dataset would no longer have that column, the dashboards built on it would fail, and the trust in our data and processes would erode.
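# A minimal illustration of this risk (hypothetical payloads, not part of the original example):
#
#   import json
#   old_msg = json.loads('{"vendor_id": 1, "total_amount": 9.5}')
#   new_msg = json.loads('{"vendor_id": 1}')            # a column silently dropped by the producer
#   print(old_msg.get("total_amount"))                  # 9.5
#   print(new_msg.get("total_amount"))                  # None -> downstream aggregations quietly break
#
# A format with a schema registry and compatibility checks (see the Avro example in avro_example/)
# can catch such breaking changes when the schema is registered, instead of letting them surface
# in the analytics layer.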
-------------------------------------------------------------------------------- /06-streaming/python/redpanda_example/docker-compose.yaml: -------------------------------------------------------------------------------- 1 | version: '3.7' 2 | services: 3 | # Redpanda cluster 4 | redpanda-1: 5 | image: docker.redpanda.com/redpandadata/redpanda:v23.2.26 6 | container_name: redpanda-1 7 | command: 8 | - redpanda 9 | - start 10 | - --smp 11 | - '1' 12 | - --reserve-memory 13 | - 0M 14 | - --overprovisioned 15 | - --node-id 16 | - '1' 17 | - --kafka-addr 18 | - PLAINTEXT://0.0.0.0:29092,OUTSIDE://0.0.0.0:9092 19 | - --advertise-kafka-addr 20 | - PLAINTEXT://redpanda-1:29092,OUTSIDE://localhost:9092 21 | - --pandaproxy-addr 22 | - PLAINTEXT://0.0.0.0:28082,OUTSIDE://0.0.0.0:8082 23 | - --advertise-pandaproxy-addr 24 | - PLAINTEXT://redpanda-1:28082,OUTSIDE://localhost:8082 25 | - --rpc-addr 26 | - 0.0.0.0:33145 27 | - --advertise-rpc-addr 28 | - redpanda-1:33145 29 | ports: 30 | # - 8081:8081 31 | - 8082:8082 32 | - 9092:9092 33 | - 9644:9644 34 | - 28082:28082 35 | - 29092:29092 36 | 37 | # Want a two node Redpanda cluster? Uncomment this block :) 38 | # redpanda-2: 39 | # image: docker.redpanda.com/redpandadata/redpanda:v23.1.1 40 | # container_name: redpanda-2 41 | # command: 42 | # - redpanda 43 | # - start 44 | # - --smp 45 | # - '1' 46 | # - --reserve-memory 47 | # - 0M 48 | # - --overprovisioned 49 | # - --node-id 50 | # - '2' 51 | # - --seeds 52 | # - redpanda-1:33145 53 | # - --kafka-addr 54 | # - PLAINTEXT://0.0.0.0:29093,OUTSIDE://0.0.0.0:9093 55 | # - --advertise-kafka-addr 56 | # - PLAINTEXT://redpanda-2:29093,OUTSIDE://localhost:9093 57 | # - --pandaproxy-addr 58 | # - PLAINTEXT://0.0.0.0:28083,OUTSIDE://0.0.0.0:8083 59 | # - --advertise-pandaproxy-addr 60 | # - PLAINTEXT://redpanda-2:28083,OUTSIDE://localhost:8083 61 | # - --rpc-addr 62 | # - 0.0.0.0:33146 63 | # - --advertise-rpc-addr 64 | # - redpanda-2:33146 65 | # ports: 66 | # - 8083:8083 67 | # - 9093:9093 68 | 69 | redpanda-console: 70 | image: docker.redpanda.com/redpandadata/console:v2.2.2 71 | container_name: redpanda-console 72 | entrypoint: /bin/sh 73 | command: -c "echo \"$$CONSOLE_CONFIG_FILE\" > /tmp/config.yml; /app/console" 74 | environment: 75 | CONFIG_FILEPATH: /tmp/config.yml 76 | CONSOLE_CONFIG_FILE: | 77 | kafka: 78 | brokers: ["redpanda-1:29092"] 79 | schemaRegistry: 80 | enabled: false 81 | redpanda: 82 | adminApi: 83 | enabled: true 84 | urls: ["http://redpanda-1:9644"] 85 | connect: 86 | enabled: false 87 | ports: 88 | - 8080:8080 89 | depends_on: 90 | - redpanda-1 91 | -------------------------------------------------------------------------------- /06-streaming/python/redpanda_example/producer.py: -------------------------------------------------------------------------------- 1 | import csv 2 | import json 3 | from typing import List, Dict 4 | from kafka import KafkaProducer 5 | from kafka.errors import KafkaTimeoutError 6 | 7 | from ride import Ride 8 | from settings import BOOTSTRAP_SERVERS, INPUT_DATA_PATH, KAFKA_TOPIC 9 | 10 | 11 | class JsonProducer(KafkaProducer): 12 | def __init__(self, props: Dict): 13 | self.producer = KafkaProducer(**props) 14 | 15 | @staticmethod 16 | def read_records(resource_path: str): 17 | records = [] 18 | with open(resource_path, 'r') as f: 19 | reader = csv.reader(f) 20 | header = next(reader) # skip the header row 21 | for row in reader: 22 | records.append(Ride(arr=row)) 23 | return records 24 | 25 | def publish_rides(self, topic: str, messages: List[Ride]): 26 | 
for ride in messages: 27 | try: 28 | record = self.producer.send(topic=topic, key=ride.pu_location_id, value=ride) 29 | print('Record {} successfully produced at offset {}'.format(ride.pu_location_id, record.get().offset)) 30 | except KafkaTimeoutError as e: 31 | print(e.__str__()) 32 | 33 | 34 | if __name__ == '__main__': 35 | # Config Should match with the KafkaProducer expectation 36 | # kafka expects binary format for the key-value pair 37 | config = { 38 | 'bootstrap_servers': BOOTSTRAP_SERVERS, 39 | 'key_serializer': lambda key: str(key).encode(), 40 | 'value_serializer': lambda x: json.dumps(x.__dict__, default=str).encode('utf-8') 41 | } 42 | producer = JsonProducer(props=config) 43 | rides = producer.read_records(resource_path=INPUT_DATA_PATH) 44 | producer.publish_rides(topic=KAFKA_TOPIC, messages=rides) 45 | -------------------------------------------------------------------------------- /06-streaming/python/redpanda_example/ride.py: -------------------------------------------------------------------------------- 1 | from typing import List, Dict 2 | from decimal import Decimal 3 | from datetime import datetime 4 | 5 | 6 | class Ride: 7 | def __init__(self, arr: List[str]): 8 | self.vendor_id = arr[0] 9 | self.tpep_pickup_datetime = datetime.strptime(arr[1], "%Y-%m-%d %H:%M:%S"), 10 | self.tpep_dropoff_datetime = datetime.strptime(arr[2], "%Y-%m-%d %H:%M:%S"), 11 | self.passenger_count = int(arr[3]) 12 | self.trip_distance = Decimal(arr[4]) 13 | self.rate_code_id = int(arr[5]) 14 | self.store_and_fwd_flag = arr[6] 15 | self.pu_location_id = int(arr[7]) 16 | self.do_location_id = int(arr[8]) 17 | self.payment_type = arr[9] 18 | self.fare_amount = Decimal(arr[10]) 19 | self.extra = Decimal(arr[11]) 20 | self.mta_tax = Decimal(arr[12]) 21 | self.tip_amount = Decimal(arr[13]) 22 | self.tolls_amount = Decimal(arr[14]) 23 | self.improvement_surcharge = Decimal(arr[15]) 24 | self.total_amount = Decimal(arr[16]) 25 | self.congestion_surcharge = Decimal(arr[17]) 26 | 27 | @classmethod 28 | def from_dict(cls, d: Dict): 29 | return cls(arr=[ 30 | d['vendor_id'], 31 | d['tpep_pickup_datetime'][0], 32 | d['tpep_dropoff_datetime'][0], 33 | d['passenger_count'], 34 | d['trip_distance'], 35 | d['rate_code_id'], 36 | d['store_and_fwd_flag'], 37 | d['pu_location_id'], 38 | d['do_location_id'], 39 | d['payment_type'], 40 | d['fare_amount'], 41 | d['extra'], 42 | d['mta_tax'], 43 | d['tip_amount'], 44 | d['tolls_amount'], 45 | d['improvement_surcharge'], 46 | d['total_amount'], 47 | d['congestion_surcharge'], 48 | ] 49 | ) 50 | 51 | def __repr__(self): 52 | return f'{self.__class__.__name__}: {self.__dict__}' 53 | -------------------------------------------------------------------------------- /06-streaming/python/redpanda_example/settings.py: -------------------------------------------------------------------------------- 1 | INPUT_DATA_PATH = '../resources/rides.csv' 2 | 3 | BOOTSTRAP_SERVERS = ['localhost:9092'] 4 | KAFKA_TOPIC = 'rides_json' 5 | -------------------------------------------------------------------------------- /06-streaming/python/requirements.txt: -------------------------------------------------------------------------------- 1 | kafka-python==1.4.6 2 | confluent_kafka 3 | requests 4 | avro 5 | faust 6 | fastavro 7 | -------------------------------------------------------------------------------- /06-streaming/python/resources/schemas/taxi_ride_key.avsc: -------------------------------------------------------------------------------- 1 | { 2 | "namespace": 
"com.datatalksclub.taxi", 3 | "type": "record", 4 | "name": "RideRecordKey", 5 | "fields": [ 6 | { 7 | "name": "vendor_id", 8 | "type": "int" 9 | } 10 | ] 11 | } -------------------------------------------------------------------------------- /06-streaming/python/resources/schemas/taxi_ride_value.avsc: -------------------------------------------------------------------------------- 1 | { 2 | "namespace": "com.datatalksclub.taxi", 3 | "type": "record", 4 | "name": "RideRecord", 5 | "fields": [ 6 | { 7 | "name": "vendor_id", 8 | "type": "int" 9 | }, 10 | { 11 | "name": "passenger_count", 12 | "type": "int" 13 | }, 14 | { 15 | "name": "trip_distance", 16 | "type": "float" 17 | }, 18 | { 19 | "name": "payment_type", 20 | "type": "int" 21 | }, 22 | { 23 | "name": "total_amount", 24 | "type": "float" 25 | } 26 | ] 27 | } -------------------------------------------------------------------------------- /06-streaming/python/streams-example/faust/branch_price.py: -------------------------------------------------------------------------------- 1 | import faust 2 | from taxi_rides import TaxiRide 3 | from faust import current_event 4 | 5 | app = faust.App('datatalksclub.stream.v3', broker='kafka://localhost:9092', consumer_auto_offset_reset="earliest") 6 | topic = app.topic('datatalkclub.yellow_taxi_ride.json', value_type=TaxiRide) 7 | 8 | high_amount_rides = app.topic('datatalks.yellow_taxi_rides.high_amount') 9 | low_amount_rides = app.topic('datatalks.yellow_taxi_rides.low_amount') 10 | 11 | 12 | @app.agent(topic) 13 | async def process(stream): 14 | async for event in stream: 15 | if event.total_amount >= 40.0: 16 | await current_event().forward(high_amount_rides) 17 | else: 18 | await current_event().forward(low_amount_rides) 19 | 20 | if __name__ == '__main__': 21 | app.main() 22 | -------------------------------------------------------------------------------- /06-streaming/python/streams-example/faust/producer_taxi_json.py: -------------------------------------------------------------------------------- 1 | import csv 2 | from json import dumps 3 | from kafka import KafkaProducer 4 | from time import sleep 5 | 6 | 7 | producer = KafkaProducer(bootstrap_servers=['localhost:9092'], 8 | key_serializer=lambda x: dumps(x).encode('utf-8'), 9 | value_serializer=lambda x: dumps(x).encode('utf-8')) 10 | 11 | file = open('../../resources/rides.csv') 12 | 13 | csvreader = csv.reader(file) 14 | header = next(csvreader) 15 | for row in csvreader: 16 | key = {"vendorId": int(row[0])} 17 | value = {"vendorId": int(row[0]), "passenger_count": int(row[3]), "trip_distance": float(row[4]), "payment_type": int(row[9]), "total_amount": float(row[16])} 18 | producer.send('datatalkclub.yellow_taxi_ride.json', value=value, key=key) 19 | print("producing") 20 | sleep(1) -------------------------------------------------------------------------------- /06-streaming/python/streams-example/faust/stream.py: -------------------------------------------------------------------------------- 1 | import faust 2 | from taxi_rides import TaxiRide 3 | 4 | 5 | app = faust.App('datatalksclub.stream.v2', broker='kafka://localhost:9092') 6 | topic = app.topic('datatalkclub.yellow_taxi_ride.json', value_type=TaxiRide) 7 | 8 | 9 | @app.agent(topic) 10 | async def start_reading(records): 11 | async for record in records: 12 | print(record) 13 | 14 | 15 | if __name__ == '__main__': 16 | app.main() 17 | -------------------------------------------------------------------------------- 
/06-streaming/python/streams-example/faust/stream_count_vendor_trips.py: -------------------------------------------------------------------------------- 1 | import faust 2 | from taxi_rides import TaxiRide 3 | 4 | 5 | app = faust.App('datatalksclub.stream.v2', broker='kafka://localhost:9092') 6 | topic = app.topic('datatalkclub.yellow_taxi_ride.json', value_type=TaxiRide) 7 | 8 | vendor_rides = app.Table('vendor_rides', default=int) 9 | 10 | 11 | @app.agent(topic) 12 | async def process(stream): 13 | async for event in stream.group_by(TaxiRide.vendorId): 14 | vendor_rides[event.vendorId] += 1 15 | 16 | if __name__ == '__main__': 17 | app.main() 18 | -------------------------------------------------------------------------------- /06-streaming/python/streams-example/faust/taxi_rides.py: -------------------------------------------------------------------------------- 1 | import faust 2 | 3 | 4 | class TaxiRide(faust.Record, validation=True): 5 | vendorId: str 6 | passenger_count: int 7 | trip_distance: float 8 | payment_type: int 9 | total_amount: float 10 | -------------------------------------------------------------------------------- /06-streaming/python/streams-example/faust/windowing.py: -------------------------------------------------------------------------------- 1 | from datetime import timedelta 2 | import faust 3 | from taxi_rides import TaxiRide 4 | 5 | 6 | app = faust.App('datatalksclub.stream.v2', broker='kafka://localhost:9092') 7 | topic = app.topic('datatalkclub.yellow_taxi_ride.json', value_type=TaxiRide) 8 | 9 | vendor_rides = app.Table('vendor_rides_windowed', default=int).tumbling( 10 | timedelta(minutes=1), 11 | expires=timedelta(hours=1), 12 | ) 13 | 14 | 15 | @app.agent(topic) 16 | async def process(stream): 17 | async for event in stream.group_by(TaxiRide.vendorId): 18 | vendor_rides[event.vendorId] += 1 19 | 20 | 21 | if __name__ == '__main__': 22 | app.main() 23 | -------------------------------------------------------------------------------- /06-streaming/python/streams-example/pyspark/README.md: -------------------------------------------------------------------------------- 1 | 2 | # Running PySpark Streaming 3 | 4 | #### Prerequisite 5 | 6 | Ensure your Kafka and Spark services are up and running by following the [docker setup readme](./../../docker/README.md). 7 | It is important to create the network and volume as described in that document.
Therefore, please ensure your volume and network are created correctly: 8 | 9 | ```bash 10 | docker volume ls # should list hadoop-distributed-file-system 11 | docker network ls # should list kafka-spark-network 12 | ``` 13 | 14 | 15 | ### Running Producer and Consumer 16 | ```bash 17 | # Run producer 18 | python3 producer.py 19 | 20 | # Run consumer with default settings 21 | python3 consumer.py 22 | # Run consumer for a specific topic 23 | python3 consumer.py --topic <topic_name> 24 | ``` 25 | 26 | ### Running Streaming Script 27 | 28 | The spark-submit.sh script ensures the necessary JARs are installed before running streaming.py 29 | 30 | ```bash 31 | ./spark-submit.sh streaming.py 32 | ``` 33 | 34 | ### Additional Resources 35 | - [Structured Streaming Programming Guide](https://spark.apache.org/docs/latest/structured-streaming-programming-guide.html#structured-streaming-programming-guide) 36 | - [Structured Streaming + Kafka Integration](https://spark.apache.org/docs/latest/structured-streaming-kafka-integration.html#structured-streaming-kafka-integration-guide-kafka-broker-versio) 37 | -------------------------------------------------------------------------------- /06-streaming/python/streams-example/pyspark/consumer.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | from typing import Dict, List 3 | from kafka import KafkaConsumer 4 | 5 | from settings import BOOTSTRAP_SERVERS, CONSUME_TOPIC_RIDES_CSV 6 | 7 | 8 | class RideCSVConsumer: 9 | def __init__(self, props: Dict): 10 | self.consumer = KafkaConsumer(**props) 11 | 12 | def consume_from_kafka(self, topics: List[str]): 13 | self.consumer.subscribe(topics=topics) 14 | print('Consuming from Kafka started') 15 | print('Available topics to consume: ', self.consumer.subscription()) 16 | while True: 17 | try: 18 | # SIGINT can't be handled when polling, limit timeout to 1 second.
19 | msg = self.consumer.poll(1.0) 20 | if msg is None or msg == {}: 21 | continue 22 | for msg_key, msg_values in msg.items(): 23 | for msg_val in msg_values: 24 | print(f'Key:{msg_val.key}-type({type(msg_val.key)}), ' 25 | f'Value:{msg_val.value}-type({type(msg_val.value)})') 26 | except KeyboardInterrupt: 27 | break 28 | 29 | self.consumer.close() 30 | 31 | 32 | if __name__ == '__main__': 33 | parser = argparse.ArgumentParser(description='Kafka Consumer') 34 | parser.add_argument('--topic', type=str, default=CONSUME_TOPIC_RIDES_CSV) 35 | args = parser.parse_args() 36 | 37 | topic = args.topic 38 | config = { 39 | 'bootstrap_servers': [BOOTSTRAP_SERVERS], 40 | 'auto_offset_reset': 'earliest', 41 | 'enable_auto_commit': True, 42 | 'key_deserializer': lambda key: int(key.decode('utf-8')), 43 | 'value_deserializer': lambda value: value.decode('utf-8'), 44 | 'group_id': 'consumer.group.id.csv-example.1', 45 | } 46 | csv_consumer = RideCSVConsumer(props=config) 47 | csv_consumer.consume_from_kafka(topics=[topic]) 48 | -------------------------------------------------------------------------------- /06-streaming/python/streams-example/pyspark/producer.py: -------------------------------------------------------------------------------- 1 | import csv 2 | from time import sleep 3 | from typing import Dict 4 | from kafka import KafkaProducer 5 | 6 | from settings import BOOTSTRAP_SERVERS, INPUT_DATA_PATH, PRODUCE_TOPIC_RIDES_CSV 7 | 8 | 9 | def delivery_report(err, msg): 10 | if err is not None: 11 | print("Delivery failed for record {}: {}".format(msg.key(), err)) 12 | return 13 | print('Record {} successfully produced to {} [{}] at offset {}'.format( 14 | msg.key(), msg.topic(), msg.partition(), msg.offset())) 15 | 16 | 17 | class RideCSVProducer: 18 | def __init__(self, props: Dict): 19 | self.producer = KafkaProducer(**props) 20 | # self.producer = Producer(producer_props) 21 | 22 | @staticmethod 23 | def read_records(resource_path: str): 24 | records, ride_keys = [], [] 25 | i = 0 26 | with open(resource_path, 'r') as f: 27 | reader = csv.reader(f) 28 | header = next(reader) # skip the header 29 | for row in reader: 30 | # vendor_id, passenger_count, trip_distance, payment_type, total_amount 31 | records.append(f'{row[0]}, {row[1]}, {row[2]}, {row[3]}, {row[4]}, {row[9]}, {row[16]}') 32 | ride_keys.append(str(row[0])) 33 | i += 1 34 | if i == 5: 35 | break 36 | return zip(ride_keys, records) 37 | 38 | def publish(self, topic: str, records: [str, str]): 39 | for key_value in records: 40 | key, value = key_value 41 | try: 42 | self.producer.send(topic=topic, key=key, value=value) 43 | print(f"Producing record for ") 44 | except KeyboardInterrupt: 45 | break 46 | except Exception as e: 47 | print(f"Exception while producing record - {value}: {e}") 48 | 49 | self.producer.flush() 50 | sleep(1) 51 | 52 | 53 | if __name__ == "__main__": 54 | config = { 55 | 'bootstrap_servers': [BOOTSTRAP_SERVERS], 56 | 'key_serializer': lambda x: x.encode('utf-8'), 57 | 'value_serializer': lambda x: x.encode('utf-8') 58 | } 59 | producer = RideCSVProducer(props=config) 60 | ride_records = producer.read_records(resource_path=INPUT_DATA_PATH) 61 | print(ride_records) 62 | producer.publish(topic=PRODUCE_TOPIC_RIDES_CSV, records=ride_records) 63 | -------------------------------------------------------------------------------- /06-streaming/python/streams-example/pyspark/settings.py: -------------------------------------------------------------------------------- 1 | import pyspark.sql.types as T 2 | 3 | 
INPUT_DATA_PATH = '../../resources/rides.csv' 4 | BOOTSTRAP_SERVERS = 'localhost:9092' 5 | 6 | TOPIC_WINDOWED_VENDOR_ID_COUNT = 'vendor_counts_windowed' 7 | 8 | PRODUCE_TOPIC_RIDES_CSV = CONSUME_TOPIC_RIDES_CSV = 'rides_csv' 9 | 10 | RIDE_SCHEMA = T.StructType( 11 | [T.StructField("vendor_id", T.IntegerType()), 12 | T.StructField('tpep_pickup_datetime', T.TimestampType()), 13 | T.StructField('tpep_dropoff_datetime', T.TimestampType()), 14 | T.StructField("passenger_count", T.IntegerType()), 15 | T.StructField("trip_distance", T.FloatType()), 16 | T.StructField("payment_type", T.IntegerType()), 17 | T.StructField("total_amount", T.FloatType()), 18 | ]) 19 | -------------------------------------------------------------------------------- /06-streaming/python/streams-example/pyspark/spark-submit.sh: -------------------------------------------------------------------------------- 1 | # Submit Python code to SparkMaster 2 | 3 | if [ $# -lt 1 ] 4 | then 5 | echo "Usage: $0 [ executor-memory ]" 6 | echo "(specify memory in string format such as \"512M\" or \"2G\")" 7 | exit 1 8 | fi 9 | PYTHON_JOB=$1 10 | 11 | if [ -z $2 ] 12 | then 13 | EXEC_MEM="1G" 14 | else 15 | EXEC_MEM=$2 16 | fi 17 | spark-submit --master spark://localhost:7077 --num-executors 2 \ 18 | --executor-memory $EXEC_MEM --executor-cores 1 \ 19 | --packages org.apache.spark:spark-sql-kafka-0-10_2.12:3.3.1,org.apache.spark:spark-avro_2.12:3.3.1,org.apache.spark:spark-streaming-kafka-0-10_2.12:3.3.1 \ 20 | $PYTHON_JOB -------------------------------------------------------------------------------- /06-streaming/python/streams-example/redpanda/README.md: -------------------------------------------------------------------------------- 1 | 2 | # Running PySpark Streaming with Redpanda 3 | 4 | ### 1. Prerequisite 5 | 6 | It is important to create the network and volume as described in the document. Therefore, please ensure your volume and network are created correctly. 7 | 8 | ```bash 9 | docker volume ls # should list hadoop-distributed-file-system 10 | docker network ls # should list kafka-spark-network 11 | ``` 12 | 13 | ### 2. Create Docker Network & Volume 14 | 15 | If you have not followed any other examples and the `ls` steps above show no output, create them now.
16 | 17 | ```bash 18 | # Create Network 19 | docker network create kafka-spark-network 20 | 21 | # Create Volume 22 | docker volume create --name=hadoop-distributed-file-system 23 | ``` 24 | 25 | ### Running Producer and Consumer 26 | ```bash 27 | # Run producer 28 | python producer.py 29 | 30 | # Run consumer with default settings 31 | python consumer.py 32 | # Run consumer for a specific topic 33 | python consumer.py --topic <topic_name> 34 | ``` 35 | 36 | ### Running Streaming Script 37 | 38 | The spark-submit.sh script ensures the necessary JARs are installed before running streaming.py 39 | 40 | ```bash 41 | ./spark-submit.sh streaming.py 42 | ``` 43 | 44 | ### Additional Resources 45 | - [Structured Streaming Programming Guide](https://spark.apache.org/docs/latest/structured-streaming-programming-guide.html#structured-streaming-programming-guide) 46 | - [Structured Streaming + Kafka Integration](https://spark.apache.org/docs/latest/structured-streaming-kafka-integration.html#structured-streaming-kafka-integration-guide-kafka-broker-versio) 47 | -------------------------------------------------------------------------------- /06-streaming/python/streams-example/redpanda/consumer.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | from typing import Dict, List 3 | from kafka import KafkaConsumer 4 | 5 | from settings import BOOTSTRAP_SERVERS, CONSUME_TOPIC_RIDES_CSV 6 | 7 | 8 | class RideCSVConsumer: 9 | def __init__(self, props: Dict): 10 | self.consumer = KafkaConsumer(**props) 11 | 12 | def consume_from_kafka(self, topics: List[str]): 13 | self.consumer.subscribe(topics=topics) 14 | print('Consuming from Kafka started') 15 | print('Available topics to consume: ', self.consumer.subscription()) 16 | while True: 17 | try: 18 | # SIGINT can't be handled when polling, limit timeout to 1 second.
19 | msg = self.consumer.poll(1.0) 20 | if msg is None or msg == {}: 21 | continue 22 | for msg_key, msg_values in msg.items(): 23 | for msg_val in msg_values: 24 | print(f'Key:{msg_val.key}-type({type(msg_val.key)}), ' 25 | f'Value:{msg_val.value}-type({type(msg_val.value)})') 26 | except KeyboardInterrupt: 27 | break 28 | 29 | self.consumer.close() 30 | 31 | 32 | if __name__ == '__main__': 33 | parser = argparse.ArgumentParser(description='Kafka Consumer') 34 | parser.add_argument('--topic', type=str, default=CONSUME_TOPIC_RIDES_CSV) 35 | args = parser.parse_args() 36 | 37 | topic = args.topic 38 | config = { 39 | 'bootstrap_servers': [BOOTSTRAP_SERVERS], 40 | 'auto_offset_reset': 'earliest', 41 | 'enable_auto_commit': True, 42 | 'key_deserializer': lambda key: int(key.decode('utf-8')), 43 | 'value_deserializer': lambda value: value.decode('utf-8'), 44 | 'group_id': 'consumer.group.id.csv-example.1', 45 | } 46 | csv_consumer = RideCSVConsumer(props=config) 47 | csv_consumer.consume_from_kafka(topics=[topic]) 48 | -------------------------------------------------------------------------------- /06-streaming/python/streams-example/redpanda/producer.py: -------------------------------------------------------------------------------- 1 | import csv 2 | from time import sleep 3 | from typing import Dict 4 | from kafka import KafkaProducer 5 | 6 | from settings import BOOTSTRAP_SERVERS, INPUT_DATA_PATH, PRODUCE_TOPIC_RIDES_CSV 7 | 8 | 9 | def delivery_report(err, msg): 10 | if err is not None: 11 | print("Delivery failed for record {}: {}".format(msg.key(), err)) 12 | return 13 | print('Record {} successfully produced to {} [{}] at offset {}'.format( 14 | msg.key(), msg.topic(), msg.partition(), msg.offset())) 15 | 16 | 17 | class RideCSVProducer: 18 | def __init__(self, props: Dict): 19 | self.producer = KafkaProducer(**props) 20 | # self.producer = Producer(producer_props) 21 | 22 | @staticmethod 23 | def read_records(resource_path: str): 24 | records, ride_keys = [], [] 25 | i = 0 26 | with open(resource_path, 'r') as f: 27 | reader = csv.reader(f) 28 | header = next(reader) # skip the header 29 | for row in reader: 30 | # vendor_id, passenger_count, trip_distance, payment_type, total_amount 31 | records.append(f'{row[0]}, {row[1]}, {row[2]}, {row[3]}, {row[4]}, {row[9]}, {row[16]}') 32 | ride_keys.append(str(row[0])) 33 | i += 1 34 | if i == 5: 35 | break 36 | return zip(ride_keys, records) 37 | 38 | def publish(self, topic: str, records: [str, str]): 39 | for key_value in records: 40 | key, value = key_value 41 | try: 42 | self.producer.send(topic=topic, key=key, value=value) 43 | print(f"Producing record for ") 44 | except KeyboardInterrupt: 45 | break 46 | except Exception as e: 47 | print(f"Exception while producing record - {value}: {e}") 48 | 49 | self.producer.flush() 50 | sleep(1) 51 | 52 | 53 | if __name__ == "__main__": 54 | config = { 55 | 'bootstrap_servers': [BOOTSTRAP_SERVERS], 56 | 'key_serializer': lambda x: x.encode('utf-8'), 57 | 'value_serializer': lambda x: x.encode('utf-8') 58 | } 59 | producer = RideCSVProducer(props=config) 60 | ride_records = producer.read_records(resource_path=INPUT_DATA_PATH) 61 | print(ride_records) 62 | producer.publish(topic=PRODUCE_TOPIC_RIDES_CSV, records=ride_records) 63 | -------------------------------------------------------------------------------- /06-streaming/python/streams-example/redpanda/settings.py: -------------------------------------------------------------------------------- 1 | import pyspark.sql.types as T 2 | 3 | 
INPUT_DATA_PATH = '../../resources/rides.csv' 4 | BOOTSTRAP_SERVERS = 'localhost:9092' 5 | 6 | TOPIC_WINDOWED_VENDOR_ID_COUNT = 'vendor_counts_windowed' 7 | 8 | PRODUCE_TOPIC_RIDES_CSV = CONSUME_TOPIC_RIDES_CSV = 'rides_csv' 9 | 10 | RIDE_SCHEMA = T.StructType( 11 | [T.StructField("vendor_id", T.IntegerType()), 12 | T.StructField('tpep_pickup_datetime', T.TimestampType()), 13 | T.StructField('tpep_dropoff_datetime', T.TimestampType()), 14 | T.StructField("passenger_count", T.IntegerType()), 15 | T.StructField("trip_distance", T.FloatType()), 16 | T.StructField("payment_type", T.IntegerType()), 17 | T.StructField("total_amount", T.FloatType()), 18 | ]) 19 | -------------------------------------------------------------------------------- /06-streaming/python/streams-example/redpanda/spark-submit.sh: -------------------------------------------------------------------------------- 1 | # Submit Python code to SparkMaster 2 | 3 | if [ $# -lt 1 ] 4 | then 5 | echo "Usage: $0 [ executor-memory ]" 6 | echo "(specify memory in string format such as \"512M\" or \"2G\")" 7 | exit 1 8 | fi 9 | PYTHON_JOB=$1 10 | 11 | if [ -z $2 ] 12 | then 13 | EXEC_MEM="1G" 14 | else 15 | EXEC_MEM=$2 16 | fi 17 | spark-submit --master spark://localhost:7077 --num-executors 2 \ 18 | --executor-memory $EXEC_MEM --executor-cores 1 \ 19 | --packages org.apache.spark:spark-sql-kafka-0-10_2.12:3.5.1,org.apache.spark:spark-avro_2.12:3.5.1,org.apache.spark:spark-streaming-kafka-0-10_2.12:3.5.1 \ 20 | $PYTHON_JOB 21 | -------------------------------------------------------------------------------- /after-sign-up.md: -------------------------------------------------------------------------------- 1 | ## Thank you! 2 | 3 | Thanks for signing up for the course. 4 | 5 | The process of adding you to the mailing list is not automated yet, 6 | but you will hear from us closer to the course start. 7 | 8 | To make sure you don't miss any announcements 9 | 10 | - Register in [DataTalks.Club's Slack](https://datatalks.club/slack.html) and 11 | join the [`#course-data-engineering`](https://app.slack.com/client/T01ATQK62F8/C01FABYF2RG) channel 12 | - Join the [course Telegram channel with announcements](https://t.me/dezoomcamp) 13 | - Subscribe to [DataTalks.Club's YouTube channel](https://www.youtube.com/c/DataTalksClub) and check 14 | [the course playlist](https://www.youtube.com/playlist?list=PL3MmuxUbc_hJed7dXYoJw8DoCuVHhGEQb) 15 | - Subscribe to our [public Google Calendar](https://calendar.google.com/calendar/?cid=ZXIxcjA1M3ZlYjJpcXU0dTFmaG02MzVxMG9AZ3JvdXAuY2FsZW5kYXIuZ29vZ2xlLmNvbQ) (it works from Desktop only) 16 | 17 | See you in January! 18 | -------------------------------------------------------------------------------- /certificates.md: -------------------------------------------------------------------------------- 1 | ## Getting your certificate 2 | 3 | Congratulations on finishing the course! 4 | 5 | You can find your certificate in your enrollment profile (you need to be logged in): 6 | 7 | * For the 2025 edition, it's https://courses.datatalks.club/de-zoomcamp-2025/enrollment 8 | 9 | If you can't find a certificate in your profile, it means you didn't pass the project. 10 | If you believe it's a mistake, write in the course channel in Slack. 11 | 12 | 13 | ## Adding to LinkedIn 14 | 15 | You can add your certificate to LinkedIn: 16 | 17 | * Log in to your LinkedIn account, then go to your profile. 
18 | * On the right, in the "Add profile" section dropdown, choose "Background" and then select the drop-down triangle next to "Licenses & Certifications". 19 | * In "Name", enter "Data Engineering Zoomcamp". 20 | * In "Issuing Organization", enter "DataTalksClub". 21 | * (Optional) In "Issue Date", enter the time when the certificate was created. 22 | * (Optional) Select the checkbox This certification does not expire. 23 | * Put your certificate ID. 24 | * In "Certification URL", enter the URL for your certificate. 25 | 26 | [Adapted from here](https://support.edx.org/hc/en-us/articles/206501938-How-can-I-add-my-certificate-to-my-LinkedIn-profile-) 27 | -------------------------------------------------------------------------------- /cohorts/2022/README.md: -------------------------------------------------------------------------------- 1 | 2 | ### 2022 Cohort 3 | 4 | * **Start**: 17 January 2022 5 | * **Registration link**: https://airtable.com/shr6oVXeQvSI5HuWD 6 | * [Leaderboard](https://docs.google.com/spreadsheets/d/e/2PACX-1vR9oQiYnAVvzL4dagnhvp0sngqagF0AceD0FGjhS-dnzMTBzNQIal3-hOgkTibVQvfuqbQ69b0fvRnf/pubhtml) 7 | * Subscribe to our [public Google Calendar](https://calendar.google.com/calendar/?cid=ZXIxcjA1M3ZlYjJpcXU0dTFmaG02MzVxMG9AZ3JvdXAuY2FsZW5kYXIuZ29vZ2xlLmNvbQ) (it works from Desktop only) 8 | -------------------------------------------------------------------------------- /cohorts/2022/project.md: -------------------------------------------------------------------------------- 1 | ## Course Project 2 | 3 | The goal of this project is to apply everything we learned 4 | in this course and build an end-to-end data pipeline. 5 | 6 | Remember that to pass the project, you must evaluate 3 peers. If you don't do that, your project can't be considered compelete. 7 | 8 | 9 | ### Submitting 10 | 11 | #### Project Cohort #2 12 | 13 | Project: 14 | 15 | * Form: https://forms.gle/JECXB9jYQ1vBXbsw6 16 | * Deadline: 2 May, 22:00 CET 17 | 18 | Peer reviewing: 19 | 20 | * Peer review assignments: [link](https://docs.google.com/spreadsheets/d/e/2PACX-1vShnv8T4iY_5NA8h0nySIS8Wzr-DZGGigEikIW4ZMSi9HlvhaEB4RhwmepVIuIUGaQHS90r5iHR2YXV/pubhtml?gid=964123374&single=true) 21 | * Form: https://forms.gle/Pb2fBwYLQ3GGFsaK6 22 | * Deadline: 9 May, 22:00 CET 23 | 24 | 25 | #### Project Cohort #1 26 | 27 | Project: 28 | 29 | * Form: https://forms.gle/6aeVcEVJipqR2BqC8 30 | * Deadline: 4 April, 22:00 CET 31 | 32 | Peer reviewing: 33 | 34 | * Peer review assignments: [link](https://docs.google.com/spreadsheets/d/e/2PACX-1vShnv8T4iY_5NA8h0nySIS8Wzr-DZGGigEikIW4ZMSi9HlvhaEB4RhwmepVIuIUGaQHS90r5iHR2YXV/pubhtml) 35 | * Form: https://forms.gle/AZ62bXMp4SGcVUmK7 36 | * Deadline: 11 April, 22:00 CET 37 | 38 | Project feedback: [link](https://docs.google.com/spreadsheets/d/e/2PACX-1vRcVCkO-jes5mbPAcikn9X_s2laJ1KhsO8aibHYQxxKqdCUYMVTEJLJQdM8C5aAUWKFl_0SJW4rme7H/pubhtml) 39 | -------------------------------------------------------------------------------- /cohorts/2022/week_1_basics_n_setup/homework.md: -------------------------------------------------------------------------------- 1 | ## Week 1 Homework 2 | 3 | In this homework we'll prepare the environment 4 | and practice with terraform and SQL 5 | 6 | 7 | ## Question 1. Google Cloud SDK 8 | 9 | Install Google Cloud SDK. What's the version you have? 10 | 11 | To get the version, run `gcloud --version` 12 | 13 | ## Google Cloud account 14 | 15 | Create an account in Google Cloud and create a project. 16 | 17 | 18 | ## Question 2. 
Terraform 19 | 20 | Now install terraform and go to the terraform directory (`week_1_basics_n_setup/1_terraform_gcp/terraform`) 21 | 22 | After that, run 23 | 24 | * `terraform init` 25 | * `terraform plan` 26 | * `terraform apply` 27 | 28 | Apply the plan and copy the output (after running `apply`) to the form. 29 | 30 | It should be the entire output - from the moment you typed `terraform init` to the very end. 31 | 32 | ## Prepare Postgres 33 | 34 | Run Postgres and load data as shown in the videos 35 | 36 | We'll use the yellow taxi trips from January 2021: 37 | 38 | ```bash 39 | wget https://s3.amazonaws.com/nyc-tlc/trip+data/yellow_tripdata_2021-01.csv 40 | ``` 41 | 42 | You will also need the dataset with zones: 43 | 44 | ```bash 45 | wget https://s3.amazonaws.com/nyc-tlc/misc/taxi+_zone_lookup.csv 46 | ``` 47 | 48 | Download this data and put it to Postgres 49 | 50 | ## Question 3. Count records 51 | 52 | How many taxi trips were there on January 15? 53 | 54 | Consider only trips that started on January 15. 55 | 56 | 57 | ## Question 4. Largest tip for each day 58 | 59 | Find the largest tip for each day. 60 | On which day it was the largest tip in January? 61 | 62 | Use the pick up time for your calculations. 63 | 64 | (note: it's not a typo, it's "tip", not "trip") 65 | 66 | 67 | ## Question 5. Most popular destination 68 | 69 | What was the most popular destination for passengers picked up 70 | in central park on January 14? 71 | 72 | Use the pick up time for your calculations. 73 | 74 | Enter the zone name (not id). If the zone name is unknown (missing), write "Unknown" 75 | 76 | 77 | ## Question 6. Most expensive locations 78 | 79 | What's the pickup-dropoff pair with the largest 80 | average price for a ride (calculated based on `total_amount`)? 81 | 82 | Enter two zone names separated by a slash 83 | 84 | For example: 85 | 86 | "Jamaica Bay / Clinton East" 87 | 88 | If any of the zone names are unknown (missing), write "Unknown". For example, "Unknown / Clinton East". 89 | 90 | 91 | ## Submitting the solutions 92 | 93 | * Form for submitting: https://forms.gle/yGQrkgRdVbiFs8Vd7 94 | * You can submit your homework multiple times. In this case, only the last submission will be used. 
95 | 96 | Deadline: 26 January (Wednesday), 22:00 CET 97 | 98 | 99 | ## Solution 100 | 101 | Here is the solution to questions 3-6: [video](https://www.youtube.com/watch?v=HxHqH2ARfxM&list=PL3MmuxUbc_hJed7dXYoJw8DoCuVHhGEQb) 102 | 103 | -------------------------------------------------------------------------------- /cohorts/2022/week_2_data_ingestion/airflow/.env_example: -------------------------------------------------------------------------------- 1 | # Custom 2 | COMPOSE_PROJECT_NAME=dtc-de 3 | GOOGLE_APPLICATION_CREDENTIALS=/.google/credentials/google_credentials.json 4 | AIRFLOW_CONN_GOOGLE_CLOUD_DEFAULT=google-cloud-platform://?extra__google_cloud_platform__key_path=/.google/credentials/google_credentials.json 5 | # AIRFLOW_UID= 6 | GCP_PROJECT_ID= 7 | GCP_GCS_BUCKET= 8 | 9 | # Postgres 10 | POSTGRES_USER=airflow 11 | POSTGRES_PASSWORD=airflow 12 | POSTGRES_DB=airflow 13 | 14 | # Airflow 15 | AIRFLOW__CORE__EXECUTOR=LocalExecutor 16 | AIRFLOW__SCHEDULER__SCHEDULER_HEARTBEAT_SEC=10 17 | 18 | AIRFLOW__CORE__SQL_ALCHEMY_CONN=postgresql+psycopg2://${POSTGRES_USER}:${POSTGRES_PASSWORD}@postgres:5432/${POSTGRES_DB} 19 | AIRFLOW_CONN_METADATA_DB=postgres+psycopg2://airflow:airflow@postgres:5432/airflow 20 | AIRFLOW_VAR__METADATA_DB_SCHEMA=airflow 21 | 22 | _AIRFLOW_WWW_USER_CREATE=True 23 | _AIRFLOW_WWW_USER_USERNAME=${_AIRFLOW_WWW_USER_USERNAME:airflow} 24 | _AIRFLOW_WWW_USER_PASSWORD=${_AIRFLOW_WWW_USER_PASSWORD:airflow} 25 | 26 | AIRFLOW__CORE__DAGS_ARE_PAUSED_AT_CREATION=True 27 | AIRFLOW__CORE__LOAD_EXAMPLES=False 28 | -------------------------------------------------------------------------------- /cohorts/2022/week_2_data_ingestion/airflow/Dockerfile: -------------------------------------------------------------------------------- 1 | # First-time build can take upto 10 mins. 2 | 3 | FROM apache/airflow:2.2.3 4 | 5 | ENV AIRFLOW_HOME=/opt/airflow 6 | 7 | USER root 8 | RUN apt-get update -qq && apt-get install vim -qqq 9 | # git gcc g++ -qqq 10 | 11 | COPY requirements.txt . 
12 | RUN pip install --no-cache-dir -r requirements.txt 13 | 14 | # Ref: https://airflow.apache.org/docs/docker-stack/recipes.html 15 | 16 | SHELL ["/bin/bash", "-o", "pipefail", "-e", "-u", "-x", "-c"] 17 | 18 | ARG CLOUD_SDK_VERSION=322.0.0 19 | ENV GCLOUD_HOME=/home/google-cloud-sdk 20 | 21 | ENV PATH="${GCLOUD_HOME}/bin/:${PATH}" 22 | 23 | RUN DOWNLOAD_URL="https://dl.google.com/dl/cloudsdk/channels/rapid/downloads/google-cloud-sdk-${CLOUD_SDK_VERSION}-linux-x86_64.tar.gz" \ 24 | && TMP_DIR="$(mktemp -d)" \ 25 | && curl -fL "${DOWNLOAD_URL}" --output "${TMP_DIR}/google-cloud-sdk.tar.gz" \ 26 | && mkdir -p "${GCLOUD_HOME}" \ 27 | && tar xzf "${TMP_DIR}/google-cloud-sdk.tar.gz" -C "${GCLOUD_HOME}" --strip-components=1 \ 28 | && "${GCLOUD_HOME}/install.sh" \ 29 | --bash-completion=false \ 30 | --path-update=false \ 31 | --usage-reporting=false \ 32 | --quiet \ 33 | && rm -rf "${TMP_DIR}" \ 34 | && gcloud --version 35 | 36 | WORKDIR $AIRFLOW_HOME 37 | 38 | COPY scripts scripts 39 | RUN chmod +x scripts 40 | 41 | USER $AIRFLOW_UID 42 | -------------------------------------------------------------------------------- /cohorts/2022/week_2_data_ingestion/airflow/dags_local/data_ingestion_local.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | from datetime import datetime 4 | 5 | from airflow import DAG 6 | 7 | from airflow.operators.bash import BashOperator 8 | from airflow.operators.python import PythonOperator 9 | 10 | from ingest_script import ingest_callable 11 | 12 | 13 | AIRFLOW_HOME = os.environ.get("AIRFLOW_HOME", "/opt/airflow/") 14 | 15 | 16 | PG_HOST = os.getenv('PG_HOST') 17 | PG_USER = os.getenv('PG_USER') 18 | PG_PASSWORD = os.getenv('PG_PASSWORD') 19 | PG_PORT = os.getenv('PG_PORT') 20 | PG_DATABASE = os.getenv('PG_DATABASE') 21 | 22 | 23 | local_workflow = DAG( 24 | "LocalIngestionDag", 25 | schedule_interval="0 6 2 * *", 26 | start_date=datetime(2021, 1, 1) 27 | ) 28 | 29 | 30 | URL_PREFIX = 'https://s3.amazonaws.com/nyc-tlc/trip+data' 31 | URL_TEMPLATE = URL_PREFIX + '/yellow_tripdata_{{ execution_date.strftime(\'%Y-%m\') }}.csv' 32 | OUTPUT_FILE_TEMPLATE = AIRFLOW_HOME + '/output_{{ execution_date.strftime(\'%Y-%m\') }}.csv' 33 | TABLE_NAME_TEMPLATE = 'yellow_taxi_{{ execution_date.strftime(\'%Y_%m\') }}' 34 | 35 | with local_workflow: 36 | wget_task = BashOperator( 37 | task_id='wget', 38 | bash_command=f'curl -sSL {URL_TEMPLATE} > {OUTPUT_FILE_TEMPLATE}' 39 | ) 40 | 41 | ingest_task = PythonOperator( 42 | task_id="ingest", 43 | python_callable=ingest_callable, 44 | op_kwargs=dict( 45 | user=PG_USER, 46 | password=PG_PASSWORD, 47 | host=PG_HOST, 48 | port=PG_PORT, 49 | db=PG_DATABASE, 50 | table_name=TABLE_NAME_TEMPLATE, 51 | csv_file=OUTPUT_FILE_TEMPLATE 52 | ), 53 | ) 54 | 55 | wget_task >> ingest_task -------------------------------------------------------------------------------- /cohorts/2022/week_2_data_ingestion/airflow/dags_local/ingest_script.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | from time import time 4 | 5 | import pandas as pd 6 | from sqlalchemy import create_engine 7 | 8 | 9 | def ingest_callable(user, password, host, port, db, table_name, csv_file, execution_date): 10 | print(table_name, csv_file, execution_date) 11 | 12 | engine = create_engine(f'postgresql://{user}:{password}@{host}:{port}/{db}') 13 | engine.connect() 14 | 15 | print('connection established successfully, inserting data...') 16 | 17 | t_start = time() 18 | df_iter = 
pd.read_csv(csv_file, iterator=True, chunksize=100000) 19 | 20 | df = next(df_iter) 21 | 22 | df.tpep_pickup_datetime = pd.to_datetime(df.tpep_pickup_datetime) 23 | df.tpep_dropoff_datetime = pd.to_datetime(df.tpep_dropoff_datetime) 24 | 25 | df.head(n=0).to_sql(name=table_name, con=engine, if_exists='replace') 26 | 27 | df.to_sql(name=table_name, con=engine, if_exists='append') 28 | 29 | t_end = time() 30 | print('inserted the first chunk, took %.3f second' % (t_end - t_start)) 31 | 32 | while True: 33 | t_start = time() 34 | 35 | try: 36 | df = next(df_iter) 37 | except StopIteration: 38 | print("completed") 39 | break 40 | 41 | df.tpep_pickup_datetime = pd.to_datetime(df.tpep_pickup_datetime) 42 | df.tpep_dropoff_datetime = pd.to_datetime(df.tpep_dropoff_datetime) 43 | 44 | df.to_sql(name=table_name, con=engine, if_exists='append') 45 | 46 | t_end = time() 47 | 48 | print('inserted another chunk, took %.3f second' % (t_end - t_start)) 49 | -------------------------------------------------------------------------------- /cohorts/2022/week_2_data_ingestion/airflow/docker-compose-nofrills.yml: -------------------------------------------------------------------------------- 1 | version: '3' 2 | services: 3 | postgres: 4 | image: postgres:13 5 | env_file: 6 | - .env 7 | volumes: 8 | - postgres-db-volume:/var/lib/postgresql/data 9 | healthcheck: 10 | test: ["CMD", "pg_isready", "-U", "airflow"] 11 | interval: 5s 12 | retries: 5 13 | restart: always 14 | 15 | scheduler: 16 | build: . 17 | command: scheduler 18 | restart: on-failure 19 | depends_on: 20 | - postgres 21 | env_file: 22 | - .env 23 | volumes: 24 | - ./dags:/opt/airflow/dags 25 | - ./logs:/opt/airflow/logs 26 | - ./plugins:/opt/airflow/plugins 27 | - ./scripts:/opt/airflow/scripts 28 | - ~/.google/credentials/:/.google/credentials 29 | 30 | 31 | webserver: 32 | build: . 33 | entrypoint: ./scripts/entrypoint.sh 34 | restart: on-failure 35 | depends_on: 36 | - postgres 37 | - scheduler 38 | env_file: 39 | - .env 40 | volumes: 41 | - ./dags:/opt/airflow/dags 42 | - ./logs:/opt/airflow/logs 43 | - ./plugins:/opt/airflow/plugins 44 | - ~/.google/credentials/:/.google/credentials:ro 45 | - ./scripts:/opt/airflow/scripts 46 | 47 | user: "${AIRFLOW_UID:-50000}:0" 48 | ports: 49 | - "8080:8080" 50 | healthcheck: 51 | test: [ "CMD-SHELL", "[ -f /home/airflow/airflow-webserver.pid ]" ] 52 | interval: 30s 53 | timeout: 30s 54 | retries: 3 55 | 56 | volumes: 57 | postgres-db-volume: -------------------------------------------------------------------------------- /cohorts/2022/week_2_data_ingestion/airflow/docs/1_concepts.md: -------------------------------------------------------------------------------- 1 | ## Airflow concepts 2 | 3 | 4 | ### Airflow architecture 5 | ![](arch-diag-airflow.png) 6 | 7 | Ref: https://airflow.apache.org/docs/apache-airflow/stable/concepts/overview.html 8 | 9 | * **Web server**: 10 | GUI to inspect, trigger and debug the behaviour of DAGs and tasks. 11 | Available at http://localhost:8080. 12 | 13 | * **Scheduler**: 14 | Responsible for scheduling jobs. Handles both triggering & scheduled workflows, submits Tasks to the executor to run, monitors all tasks and DAGs, and 15 | then triggers the task instances once their dependencies are complete. 16 | 17 | * **Worker**: 18 | This component executes the tasks given by the scheduler. 19 | 20 | * **Metadata database (postgres)**: 21 | Backend to the Airflow environment. Used by the scheduler, executor and webserver to store state. 
22 | 23 | * **Other components** (seen in docker-compose services): 24 | * `redis`: Message broker that forwards messages from scheduler to worker. 25 | * `flower`: The flower app for monitoring the environment. It is available at http://localhost:5555. 26 | * `airflow-init`: initialization service (customized as per this design) 27 | 28 | All these services allow you to run Airflow with CeleryExecutor. 29 | For more information, see [Architecture Overview](https://airflow.apache.org/docs/apache-airflow/stable/concepts/overview.html). 30 | 31 | 32 | ### Project Structure: 33 | 34 | * `./dags` - `DAG_FOLDER` for DAG files (use `./dags_local` for the local ingestion DAG) 35 | * `./logs` - contains logs from task execution and scheduler. 36 | * `./plugins` - for custom plugins 37 | 38 | 39 | ### Workflow components 40 | 41 | * `DAG`: Directed acyclic graph, specifies the dependencies between a set of tasks with explicit execution order, and has a beginning as well as an end. (Hence, “acyclic”) 42 | * `DAG Structure`: DAG Definition, Tasks (eg. Operators), Task Dependencies (control flow: `>>` or `<<` ) 43 | 44 | * `Task`: a defined unit of work (aka, operators in Airflow). The Tasks themselves describe what to do, be it fetching data, running analysis, triggering other systems, or more. 45 | * Common Types: Operators (used in this workshop), Sensors, TaskFlow decorators 46 | * Sub-classes of Airflow's BaseOperator 47 | 48 | * `DAG Run`: individual execution/run of a DAG 49 | * scheduled or triggered 50 | 51 | * `Task Instance`: an individual run of a single task. Task instances also have an indicative state, which could be “running”, “success”, “failed”, “skipped”, “up for retry”, etc. 52 | * Ideally, a task should flow from `none`, to `scheduled`, to `queued`, to `running`, and finally to `success`. 
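To tie these terms together, here is a minimal, illustrative DAG — a sketch only, not part of the workshop code (the real local-ingestion DAG in `dags_local/data_ingestion_local.py` follows the same pattern); the names `example_dag`, `download`, `process` and `say_hello` are made up for illustration:

```python
from datetime import datetime

from airflow import DAG
from airflow.operators.bash import BashOperator
from airflow.operators.python import PythonOperator


def say_hello():
    # Python callable executed by the second task
    print("hello from the second task")


# DAG definition: name, schedule and start date
with DAG("example_dag", schedule_interval="@daily", start_date=datetime(2021, 1, 1)) as dag:

    # Task 1: an operator that runs a shell command
    download = BashOperator(task_id="download", bash_command="echo downloading...")

    # Task 2: an operator that calls a Python function
    process = PythonOperator(task_id="process", python_callable=say_hello)

    # Task dependency (control flow): download runs before process
    download >> process
```

Each scheduled execution of this DAG is a DAG Run, and the `download` and `process` tasks inside that run are its Task Instances.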
53 | 54 | 55 | ### References 56 | 57 | https://airflow.apache.org/docs/apache-airflow/stable/concepts/dags.html 58 | 59 | https://airflow.apache.org/docs/apache-airflow/stable/concepts/tasks.html 60 | 61 | -------------------------------------------------------------------------------- /cohorts/2022/week_2_data_ingestion/airflow/docs/arch-diag-airflow.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/DataTalksClub/data-engineering-zoomcamp/76fed2419e917176369a0d3a2eeff6c936f87286/cohorts/2022/week_2_data_ingestion/airflow/docs/arch-diag-airflow.png -------------------------------------------------------------------------------- /cohorts/2022/week_2_data_ingestion/airflow/docs/gcs_ingestion_dag.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/DataTalksClub/data-engineering-zoomcamp/76fed2419e917176369a0d3a2eeff6c936f87286/cohorts/2022/week_2_data_ingestion/airflow/docs/gcs_ingestion_dag.png -------------------------------------------------------------------------------- /cohorts/2022/week_2_data_ingestion/airflow/extras/web_to_gcs.sh: -------------------------------------------------------------------------------- 1 | dataset_url=${dataset_url} 2 | dataset_file=${dataset_file} 3 | path_to_local_file=${path_to_local_file} 4 | path_to_creds=${path_to_creds} 5 | 6 | curl -sS "$dataset_url" > $path_to_local_file/$dataset_file 7 | gcloud auth activate-service-account --key-file=$path_to_creds 8 | gsutil -m cp $path_to_local_file/$dataset_file gs://$BUCKET 9 | -------------------------------------------------------------------------------- /cohorts/2022/week_2_data_ingestion/airflow/requirements.txt: -------------------------------------------------------------------------------- 1 | apache-airflow-providers-google 2 | pyarrow 3 | -------------------------------------------------------------------------------- /cohorts/2022/week_2_data_ingestion/airflow/scripts/entrypoint.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | export GOOGLE_APPLICATION_CREDENTIALS=${GOOGLE_APPLICATION_CREDENTIALS} 3 | export AIRFLOW_CONN_GOOGLE_CLOUD_DEFAULT=${AIRFLOW_CONN_GOOGLE_CLOUD_DEFAULT} 4 | 5 | airflow db upgrade 6 | 7 | airflow users create -r Admin -u admin -p admin -e admin@example.com -f admin -l airflow 8 | # "$_AIRFLOW_WWW_USER_USERNAME" -p "$_AIRFLOW_WWW_USER_PASSWORD" 9 | 10 | airflow webserver 11 | -------------------------------------------------------------------------------- /cohorts/2022/week_2_data_ingestion/transfer_service/README.md: -------------------------------------------------------------------------------- 1 | ## Generate AWS Access key 2 | - Login in to AWS account 3 | - Search for IAM 4 | ![aws iam](../../images/aws/iam.png) 5 | - Click on `Manage access key` 6 | - Click on `Create New Access Key` 7 | - Download the csv, your access key and secret would be in that csv (Please note that once lost secret cannot be recovered) 8 | 9 | ## Transfer service 10 | https://console.cloud.google.com/transfer/cloud/jobs 11 | 12 | 13 | -------------------------------------------------------------------------------- /cohorts/2022/week_3_data_warehouse/airflow/.env_example: -------------------------------------------------------------------------------- 1 | # Custom 2 | COMPOSE_PROJECT_NAME=dtc-de 3 | GOOGLE_APPLICATION_CREDENTIALS=/.google/credentials/google_credentials.json 4 | 
AIRFLOW_CONN_GOOGLE_CLOUD_DEFAULT=google-cloud-platform://?extra__google_cloud_platform__key_path=/.google/credentials/google_credentials.json 5 | # AIRFLOW_UID= 6 | GCP_PROJECT_ID= 7 | GCP_GCS_BUCKET= 8 | 9 | # Postgres 10 | POSTGRES_USER=airflow 11 | POSTGRES_PASSWORD=airflow 12 | POSTGRES_DB=airflow 13 | 14 | # Airflow 15 | AIRFLOW__CORE__EXECUTOR=LocalExecutor 16 | AIRFLOW__SCHEDULER__SCHEDULER_HEARTBEAT_SEC=10 17 | 18 | AIRFLOW__CORE__SQL_ALCHEMY_CONN=postgresql+psycopg2://${POSTGRES_USER}:${POSTGRES_PASSWORD}@postgres:5432/${POSTGRES_DB} 19 | AIRFLOW_CONN_METADATA_DB=postgres+psycopg2://airflow:airflow@postgres:5432/airflow 20 | AIRFLOW_VAR__METADATA_DB_SCHEMA=airflow 21 | 22 | _AIRFLOW_WWW_USER_CREATE=True 23 | _AIRFLOW_WWW_USER_USERNAME=${_AIRFLOW_WWW_USER_USERNAME:airflow} 24 | _AIRFLOW_WWW_USER_PASSWORD=${_AIRFLOW_WWW_USER_PASSWORD:airflow} 25 | 26 | AIRFLOW__CORE__DAGS_ARE_PAUSED_AT_CREATION=True 27 | AIRFLOW__CORE__LOAD_EXAMPLES=False 28 | -------------------------------------------------------------------------------- /cohorts/2022/week_3_data_warehouse/airflow/docker-compose-nofrills.yml: -------------------------------------------------------------------------------- 1 | version: '3' 2 | services: 3 | postgres: 4 | image: postgres:13 5 | env_file: 6 | - .env 7 | volumes: 8 | - postgres-db-volume:/var/lib/postgresql/data 9 | healthcheck: 10 | test: ["CMD", "pg_isready", "-U", "airflow"] 11 | interval: 5s 12 | retries: 5 13 | restart: always 14 | 15 | scheduler: 16 | build: . 17 | command: scheduler 18 | restart: on-failure 19 | depends_on: 20 | - postgres 21 | env_file: 22 | - .env 23 | volumes: 24 | - ./dags:/opt/airflow/dags 25 | - ./logs:/opt/airflow/logs 26 | - ./plugins:/opt/airflow/plugins 27 | - ./scripts:/opt/airflow/scripts 28 | - ~/.google/credentials/:/.google/credentials:ro 29 | 30 | 31 | webserver: 32 | build: . 
33 | entrypoint: ./scripts/entrypoint.sh 34 | restart: on-failure 35 | depends_on: 36 | - postgres 37 | - scheduler 38 | env_file: 39 | - .env 40 | volumes: 41 | - ./dags:/opt/airflow/dags 42 | - ./logs:/opt/airflow/logs 43 | - ./plugins:/opt/airflow/plugins 44 | - ~/.google/credentials/:/.google/credentials:ro 45 | - ./scripts:/opt/airflow/scripts 46 | 47 | user: "${AIRFLOW_UID:-50000}:0" 48 | ports: 49 | - "8080:8080" 50 | healthcheck: 51 | test: [ "CMD-SHELL", "[ -f /home/airflow/airflow-webserver.pid ]" ] 52 | interval: 30s 53 | timeout: 30s 54 | retries: 3 55 | 56 | volumes: 57 | postgres-db-volume: -------------------------------------------------------------------------------- /cohorts/2022/week_3_data_warehouse/airflow/docs/gcs_2_bq_dag_graph_view.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/DataTalksClub/data-engineering-zoomcamp/76fed2419e917176369a0d3a2eeff6c936f87286/cohorts/2022/week_3_data_warehouse/airflow/docs/gcs_2_bq_dag_graph_view.png -------------------------------------------------------------------------------- /cohorts/2022/week_3_data_warehouse/airflow/docs/gcs_2_bq_dag_tree_view.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/DataTalksClub/data-engineering-zoomcamp/76fed2419e917176369a0d3a2eeff6c936f87286/cohorts/2022/week_3_data_warehouse/airflow/docs/gcs_2_bq_dag_tree_view.png -------------------------------------------------------------------------------- /cohorts/2022/week_3_data_warehouse/airflow/scripts/entrypoint.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | export GOOGLE_APPLICATION_CREDENTIALS=${GOOGLE_APPLICATION_CREDENTIALS} 3 | export AIRFLOW_CONN_GOOGLE_CLOUD_DEFAULT=${AIRFLOW_CONN_GOOGLE_CLOUD_DEFAULT} 4 | 5 | airflow db upgrade 6 | 7 | airflow users create -r Admin -u admin -p admin -e admin@example.com -f admin -l airflow 8 | # "$_AIRFLOW_WWW_USER_USERNAME" -p "$_AIRFLOW_WWW_USER_PASSWORD" 9 | 10 | airflow webserver 11 | -------------------------------------------------------------------------------- /cohorts/2022/week_5_batch_processing/homework.md: -------------------------------------------------------------------------------- 1 | ## Week 5 Homework 2 | 3 | In this homework we'll put what we learned about Spark 4 | in practice. 5 | 6 | We'll use high volume for-hire vehicles (HVFHV) dataset for that. 7 | 8 | ## Question 1. Install Spark and PySpark 9 | 10 | * Install Spark 11 | * Run PySpark 12 | * Create a local spark session 13 | * Execute `spark.version` 14 | 15 | What's the output? 16 | 17 | 18 | ## Question 2. HVFHW February 2021 19 | 20 | Download the HVFHV data for february 2021: 21 | 22 | ```bash 23 | wget https://nyc-tlc.s3.amazonaws.com/trip+data/fhvhv_tripdata_2021-02.csv 24 | ``` 25 | 26 | Read it with Spark using the same schema as we did 27 | in the lessons. We will use this dataset for all 28 | the remaining questions. 29 | 30 | Repartition it to 24 partitions and save it to 31 | parquet. 32 | 33 | What's the size of the folder with results (in MB)? 34 | 35 | 36 | ## Question 3. Count records 37 | 38 | How many taxi trips were there on February 15? 39 | 40 | Consider only trips that started on February 15. 41 | 42 | 43 | ## Question 4. Longest trip for each day 44 | 45 | Now calculate the duration for each trip. 46 | 47 | Trip starting on which day was the longest? 48 | 49 | 50 | ## Question 5. 
Most frequent `dispatching_base_num` 51 | 52 | Now find the most frequently occurring `dispatching_base_num` 53 | in this dataset. 54 | 55 | How many stages this spark job has? 56 | 57 | > Note: the answer may depend on how you write the query, 58 | > so there are multiple correct answers. 59 | > Select the one you have. 60 | 61 | 62 | ## Question 6. Most common locations pair 63 | 64 | Find the most common pickup-dropoff pair. 65 | 66 | For example: 67 | 68 | "Jamaica Bay / Clinton East" 69 | 70 | Enter two zone names separated by a slash 71 | 72 | If any of the zone names are unknown (missing), use "Unknown". For example, "Unknown / Clinton East". 73 | 74 | 75 | ## Bonus question. Join type 76 | 77 | (not graded) 78 | 79 | For finding the answer to Q6, you'll need to perform a join. 80 | 81 | What type of join is it? 82 | 83 | And how many stages your spark job has? 84 | 85 | 86 | ## Submitting the solutions 87 | 88 | * Form for submitting: https://forms.gle/dBkVK9yT8cSMDwuw7 89 | * You can submit your homework multiple times. In this case, only the last submission will be used. 90 | 91 | Deadline: 07 March (Monday), 22:00 CET 92 | -------------------------------------------------------------------------------- /cohorts/2022/week_6_stream_processing/homework.md: -------------------------------------------------------------------------------- 1 | ## Week 6 Homework 2 | [Form](https://forms.gle/mSzfpPCXskWCabeu5) 3 | 4 | The homework is mostly theoretical. In the last question you have to provide working code link, please keep in mind that this 5 | question is not scored. 6 | 7 | Deadline: 14 March, 22:00 CET -------------------------------------------------------------------------------- /cohorts/2023/README.md: -------------------------------------------------------------------------------- 1 | ## Data Engineering Zoomcamp 2023 Cohort 2 | 3 | * [Launch stream with course overview](https://www.youtube.com/watch?v=-zpVha7bw5A) 4 | * [Course Google calendar](https://calendar.google.com/calendar/?cid=ZXIxcjA1M3ZlYjJpcXU0dTFmaG02MzVxMG9AZ3JvdXAuY2FsZW5kYXIuZ29vZ2xlLmNvbQ) 5 | * [FAQ](https://docs.google.com/document/d/19bnYs80DwuUimHM65UV3sylsCn2j1vziPOwzBwQrebw/edit?usp=sharing) 6 | * [Public Leaderboard](leaderboard.md) and [Private Leaderboard](https://docs.google.com/spreadsheets/d/e/2PACX-1vTbL00GcdQp0bJt9wf1ROltMq7s3qyxl-NYF7Pvk79Jfxgwfn9dNWmPD_yJHTDq_Wzvps8EIr6cOKWm/pubhtml) 7 | * [Course Playlist: Only 2023 Live videos & homeworks](https://www.youtube.com/playlist?list=PL3MmuxUbc_hJjEePXIdE-LVUx_1ZZjYGW) 8 | 9 | [**Week 1: Introduction & Prerequisites**](week_1_docker_sql/) 10 | 11 | * [Homework SQL](week_1_docker_sql/homework.md) and [solution](https://www.youtube.com/watch?v=KIh_9tZiroA) 12 | * [Homework Terraform](week_1_terraform/homework.md) 13 | * [Office hours](https://www.youtube.com/watch?v=RVTryVvSyw4&list=PL3MmuxUbc_hJjEePXIdE-LVUx_1ZZjYGW) 14 | 15 | [**Week 2: Workflow Orchestration**](week_2_workflow_orchestration) 16 | 17 | * [Homework](week_2_workflow_orchestration/homework.md) 18 | * [Office hours part 1](https://www.youtube.com/watch?v=a_nmLHb8hzw&list=PL3MmuxUbc_hJjEePXIdE-LVUx_1ZZjYGW) and [part 2](https://www.youtube.com/watch?v=PK8yyMY54Vk&list=PL3MmuxUbc_hJjEePXIdE-LVUx_1ZZjYGW&index=7) 19 | 20 | [**Week 3: Data Warehouse**](week_3_data_warehouse) 21 | 22 | * [Homework](week_3_data_warehouse/homework.md) 23 | * [Office hours](https://www.youtube.com/watch?v=QXfmtJp3bXE&list=PL3MmuxUbc_hJjEePXIdE-LVUx_1ZZjYGW) 24 | 25 | [**Week 4: Analytics 
Engineering**](week_4_analytics_engineering/) 26 | 27 | * [Homework](week_4_analytics_engineering/homework.md) 28 | * [PipeRider + dbt Workshop](workshops/piperider.md) 29 | * [Office hours](https://www.youtube.com/watch?v=ODYg_r72qaE&list=PL3MmuxUbc_hJjEePXIdE-LVUx_1ZZjYGW) 30 | 31 | [**Week 5: Batch processing**](week_5_batch_processing/) 32 | 33 | * [Homework](week_5_batch_processing/homework.md) 34 | * [Office hours](https://www.youtube.com/watch?v=5_69yL2PPYI&list=PL3MmuxUbc_hJjEePXIdE-LVUx_1ZZjYGW) 35 | 36 | [**Week 6: Stream Processing**](week_6_stream_processing) 37 | 38 | * [Homework](week_6_stream_processing/homework.md) 39 | 40 | 41 | [**Week 7, 8 & 9: Project**](project.md) 42 | 43 | More information [here](project.md) 44 | -------------------------------------------------------------------------------- /cohorts/2023/project.md: -------------------------------------------------------------------------------- 1 | ## Course Project 2 | 3 | The goal of this project is to apply everything we learned 4 | in this course and build an end-to-end data pipeline. 5 | 6 | You will have two attempts to submit your project. If you don't have 7 | time to submit your project by the end of attempt #1 (you started the 8 | course late, you have vacation plans, life/work got in the way, etc.) 9 | or you fail your first attempt, 10 | then you will have a second chance to submit your project as attempt 11 | #2. 12 | 13 | There are only two attempts. 14 | 15 | Remember that to pass the project, you must evaluate 3 peers. If you don't do that, 16 | your project can't be considered complete. 17 | 18 | To find the projects assigned to you, use the peer review assignments link 19 | and find your hash in the first column. You will see three rows: you need to evaluate 20 | each of these projects. For each project, you need to submit the form once, 21 | so in total, you will make three submissions. 
22 | 23 | 24 | ### Submitting 25 | 26 | #### Project Attempt #1 27 | 28 | Project: 29 | 30 | * Form: https://forms.gle/zTJiVYSmCgsENj6y8 31 | * Deadline: 10 April, 22:00 CET 32 | 33 | Peer reviewing: 34 | 35 | * Peer review assignments: [link](https://docs.google.com/spreadsheets/d/e/2PACX-1vRYQ0A9C7AkRK-YPSFhqaRMmuPR97QPfl2PjI8n11l5jntc6YMHIJXVVS0GQNqAYIGwzyevyManDB08/pubhtml?gid=0&single=true) ("project-01" sheet) 36 | * Form: https://forms.gle/1bxmgR8yPwV359zb7 37 | * Deadline: 17 April, 22:00 CET 38 | 39 | Project feedback: [link](https://docs.google.com/spreadsheets/d/e/2PACX-1vQuMt9m1XlPrCACqnsFTXTV_KGiSnsl9UjL7kdTMsLJ8DLu3jNJlPzoUKG6baxc8APeEQ8RaSP1U2VX/pubhtml?gid=27207346&single=true) ("project-01" sheet) 40 | 41 | #### Project Attempt #2 42 | 43 | Project: 44 | 45 | * Form: https://forms.gle/gCXUSYBm1KgMKXVm8 46 | * Deadline: 4 May, 22:00 CET 47 | 48 | Peer reviewing: 49 | 50 | * Peer review assignments: [link](https://docs.google.com/spreadsheets/d/e/2PACX-1vRYQ0A9C7AkRK-YPSFhqaRMmuPR97QPfl2PjI8n11l5jntc6YMHIJXVVS0GQNqAYIGwzyevyManDB08/pubhtml?gid=303437788&single=true) ("project-02" sheet) 51 | * Form: https://forms.gle/2x5MT4xxczR8isy37 52 | * Deadline: 11 May, 22:00 CET 53 | 54 | Project feedback: [link](https://docs.google.com/spreadsheets/d/e/2PACX-1vQuMt9m1XlPrCACqnsFTXTV_KGiSnsl9UjL7kdTMsLJ8DLu3jNJlPzoUKG6baxc8APeEQ8RaSP1U2VX/pubhtml?gid=246029638&single=true) 55 | 56 | ### Evaluation criteria 57 | 58 | See [here](../../week_7_project/README.md) 59 | 60 | 61 | ### Misc 62 | 63 | To get the hash for your project, use this function to hash your email: 64 | 65 | ```python 66 | from hashlib import sha1 67 | 68 | def compute_hash(email): 69 | return sha1(email.lower().encode('utf-8')).hexdigest() 70 | ``` 71 | 72 | Or use [this website](http://www.sha1-online.com/). 73 | -------------------------------------------------------------------------------- /cohorts/2023/week_1_docker_sql/homework.md: -------------------------------------------------------------------------------- 1 | ## Week 1 Homework 2 | 3 | In this homework we'll prepare the environment 4 | and practice with Docker and SQL 5 | 6 | 7 | ## Question 1. Knowing docker tags 8 | 9 | Run the command to get information on Docker 10 | 11 | ```docker --help``` 12 | 13 | Now run the command to get help on the "docker build" command 14 | 15 | Which tag has the following text? - *Write the image ID to the file* 16 | 17 | - `--imageid string` 18 | - `--iidfile string` 19 | - `--idimage string` 20 | - `--idfile string` 21 | 22 | 23 | ## Question 2. Understanding docker first run 24 | 25 | Run docker with the python:3.9 image in an interactive mode and the entrypoint of bash. 26 | Now check the python modules that are installed ( use pip list). 27 | How many python packages/modules are installed? 28 | 29 | - 1 30 | - 6 31 | - 3 32 | - 7 33 | 34 | # Prepare Postgres 35 | 36 | Run Postgres and load data as shown in the videos 37 | We'll use the green taxi trips from January 2019: 38 | 39 | ```wget https://github.com/DataTalksClub/nyc-tlc-data/releases/download/green/green_tripdata_2019-01.csv.gz``` 40 | 41 | You will also need the dataset with zones: 42 | 43 | ```wget https://s3.amazonaws.com/nyc-tlc/misc/taxi+_zone_lookup.csv``` 44 | 45 | Download this data and put it into Postgres (with jupyter notebooks or with a pipeline) 46 | 47 | 48 | ## Question 3. Count records 49 | 50 | How many taxi trips were totally made on January 15? 51 | 52 | Tip: started and finished on 2019-01-15. 
53 | 54 | Remember that the `lpep_pickup_datetime` and `lpep_dropoff_datetime` columns are timestamps (date plus hour, minute and second), not plain dates. 55 | 56 | - 20689 57 | - 20530 58 | - 17630 59 | - 21090 60 | 61 | ## Question 4. Largest trip for each day 62 | 63 | Which day had the largest trip distance? 64 | Use the pickup time for your calculations. 65 | 66 | - 2019-01-18 67 | - 2019-01-28 68 | - 2019-01-15 69 | - 2019-01-10 70 | 71 | ## Question 5. The number of passengers 72 | 73 | On 2019-01-01, how many trips had 2 passengers, and how many had 3? 74 | 75 | - 2: 1282 ; 3: 266 76 | - 2: 1532 ; 3: 126 77 | - 2: 1282 ; 3: 254 78 | - 2: 1282 ; 3: 274 79 | 80 | 81 | ## Question 6. Largest tip 82 | 83 | For passengers picked up in the Astoria Zone, which drop-off zone had the largest tip? 84 | We want the name of the zone, not the ID. 85 | 86 | Note: it's not a typo, it's `tip`, not `trip`. 87 | 88 | - Central Park 89 | - Jamaica 90 | - South Ozone Park 91 | - Long Island City/Queens Plaza 92 | 93 | 94 | ## Submitting the solutions 95 | 96 | * Form for submitting: [form](https://forms.gle/EjphSkR1b3nsdojv7) 97 | * You can submit your homework multiple times. In this case, only the last submission will be used. 98 | 99 | Deadline: 30 January (Monday), 22:00 CET 100 | 101 | 102 | ## Solution 103 | 104 | See here: https://www.youtube.com/watch?v=KIh_9tZiroA 105 | -------------------------------------------------------------------------------- /cohorts/2023/week_1_terraform/homework.md: -------------------------------------------------------------------------------- 1 | ## Week 1 Homework 2 | 3 | In this homework we'll prepare the environment by creating resources in GCP with Terraform. 4 | 5 | In your VM on GCP, install Terraform. Copy the files from the course repo 6 | [here](https://github.com/DataTalksClub/data-engineering-zoomcamp/tree/main/week_1_basics_n_setup/1_terraform_gcp/terraform) to your VM. 7 | 8 | Modify the files as necessary to create a GCP Bucket and a BigQuery Dataset. 9 | 10 | 11 | ## Question 1. Creating Resources 12 | 13 | After updating the main.tf and variable.tf files, run: 14 | 15 | ``` 16 | terraform apply 17 | ``` 18 | 19 | Paste the output of this command into the homework submission form. 20 | 21 | 22 | ## Submitting the solutions 23 | 24 | * Form for submitting: [form](https://forms.gle/S57Xs3HL9nB3YTzj9) 25 | * You can submit your homework multiple times. In this case, only the last submission will be used.
26 | 27 | Deadline: 30 January (Monday), 22:00 CET 28 | 29 | -------------------------------------------------------------------------------- /cohorts/2023/week_6_stream_processing/client.properties: -------------------------------------------------------------------------------- 1 | # Required connection configs for Kafka producer, consumer, and admin 2 | bootstrap.servers=:9092 3 | security.protocol=SASL_SSL 4 | sasl.mechanisms=PLAIN 5 | sasl.username= 6 | sasl.password= 7 | 8 | # Best practice for higher availability in librdkafka clients prior to 1.7 9 | session.timeout.ms=45000 -------------------------------------------------------------------------------- /cohorts/2023/week_6_stream_processing/producer_confluent.py: -------------------------------------------------------------------------------- 1 | from confluent_kafka import Producer 2 | 3 | import argparse 4 | import csv 5 | from typing import Dict 6 | from time import sleep 7 | 8 | from settings import CONFLUENT_CLOUD_CONFIG, \ 9 | GREEN_TAXI_TOPIC, FHV_TAXI_TOPIC, \ 10 | GREEN_TRIP_DATA_PATH, FHV_TRIP_DATA_PATH 11 | 12 | 13 | class RideCSVProducer: 14 | def __init__(self, probs: Dict, ride_type: str): 15 | 16 | self.producer = Producer(**probs) 17 | self.ride_type = ride_type 18 | 19 | def parse_row(self, row): 20 | if self.ride_type == 'green': 21 | record = f'{row[5]}, {row[6]}' # PULocationID, DOLocationID 22 | key = str(row[0]) # vendor_id 23 | elif self.ride_type == 'fhv': 24 | record = f'{row[3]}, {row[4]}' # PULocationID, DOLocationID, 25 | key = str(row[0]) # dispatching_base_num 26 | return key, record 27 | 28 | def read_records(self, resource_path: str): 29 | records, ride_keys = [], [] 30 | with open(resource_path, 'r') as f: 31 | reader = csv.reader(f) 32 | header = next(reader) # skip the header 33 | for row in reader: 34 | key, record = self.parse_row(row) 35 | ride_keys.append(key) 36 | records.append(record) 37 | return zip(ride_keys, records) 38 | 39 | def publish(self, records: [str, str], topic: str): 40 | for key_value in records: 41 | key, value = key_value 42 | try: 43 | self.producer.poll(0) 44 | self.producer.produce(topic=topic, key=key, value=value) 45 | print(f"Producing record for ") 46 | except KeyboardInterrupt: 47 | break 48 | except BufferError as bfer: 49 | self.producer.poll(0.1) 50 | except Exception as e: 51 | print(f"Exception while producing record - {value}: {e}") 52 | 53 | self.producer.flush() 54 | sleep(10) 55 | 56 | 57 | if __name__ == "__main__": 58 | parser = argparse.ArgumentParser(description='Kafka Consumer') 59 | parser.add_argument('--type', type=str, default='green') 60 | args = parser.parse_args() 61 | 62 | if args.type == 'green': 63 | kafka_topic = GREEN_TAXI_TOPIC 64 | data_path = GREEN_TRIP_DATA_PATH 65 | elif args.type == 'fhv': 66 | kafka_topic = FHV_TAXI_TOPIC 67 | data_path = FHV_TRIP_DATA_PATH 68 | 69 | producer = RideCSVProducer(ride_type=args.type, probs=CONFLUENT_CLOUD_CONFIG) 70 | ride_records = producer.read_records(resource_path=data_path) 71 | producer.publish(records=ride_records, topic=kafka_topic) 72 | -------------------------------------------------------------------------------- /cohorts/2023/week_6_stream_processing/settings.py: -------------------------------------------------------------------------------- 1 | import pyspark.sql.types as T 2 | 3 | GREEN_TRIP_DATA_PATH = './resources/green_tripdata/green_tripdata_2019-01.csv' 4 | FHV_TRIP_DATA_PATH = './resources/fhv_tripdata/fhv_tripdata_2019-01.csv' 5 | BOOTSTRAP_SERVERS = 'localhost:9092' 6 | 7 | 
RIDES_TOPIC = 'all_rides' 8 | FHV_TAXI_TOPIC = 'fhv_taxi_rides' 9 | GREEN_TAXI_TOPIC = 'green_taxi_rides' 10 | 11 | ALL_RIDE_SCHEMA = T.StructType( 12 | [T.StructField("PUlocationID", T.StringType()), 13 | T.StructField("DOlocationID", T.StringType()), 14 | ]) 15 | 16 | 17 | def read_ccloud_config(config_file): 18 | conf = {} 19 | with open(config_file) as fh: 20 | for line in fh: 21 | line = line.strip() 22 | if len(line) != 0 and line[0] != "#": 23 | parameter, value = line.strip().split('=', 1) 24 | conf[parameter] = value.strip() 25 | return conf 26 | 27 | 28 | CONFLUENT_CLOUD_CONFIG = read_ccloud_config('client_original.properties') 29 | -------------------------------------------------------------------------------- /cohorts/2023/week_6_stream_processing/spark-submit.sh: -------------------------------------------------------------------------------- 1 | # Submit Python code to SparkMaster 2 | 3 | if [ $# -lt 1 ] 4 | then 5 | echo "Usage: $0 [ executor-memory ]" 6 | echo "(specify memory in string format such as \"512M\" or \"2G\")" 7 | exit 1 8 | fi 9 | PYTHON_JOB=$1 10 | 11 | if [ -z $2 ] 12 | then 13 | EXEC_MEM="1G" 14 | else 15 | EXEC_MEM=$2 16 | fi 17 | spark-submit --master spark://localhost:7077 --num-executors 2 \ 18 | --executor-memory $EXEC_MEM --executor-cores 1 \ 19 | --packages org.apache.spark:spark-sql-kafka-0-10_2.12:3.3.1,org.apache.spark:spark-avro_2.12:3.3.1,org.apache.spark:spark-streaming-kafka-0-10_2.12:3.3.1 \ 20 | $PYTHON_JOB -------------------------------------------------------------------------------- /cohorts/2023/workshops/piperider.md: -------------------------------------------------------------------------------- 1 | 2 | ## Workshop: Maximizing Confidence in Your Data Model Changes with dbt and PipeRider 3 | 4 | To learn how to use PipeRider together with dbt for detecting changes in model and data, sign up for a workshop 5 | 6 | - Video: https://www.youtube.com/watch?v=O-tyUOQccSs 7 | - Repository: https://github.com/InfuseAI/taxi_rides_ny_duckdb 8 | 9 | 10 | ## Homework 11 | 12 | The following questions follow on from the original Week 4 homework, and so use the same data as required by those questions: 13 | 14 | https://github.com/DataTalksClub/data-engineering-zoomcamp/blob/main/cohorts/2023/week_4_analytics_engineering/homework.md 15 | 16 | Yellow taxi data - Years 2019 and 2020 17 | Green taxi data - Years 2019 and 2020 18 | fhv data - Year 2019. 19 | 20 | ### Question 1: 21 | 22 | What is the distribution between vendor id filtering by years 2019 and 2020 data? 23 | 24 | You will need to run PipeRider and check the report 25 | 26 | * 70.1/29.6/0.5 27 | * 60.1/39.5/0.4 28 | * 90.2/9.5/0.3 29 | * 80.1/19.7/0.2 30 | 31 | ### Question 2: 32 | 33 | What is the composition of total amount (positive/zero/negative) filtering by years 2019 and 2020 data? 34 | 35 | You will need to run PipeRider and check the report 36 | 37 | 38 | * 51.4M/15K/48.6K 39 | * 21.4M/5K/248.6K 40 | * 61.4M/25K/148.6K 41 | * 81.4M/35K/14.6K 42 | 43 | ### Question 3: 44 | 45 | What is the numeric statistics (average/standard deviation/min/max/sum) of trip distances filtering by years 2019 and 2020 data? 
46 | 47 | You will need to run PipeRider and check the report 48 | 49 | 50 | * 1.95/35.43/0/16.3K/151.5M 51 | * 3.95/25.43/23.88/267.3K/281.5M 52 | * 5.95/75.43/-63.88/67.3K/81.5M 53 | * 2.95/35.43/-23.88/167.3K/181.5M 54 | 55 | 56 | 57 | ## Submitting the solutions 58 | 59 | * Form for submitting: https://forms.gle/WyLQHBu1DNwNTfqe8 60 | * You can submit your homework multiple times. In this case, only the last submission will be used. 61 | 62 | Deadline: 20 March, 22:00 CET 63 | 64 | 65 | ## Solution 66 | 67 | Video: https://www.youtube.com/watch?v=inNrUys7W8U&list=PL3MmuxUbc_hJjEePXIdE-LVUx_1ZZjYGW 68 | -------------------------------------------------------------------------------- /cohorts/2024/05-batch/homework.md: -------------------------------------------------------------------------------- 1 | ## Module 5 Homework 2 | 3 | Solution: https://www.youtube.com/watch?v=YtddC7vJOgQ 4 | 5 | In this homework we'll put what we learned about Spark in practice. 6 | 7 | For this homework we will be using the FHV 2019-10 data found here. [FHV Data](https://github.com/DataTalksClub/nyc-tlc-data/releases/download/fhv/fhv_tripdata_2019-10.csv.gz) 8 | 9 | ### Question 1: 10 | 11 | **Install Spark and PySpark** 12 | 13 | - Install Spark 14 | - Run PySpark 15 | - Create a local spark session 16 | - Execute spark.version. 17 | 18 | What's the output? 19 | 20 | > [!NOTE] 21 | > To install PySpark follow this [guide](https://github.com/DataTalksClub/data-engineering-zoomcamp/blob/main/05-batch/setup/pyspark.md) 22 | 23 | ### Question 2: 24 | 25 | **FHV October 2019** 26 | 27 | Read the October 2019 FHV into a Spark Dataframe with a schema as we did in the lessons. 28 | 29 | Repartition the Dataframe to 6 partitions and save it to parquet. 30 | 31 | What is the average size of the Parquet (ending with .parquet extension) Files that were created (in MB)? Select the answer which most closely matches. 32 | 33 | - 1MB 34 | - 6MB 35 | - 25MB 36 | - 87MB 37 | 38 | 39 | 40 | ### Question 3: 41 | 42 | **Count records** 43 | 44 | How many taxi trips were there on the 15th of October? 45 | 46 | Consider only trips that started on the 15th of October. 47 | 48 | - 108,164 49 | - 12,856 50 | - 452,470 51 | - 62,610 52 | 53 | > [!IMPORTANT] 54 | > Be aware of columns order when defining schema 55 | 56 | ### Question 4: 57 | 58 | **Longest trip for each day** 59 | 60 | What is the length of the longest trip in the dataset in hours? 61 | 62 | - 631,152.50 Hours 63 | - 243.44 Hours 64 | - 7.68 Hours 65 | - 3.32 Hours 66 | 67 | 68 | 69 | ### Question 5: 70 | 71 | **User Interface** 72 | 73 | Spark’s User Interface which shows the application's dashboard runs on which local port? 74 | 75 | - 80 76 | - 443 77 | - 4040 78 | - 8080 79 | 80 | 81 | 82 | ### Question 6: 83 | 84 | **Least frequent pickup location zone** 85 | 86 | Load the zone lookup data into a temp view in Spark
87 | [Zone Data](https://github.com/DataTalksClub/nyc-tlc-data/releases/download/misc/taxi_zone_lookup.csv) 88 | 89 | Using the zone lookup data and the FHV October 2019 data, what is the name of the LEAST frequent pickup location Zone?
90 | 91 | - East Chelsea 92 | - Jamaica Bay 93 | - Union Sq 94 | - Crown Heights North 95 | 96 | 97 | ## Submitting the solutions 98 | 99 | - Form for submitting: https://courses.datatalks.club/de-zoomcamp-2024/homework/hw5 100 | - Deadline: See the website 101 | -------------------------------------------------------------------------------- /cohorts/2024/06-streaming/docker-compose.yml: -------------------------------------------------------------------------------- 1 | version: '3.7' 2 | services: 3 | # Redpanda cluster 4 | redpanda-1: 5 | image: docker.redpanda.com/vectorized/redpanda:v22.3.5 6 | container_name: redpanda-1 7 | command: 8 | - redpanda 9 | - start 10 | - --smp 11 | - '1' 12 | - --reserve-memory 13 | - 0M 14 | - --overprovisioned 15 | - --node-id 16 | - '1' 17 | - --kafka-addr 18 | - PLAINTEXT://0.0.0.0:29092,OUTSIDE://0.0.0.0:9092 19 | - --advertise-kafka-addr 20 | - PLAINTEXT://redpanda-1:29092,OUTSIDE://localhost:9092 21 | - --pandaproxy-addr 22 | - PLAINTEXT://0.0.0.0:28082,OUTSIDE://0.0.0.0:8082 23 | - --advertise-pandaproxy-addr 24 | - PLAINTEXT://redpanda-1:28082,OUTSIDE://localhost:8082 25 | - --rpc-addr 26 | - 0.0.0.0:33145 27 | - --advertise-rpc-addr 28 | - redpanda-1:33145 29 | ports: 30 | # - 8081:8081 31 | - 8082:8082 32 | - 9092:9092 33 | - 28082:28082 34 | - 29092:29092 -------------------------------------------------------------------------------- /cohorts/2024/README.md: -------------------------------------------------------------------------------- 1 | ## Data Engineering Zoomcamp 2024 Cohort 2 | 3 | * [Pre-launch Q&A stream](https://www.youtube.com/watch?v=91b8u9GmqB4) 4 | * [Launch stream with course overview](https://www.youtube.com/live/AtRhA-NfS24?si=5JzA_E8BmJjiLi8l) 5 | * [Deadline calendar](https://docs.google.com/spreadsheets/d/e/2PACX-1vQACMLuutV5rvXg5qICuJGL-yZqIV0FBD84CxPdC5eZHf8TfzB-CJT_3Mo7U7oGVTXmSihPgQxuuoku/pubhtml) 6 | * [Course Google calendar](https://calendar.google.com/calendar/?cid=ZXIxcjA1M3ZlYjJpcXU0dTFmaG02MzVxMG9AZ3JvdXAuY2FsZW5kYXIuZ29vZ2xlLmNvbQ) 7 | * [FAQ](https://docs.google.com/document/d/19bnYs80DwuUimHM65UV3sylsCn2j1vziPOwzBwQrebw/edit?usp=sharing) 8 | * Course Playlist: Only 2024 Live videos & homeworks (TODO) 9 | * [Public Leaderboard of Top-100 Participants](leaderboard.md) 10 | 11 | 12 | [**Module 1: Introduction & Prerequisites**](01-docker-terraform/) 13 | 14 | * [Homework](01-docker-terraform/homework.md) 15 | 16 | 17 | [**Module 2: Workflow Orchestration**](02-workflow-orchestration) 18 | 19 | * [Homework](02-workflow-orchestration/homework.md) 20 | * Office hours 21 | 22 | [**Workshop 1: Data Ingestion**](workshops/dlt.md) 23 | 24 | * Workshop with dlt 25 | * [Homework](workshops/dlt.md) 26 | 27 | 28 | [**Module 3: Data Warehouse**](03-data-warehouse) 29 | 30 | * [Homework](03-data-warehouse/homework.md) 31 | 32 | 33 | [**Module 4: Analytics Engineering**](04-analytics-engineering/) 34 | 35 | * [Homework](04-analytics-engineering/homework.md) 36 | 37 | 38 | [**Module 5: Batch processing**](05-batch/) 39 | 40 | * [Homework](05-batch/homework.md) 41 | 42 | 43 | [**Module 6: Stream Processing**](06-streaming) 44 | 45 | * [Homework](06-streaming/homework.md) 46 | 47 | 48 | [**Project**](project.md) 49 | 50 | More information [here](project.md) 51 | -------------------------------------------------------------------------------- /cohorts/2024/project.md: -------------------------------------------------------------------------------- 1 | ## Course Project 2 | 3 | The goal of this project is to apply everything we 
learned 4 | in this course and build an end-to-end data pipeline. 5 | 6 | You will have two attempts to submit your project. If you don't have 7 | time to submit your project by the end of attempt #1 (you started the 8 | course late, you have vacation plans, life/work got in the way, etc.) 9 | or you fail your first attempt, 10 | then you will have a second chance to submit your project as attempt 11 | #2. 12 | 13 | There are only two attempts. 14 | 15 | Remember that to pass the project, you must evaluate 3 peers. If you don't do that, 16 | your project can't be considered complete. 17 | 18 | To find the projects assigned to you, use the peer review assignments link 19 | and find your hash in the first column. You will see three rows: you need to evaluate 20 | each of these projects. For each project, you need to submit the form once, 21 | so in total, you will make three submissions. 22 | 23 | 24 | ### Submitting 25 | 26 | #### Project Attempt #1 27 | 28 | * Project: https://courses.datatalks.club/de-zoomcamp-2024/project/project1 29 | * Review: https://courses.datatalks.club/de-zoomcamp-2024/project/project1/eval 30 | 31 | #### Project Attempt #2 32 | 33 | * Project: https://courses.datatalks.club/de-zoomcamp-2024/project/project2 34 | * Review: https://courses.datatalks.club/de-zoomcamp-2024/project/project2/eval 35 | 36 | > **Important**: update your "Certificate name" here: https://courses.datatalks.club/de-zoomcamp-2024/enrollment - 37 | this is what we will use when generating certificates for you. 38 | 39 | ### Evaluation criteria 40 | 41 | See [here](../../week_7_project/README.md) 42 | 43 | 44 | -------------------------------------------------------------------------------- /cohorts/2024/workshops/dlt_resources/incremental_loading.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/DataTalksClub/data-engineering-zoomcamp/76fed2419e917176369a0d3a2eeff6c936f87286/cohorts/2024/workshops/dlt_resources/incremental_loading.png -------------------------------------------------------------------------------- /cohorts/2025/02-workflow-orchestration/solution.md: -------------------------------------------------------------------------------- 1 | ## Question 1 2 | 3 | ``` 4 | Within the execution for Yellow Taxi data for the year 2020 and month 12: what is the uncompressed file size (i.e. the output file yellow_tripdata_2020-12.csv of the extract task)? 5 | ``` 6 | 7 | To get this answer, you need to go to the Outputs tab in Kestra and select the file. The size will be next to the preview and download button. 8 | Answer: `128.3 MB` 9 | 10 | ## Question 2 11 | 12 | ``` 13 | What is the rendered value of the variable file when the inputs taxi is set to green, year is set to 2020, and month is set to 04 during execution? 14 | ``` 15 | 16 | To get this answer, you can run the expression in [Debug Outputs](https://youtu.be/SPGmXSJN3VE) to see it rendered. 17 | 18 | Answer: `green_tripdata_2020-04.csv` 19 | 20 | ## Question 3 21 | 22 | ``` 23 | How many rows are there for the Yellow Taxi data for all CSV files in the year 2020? 24 | ``` 25 | 26 | Answer: `24,648,499` 27 | 28 | ## Question 4 29 | 30 | ``` 31 | How many rows are there for the Green Taxi data for all CSV files in the year 2020? 32 | ``` 33 | 34 | Answer: `1,734,051` 35 | 36 | ## Question 5 37 | 38 | ``` 39 | How many rows are there for the Yellow Taxi data for the March 2021 CSV file? 
40 | ``` 41 | 42 | Answer: `1,925,152` 43 | 44 | ## Question 6 45 | 46 | ``` 47 | How would you configure the timezone to New York in a Schedule trigger? 48 | ``` 49 | 50 | Answer: `Add a timezone property set to America/New_York in the Schedule trigger configuration` 51 | -------------------------------------------------------------------------------- /cohorts/2025/04-analytics-engineering/homework_q2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/DataTalksClub/data-engineering-zoomcamp/76fed2419e917176369a0d3a2eeff6c936f87286/cohorts/2025/04-analytics-engineering/homework_q2.png -------------------------------------------------------------------------------- /cohorts/2025/05-batch/homework.md: -------------------------------------------------------------------------------- 1 | # Module 5 Homework 2 | 3 | In this homework we'll put what we learned about Spark in practice. 4 | 5 | For this homework we will be using the Yellow 2024-10 data from the official website: 6 | 7 | ```bash 8 | wget https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2024-10.parquet 9 | ``` 10 | 11 | 12 | ## Question 1: Install Spark and PySpark 13 | 14 | - Install Spark 15 | - Run PySpark 16 | - Create a local spark session 17 | - Execute spark.version. 18 | 19 | What's the output? 20 | 21 | > [!NOTE] 22 | > To install PySpark follow this [guide](https://github.com/DataTalksClub/data-engineering-zoomcamp/blob/main/05-batch/setup/pyspark.md) 23 | 24 | 25 | ## Question 2: Yellow October 2024 26 | 27 | Read the October 2024 Yellow into a Spark Dataframe. 28 | 29 | Repartition the Dataframe to 4 partitions and save it to parquet. 30 | 31 | What is the average size of the Parquet (ending with .parquet extension) Files that were created (in MB)? Select the answer which most closely matches. 32 | 33 | - 6MB 34 | - 25MB 35 | - 75MB 36 | - 100MB 37 | 38 | 39 | ## Question 3: Count records 40 | 41 | How many taxi trips were there on the 15th of October? 42 | 43 | Consider only trips that started on the 15th of October. 44 | 45 | - 85,567 46 | - 105,567 47 | - 125,567 48 | - 145,567 49 | 50 | 51 | ## Question 4: Longest trip 52 | 53 | What is the length of the longest trip in the dataset in hours? 54 | 55 | - 122 56 | - 142 57 | - 162 58 | - 182 59 | 60 | 61 | ## Question 5: User Interface 62 | 63 | Spark’s User Interface which shows the application's dashboard runs on which local port? 64 | 65 | - 80 66 | - 443 67 | - 4040 68 | - 8080 69 | 70 | 71 | 72 | ## Question 6: Least frequent pickup location zone 73 | 74 | Load the zone lookup data into a temp view in Spark: 75 | 76 | ```bash 77 | wget https://d37ci6vzurychx.cloudfront.net/misc/taxi_zone_lookup.csv 78 | ``` 79 | 80 | Using the zone lookup data and the Yellow October 2024 data, what is the name of the LEAST frequent pickup location Zone? 
81 | 82 | - Governor's Island/Ellis Island/Liberty Island 83 | - Arden Heights 84 | - Rikers Island 85 | - Jamaica Bay 86 | 87 | 88 | ## Submitting the solutions 89 | 90 | - Form for submitting: https://courses.datatalks.club/de-zoomcamp-2025/homework/hw5 91 | - Deadline: See the website 92 | -------------------------------------------------------------------------------- /cohorts/2025/README.md: -------------------------------------------------------------------------------- 1 | ## Data Engineering Zoomcamp 2025 Cohort 2 | 3 | * [Pre-launch Q&A stream](https://www.youtube.com/watch?v=DPnAOu2csYA) 4 | * [Launch stream with course overview](https://www.youtube.com/watch?v=X8cEEwi8DTM) 5 | * [Course Google calendar](https://calendar.google.com/calendar/?cid=ZXIxcjA1M3ZlYjJpcXU0dTFmaG02MzVxMG9AZ3JvdXAuY2FsZW5kYXIuZ29vZ2xlLmNvbQ) 6 | * [FAQ](https://docs.google.com/document/d/19bnYs80DwuUimHM65UV3sylsCn2j1vziPOwzBwQrebw/edit?usp=sharing) 7 | * [Course Playlist](https://www.youtube.com/playlist?list=PL3MmuxUbc_hJed7dXYoJw8DoCuVHhGEQb) 8 | * [Cohort-specific playlist: only 2025 Live videos](https://www.youtube.com/playlist?list=PL3MmuxUbc_hJZdpLpRHp7dg6EOx828q6y) 9 | 10 | 11 | [**Module 1: Introduction & Prerequisites**](01-docker-terraform/) 12 | 13 | * [Homework](01-docker-terraform/homework.md) 14 | 15 | 16 | [**Module 2: Workflow Orchestration**](02-workflow-orchestration) 17 | 18 | * [Homework](02-workflow-orchestration/homework.md) 19 | * Office hours 20 | 21 | [**Workshop 1: Data Ingestion**](workshops/dlt/README.md) 22 | 23 | * Workshop with dlt 24 | * [Homework](workshops/dlt/README.md) 25 | 26 | 27 | [**Module 3: Data Warehouse**](03-data-warehouse) 28 | 29 | * [Homework](03-data-warehouse/homework.md) 30 | 31 | 32 | [**Module 4: Analytics Engineering**](04-analytics-engineering/) 33 | 34 | * [Homework](04-analytics-engineering/homework.md) 35 | 36 | 37 | [**Module 5: Batch processing**](05-batch/) 38 | 39 | * [Homework](05-batch/homework.md) 40 | 41 | 42 | [**Module 6: Stream Processing**](06-streaming) 43 | 44 | * [Homework](06-streaming/homework.md) 45 | 46 | 47 | [**Project**](project.md) 48 | 49 | More information [here](project.md) 50 | -------------------------------------------------------------------------------- /cohorts/2025/project.md: -------------------------------------------------------------------------------- 1 | ## Course Project 2 | 3 | The goal of this project is to apply everything we learned 4 | in this course and build an end-to-end data pipeline. 5 | 6 | You will have two attempts to submit your project. If you don't have 7 | time to submit your project by the end of attempt #1 (you started the 8 | course late, you have vacation plans, life/work got in the way, etc.) 9 | or you fail your first attempt, 10 | then you will have a second chance to submit your project as attempt 11 | #2. 12 | 13 | There are only two attempts. 14 | 15 | Remember that to pass the project, you must evaluate 3 peers. If you don't do that, 16 | your project can't be considered complete. 17 | 18 | To find the projects assigned to you, use the peer review assignments link 19 | and find your hash in the first column. You will see three rows: you need to evaluate 20 | each of these projects. For each project, you need to submit the form once, 21 | so in total, you will make three submissions. 
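The exact hashing scheme is not restated in this edition, but the 2023 version of this file (shown earlier in this repo) derives the hash as the SHA-1 of your lowercased email; assuming the same scheme still applies, a minimal sketch:

```python
from hashlib import sha1

def compute_hash(email: str) -> str:
    # SHA-1 hex digest of the lowercased email, as used in the 2023 edition of this file
    return sha1(email.lower().encode('utf-8')).hexdigest()

# 'you@example.com' is a placeholder -- use the email you registered with
print(compute_hash('you@example.com'))
```

An online SHA-1 tool works too, as the 2023 edition suggests.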
22 | 23 | 24 | ### Submitting 25 | 26 | #### Project Attempt #1 27 | 28 | * Project: https://courses.datatalks.club/de-zoomcamp-2025/project/project1 29 | * Review: https://courses.datatalks.club/de-zoomcamp-2025/project/project1/eval 30 | 31 | #### Project Attempt #2 32 | 33 | * Project: https://courses.datatalks.club/de-zoomcamp-2025/project/project2 34 | * Review: https://courses.datatalks.club/de-zoomcamp-2025/project/project2/eval 35 | 36 | > **Important**: update your "Certificate name" here: https://courses.datatalks.club/de-zoomcamp-2025/enrollment - 37 | this is what we will use when generating certificates for you. 38 | 39 | ### Evaluation criteria 40 | 41 | See [here](../../projects/README.md) 42 | -------------------------------------------------------------------------------- /cohorts/2025/workshops/dlt/img/Rest_API.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/DataTalksClub/data-engineering-zoomcamp/76fed2419e917176369a0d3a2eeff6c936f87286/cohorts/2025/workshops/dlt/img/Rest_API.png -------------------------------------------------------------------------------- /cohorts/2025/workshops/dlt/img/dlt.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/DataTalksClub/data-engineering-zoomcamp/76fed2419e917176369a0d3a2eeff6c936f87286/cohorts/2025/workshops/dlt/img/dlt.png -------------------------------------------------------------------------------- /cohorts/2025/workshops/dlt/img/pipes.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/DataTalksClub/data-engineering-zoomcamp/76fed2419e917176369a0d3a2eeff6c936f87286/cohorts/2025/workshops/dlt/img/pipes.jpg -------------------------------------------------------------------------------- /dataset.md: -------------------------------------------------------------------------------- 1 | [Medium article](https://medium.com/@NYCTLC/what-makes-a-city-street-smart-23496d92f60d) 2 | 3 | [Trip record user guide](https://www1.nyc.gov/assets/tlc/downloads/pdf/trip_record_user_guide.pdf) 4 | 5 | The data set is divided into 4 parts: 6 | 7 | - Yellow cabs 8 | - Green cabs 9 | - For Hire Vehicles 10 | - High volume for hire vehicles 11 | 12 | 13 | 14 | Below I am only concentrating on Yellow and green cabs 15 | 16 | ### Yellow and green cabs 17 | 18 | , 19 | 20 | | Columns | Definition | Example | 21 | | --------------------- | ---------- | ------------------- | 22 | | VendorID | | 2 | 23 | | lpep_pickup_datetime | | 2021-01-01 00:15:56 | 24 | | lpep_dropoff_datetime | | 2021-01-01 00:19:52 | 25 | | store_and_fwd_flag | | N, | 26 | | RatecodeID | | 1 | 27 | | PULocationID | | 43 | 28 | | DOLocationID | | 151 | 29 | | passenger_count | | 1 | 30 | | trip_distance | | 1.01 | 31 | | fare_amount | | 5.5 | 32 | | extra | | 0.5 | 33 | | mta_tax | | 0.5 | 34 | | tip_amount | | 0 | 35 | | tolls_amount | | 0 | 36 | | ehail_fee | | | 37 | | improvement_surcharge | | 0.3 | 38 | | total_amount | | 6.8 | 39 | | payment_type | | 2 | 40 | | trip_type | | 1 | 41 | | congestion_surcharge | | 0 | 42 | 43 | 44 | 45 | ### Taxi zone Loopup 46 | 47 | | Columns | Definition | Example | 48 | | ------------ | ---------- | -------------- | 49 | | LocationID | | 1 | 50 | | Borough | | EWR | 51 | | Zone | | Newark Airport | 52 | | service_zone | | EWR | 53 | 54 | [Shapefile from S3](https://s3.amazonaws.com/nyctlc/misc/taxi_zones.zip) 55 | 56 | [Taxi 
zones](https://data.cityofnewyork.us/Transportation/NYC-Taxi-Zones/d3c5-ddgc) 57 | 58 | -------------------------------------------------------------------------------- /images/architecture/arch_v3_workshops.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/DataTalksClub/data-engineering-zoomcamp/76fed2419e917176369a0d3a2eeff6c936f87286/images/architecture/arch_v3_workshops.jpg -------------------------------------------------------------------------------- /images/architecture/arch_v4_workshops.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/DataTalksClub/data-engineering-zoomcamp/76fed2419e917176369a0d3a2eeff6c936f87286/images/architecture/arch_v4_workshops.jpg -------------------------------------------------------------------------------- /images/architecture/photo1700757552.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/DataTalksClub/data-engineering-zoomcamp/76fed2419e917176369a0d3a2eeff6c936f87286/images/architecture/photo1700757552.jpeg -------------------------------------------------------------------------------- /images/aws/iam.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/DataTalksClub/data-engineering-zoomcamp/76fed2419e917176369a0d3a2eeff6c936f87286/images/aws/iam.png -------------------------------------------------------------------------------- /images/dlthub.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/DataTalksClub/data-engineering-zoomcamp/76fed2419e917176369a0d3a2eeff6c936f87286/images/dlthub.png -------------------------------------------------------------------------------- /images/mage.svg: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | -------------------------------------------------------------------------------- /images/piperider.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/DataTalksClub/data-engineering-zoomcamp/76fed2419e917176369a0d3a2eeff6c936f87286/images/piperider.png -------------------------------------------------------------------------------- /images/rising-wave.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/DataTalksClub/data-engineering-zoomcamp/76fed2419e917176369a0d3a2eeff6c936f87286/images/rising-wave.png -------------------------------------------------------------------------------- /learning-in-public.md: -------------------------------------------------------------------------------- 1 | # Learning in public 2 | 3 | Most people learn in private: they consume content but don't tell 4 | anyone about it. There's nothing wrong with it. 5 | 6 | But we want to encourage you to document your progress and 7 | share it publicly on social media. 
8 | 9 | It helps you get noticed and will lead to: 10 | 11 | * Expanding your network: meeting new people and making new friends 12 | * Being invited to meetups, conferences and podcasts 13 | * Landing a job or getting clients 14 | * Many other good things 15 | 16 | Here's a more compresensive reading on why you want to do it: https://github.com/readme/guides/publishing-your-work 17 | 18 | 19 | ## Learning in Public for Zoomcamps 20 | 21 | When you submit your homework or project, you can also submit 22 | learning in public posts: 23 | 24 | 25 | 26 | You can watch this video to see how your learning in public posts may look like: 27 | 28 | 29 | 30 | 31 | 32 | ## Daily Documentation 33 | 34 | - **Post Daily Diaries**: Document what you learn each day, including the challenges faced and the methods used to overcome them. 35 | - **Create Quick Videos**: Make short videos showcasing your work and upload them to GitHub. 36 | 37 | Send a PR if you want to suggest improvements for this document 38 | -------------------------------------------------------------------------------- /projects/datasets.md: -------------------------------------------------------------------------------- 1 | ## Datasets 2 | 3 | Here are some datasets that you could use for the project: 4 | 5 | 6 | * [Kaggle](https://www.kaggle.com/datasets) 7 | * [AWS datasets](https://registry.opendata.aws/) 8 | * [UK government open data](https://data.gov.uk/) 9 | * [Github archive](https://www.gharchive.org) 10 | * [Awesome public datasets](https://github.com/awesomedata/awesome-public-datasets) 11 | * [Million songs dataset](http://millionsongdataset.com) 12 | * [Some random datasets](https://components.one/datasets/) 13 | * [COVID Datasets](https://www.reddit.com/r/datasets/comments/n3ph2d/coronavirus_datsets/) 14 | * [Datasets from Azure](https://docs.microsoft.com/en-us/azure/azure-sql/public-data-sets) 15 | * [Datasets from BigQuery](https://cloud.google.com/bigquery/public-data/) 16 | * [Dataset search engine from Google](https://datasetsearch.research.google.com/) 17 | * [Public datasets offered by different GCP services](https://cloud.google.com/solutions/datasets) 18 | * [European statistics datasets](https://ec.europa.eu/eurostat/data/database) 19 | * [Datasets for streaming](https://github.com/ColinEberhardt/awesome-public-streaming-datasets) 20 | * [Dataset for Santander bicycle rentals in London](https://cycling.data.tfl.gov.uk/) 21 | * [Common crawl data](https://commoncrawl.org/) (copy of the internet) 22 | * [NASA's EarthData](https://search.earthdata.nasa.gov/search) (May require introductory geospatial analysis) 23 | * Collection Of Data Repositories 24 | * [part 1](https://www.kdnuggets.com/2022/04/complete-collection-data-repositories-part-1.html) (from agriculture and finance to government) 25 | * [part 2](https://www.kdnuggets.com/2022/04/complete-collection-data-repositories-part-2.html) (from healthcare to transportation) 26 | * [Data For Good by Meta](https://dataforgood.facebook.com/dfg/tools) 27 | 28 | PRs with more datasets are welcome! 29 | 30 | It's not mandatory that you use a dataset from this list. You can use any dataset you want. 31 | --------------------------------------------------------------------------------