├── .gitignore ├── 01-docker-terraform ├── 1_terraform_gcp │ ├── 1_terraform_overview.md │ ├── 2_gcp_overview.md │ ├── README.md │ ├── terraform │ │ ├── README.md │ │ ├── terraform_basic │ │ │ └── main.tf │ │ └── terraform_with_variables │ │ │ ├── main.tf │ │ │ └── variables.tf │ └── windows.md ├── 2_docker_sql │ ├── .gitignore │ ├── Dockerfile │ ├── README.md │ ├── data-loading-parquet.ipynb │ ├── data-loading-parquet.py │ ├── docker-compose.yaml │ ├── ingest_data.py │ ├── pg-test-connection.ipynb │ ├── pipeline.py │ └── upload-data.ipynb └── README.md ├── 02-workflow-orchestration ├── README.md ├── docker │ ├── combined │ │ └── docker-compose.yml │ ├── kestra │ │ └── docker-compose.yml │ └── postgres │ │ └── docker-compose.yml ├── flows │ ├── 01_getting_started_data_pipeline.yaml │ ├── 02_postgres_taxi.yaml │ ├── 02_postgres_taxi_scheduled.yaml │ ├── 03_postgres_dbt.yaml │ ├── 04_gcp_kv.yaml │ ├── 05_gcp_setup.yaml │ ├── 06_gcp_taxi.yaml │ ├── 06_gcp_taxi_scheduled.yaml │ └── 07_gcp_dbt.yaml └── images │ └── homework.png ├── 03-data-warehouse ├── README.md ├── big_query.sql ├── big_query_hw.sql ├── big_query_ml.sql ├── extract_model.md └── extras │ ├── README.md │ └── web_to_gcs.py ├── 04-analytics-engineering ├── README.md ├── SQL_refresher.md ├── dbt_cloud_setup.md ├── docker_setup │ ├── Dockerfile │ ├── README.md │ └── docker-compose.yaml └── taxi_rides_ny │ ├── .gitignore │ ├── .gitkeep │ ├── README.md │ ├── analyses │ ├── .gitkeep │ └── hack-load-data.sql │ ├── dbt_project.yml │ ├── macros │ ├── .gitkeep │ ├── get_payment_type_description.sql │ └── macros_properties.yml │ ├── models │ ├── core │ │ ├── dim_zones.sql │ │ ├── dm_monthly_zone_revenue.sql │ │ ├── fact_trips.sql │ │ └── schema.yml │ └── staging │ │ ├── schema.yml │ │ ├── stg_green_tripdata.sql │ │ └── stg_yellow_tripdata.sql │ ├── package-lock.yml │ ├── packages.yml │ ├── seeds │ ├── .gitkeep │ ├── seeds_properties.yml │ └── taxi_zone_lookup.csv │ └── snapshots │ └── .gitkeep ├── 05-batch ├── .gitignore ├── README.md ├── code │ ├── 03_test.ipynb │ ├── 04_pyspark.ipynb │ ├── 05_taxi_schema.ipynb │ ├── 06_spark_sql.ipynb │ ├── 06_spark_sql.py │ ├── 06_spark_sql_big_query.py │ ├── 07_groupby_join.ipynb │ ├── 08_rdds.ipynb │ ├── 09_spark_gcs.ipynb │ ├── cloud.md │ ├── download_data.sh │ └── homework.ipynb └── setup │ ├── config │ ├── core-site.xml │ ├── spark-defaults.conf │ └── spark.dockerfile │ ├── hadoop-yarn.md │ ├── linux.md │ ├── macos.md │ ├── pyspark.md │ └── windows.md ├── 06-streaming ├── .gitignore ├── README.md ├── java │ └── kafka_examples │ │ ├── .gitignore │ │ ├── build.gradle │ │ ├── build │ │ └── generated-main-avro-java │ │ │ └── schemaregistry │ │ │ ├── RideRecord.java │ │ │ ├── RideRecordCompatible.java │ │ │ └── RideRecordNoneCompatible.java │ │ ├── gradle │ │ └── wrapper │ │ │ ├── gradle-wrapper.jar │ │ │ └── gradle-wrapper.properties │ │ ├── gradlew │ │ ├── gradlew.bat │ │ ├── settings.gradle │ │ └── src │ │ ├── main │ │ ├── avro │ │ │ ├── rides.avsc │ │ │ ├── rides_compatible.avsc │ │ │ └── rides_non_compatible.avsc │ │ ├── java │ │ │ └── org │ │ │ │ └── example │ │ │ │ ├── AvroProducer.java │ │ │ │ ├── JsonConsumer.java │ │ │ │ ├── JsonKStream.java │ │ │ │ ├── JsonKStreamJoins.java │ │ │ │ ├── JsonKStreamWindow.java │ │ │ │ ├── JsonProducer.java │ │ │ │ ├── JsonProducerPickupLocation.java │ │ │ │ ├── Secrets.java │ │ │ │ ├── Topics.java │ │ │ │ ├── customserdes │ │ │ │ └── CustomSerdes.java │ │ │ │ └── data │ │ │ │ ├── PickupLocation.java │ │ │ │ ├── Ride.java │ │ │ │ └── VendorInfo.java │ │ └── 
resources │ │ │ └── rides.csv │ │ └── test │ │ └── java │ │ └── org │ │ └── example │ │ ├── JsonKStreamJoinsTest.java │ │ ├── JsonKStreamTest.java │ │ └── helper │ │ └── DataGeneratorHelper.java ├── ksqldb │ └── commands.md ├── pyflink │ ├── .gitignore │ ├── Dockerfile.flink │ ├── LICENSE │ ├── Makefile │ ├── README.md │ ├── docker-compose.yml │ ├── homework.md │ ├── requirements.txt │ └── src │ │ ├── job │ │ ├── aggregation_job.py │ │ ├── start_job.py │ │ └── taxi_job.py │ │ └── producers │ │ ├── load_taxi_data.py │ │ └── producer.py └── python │ ├── README.md │ ├── avro_example │ ├── consumer.py │ ├── producer.py │ ├── ride_record.py │ ├── ride_record_key.py │ └── settings.py │ ├── docker │ ├── README.md │ ├── docker-compose.yml │ ├── kafka │ │ └── docker-compose.yml │ └── spark │ │ ├── build.sh │ │ ├── cluster-base.Dockerfile │ │ ├── docker-compose.yml │ │ ├── jupyterlab.Dockerfile │ │ ├── spark-base.Dockerfile │ │ ├── spark-master.Dockerfile │ │ └── spark-worker.Dockerfile │ ├── json_example │ ├── consumer.py │ ├── producer.py │ ├── ride.py │ └── settings.py │ ├── redpanda_example │ ├── README.md │ ├── consumer.py │ ├── docker-compose.yaml │ ├── producer.py │ ├── ride.py │ └── settings.py │ ├── requirements.txt │ ├── resources │ ├── rides.csv │ └── schemas │ │ ├── taxi_ride_key.avsc │ │ └── taxi_ride_value.avsc │ └── streams-example │ ├── faust │ ├── branch_price.py │ ├── producer_taxi_json.py │ ├── stream.py │ ├── stream_count_vendor_trips.py │ ├── taxi_rides.py │ └── windowing.py │ ├── pyspark │ ├── README.md │ ├── consumer.py │ ├── producer.py │ ├── settings.py │ ├── spark-submit.sh │ ├── streaming-notebook.ipynb │ └── streaming.py │ └── redpanda │ ├── README.md │ ├── consumer.py │ ├── docker-compose.yaml │ ├── producer.py │ ├── settings.py │ ├── spark-submit.sh │ ├── streaming-notebook.ipynb │ └── streaming.py ├── README.md ├── after-sign-up.md ├── asking-questions.md ├── awesome-data-engineering.md ├── certificates.md ├── cohorts ├── 2022 │ ├── README.md │ ├── project.md │ ├── week_1_basics_n_setup │ │ └── homework.md │ ├── week_2_data_ingestion │ │ ├── README.md │ │ ├── airflow │ │ │ ├── .env_example │ │ │ ├── 1_setup_official.md │ │ │ ├── 2_setup_nofrills.md │ │ │ ├── Dockerfile │ │ │ ├── README.md │ │ │ ├── dags │ │ │ │ └── data_ingestion_gcs_dag.py │ │ │ ├── dags_local │ │ │ │ ├── data_ingestion_local.py │ │ │ │ └── ingest_script.py │ │ │ ├── docker-compose-nofrills.yml │ │ │ ├── docker-compose.yaml │ │ │ ├── docker-compose_2.3.4.yaml │ │ │ ├── docs │ │ │ │ ├── 1_concepts.md │ │ │ │ ├── arch-diag-airflow.png │ │ │ │ └── gcs_ingestion_dag.png │ │ │ ├── extras │ │ │ │ ├── data_ingestion_gcs_dag_ex2.py │ │ │ │ └── web_to_gcs.sh │ │ │ ├── requirements.txt │ │ │ └── scripts │ │ │ │ └── entrypoint.sh │ │ ├── homework │ │ │ ├── homework.md │ │ │ └── solution.py │ │ └── transfer_service │ │ │ └── README.md │ ├── week_3_data_warehouse │ │ └── airflow │ │ │ ├── .env_example │ │ │ ├── 1_setup_official.md │ │ │ ├── 2_setup_nofrills.md │ │ │ ├── README.md │ │ │ ├── dags │ │ │ └── gcs_to_bq_dag.py │ │ │ ├── docker-compose-nofrills.yml │ │ │ ├── docker-compose.yaml │ │ │ ├── docs │ │ │ ├── gcs_2_bq_dag_graph_view.png │ │ │ └── gcs_2_bq_dag_tree_view.png │ │ │ └── scripts │ │ │ └── entrypoint.sh │ ├── week_5_batch_processing │ │ └── homework.md │ └── week_6_stream_processing │ │ └── homework.md ├── 2023 │ ├── README.md │ ├── leaderboard.md │ ├── project.md │ ├── week_1_docker_sql │ │ └── homework.md │ ├── week_1_terraform │ │ └── homework.md │ ├── week_2_workflow_orchestration │ │ ├── 
README.md │ │ └── homework.md │ ├── week_3_data_warehouse │ │ └── homework.md │ ├── week_4_analytics_engineering │ │ └── homework.md │ ├── week_5_batch_processing │ │ └── homework.md │ ├── week_6_stream_processing │ │ ├── client.properties │ │ ├── homework.md │ │ ├── producer_confluent.py │ │ ├── settings.py │ │ ├── spark-submit.sh │ │ └── streaming_confluent.py │ └── workshops │ │ └── piperider.md ├── 2024 │ ├── 01-docker-terraform │ │ └── homework.md │ ├── 02-workflow-orchestration │ │ ├── README.md │ │ └── homework.md │ ├── 03-data-warehouse │ │ └── homework.md │ ├── 04-analytics-engineering │ │ └── homework.md │ ├── 05-batch │ │ └── homework.md │ ├── 06-streaming │ │ ├── docker-compose.yml │ │ └── homework.md │ ├── README.md │ ├── leaderboard.md │ ├── project.md │ └── workshops │ │ ├── dlt.md │ │ ├── dlt_resources │ │ ├── data_ingestion_workshop.md │ │ ├── homework_solution.ipynb │ │ ├── homework_starter.ipynb │ │ ├── incremental_loading.png │ │ └── workshop.ipynb │ │ └── rising-wave.md └── 2025 │ ├── 01-docker-terraform │ ├── homework.md │ └── solution.md │ ├── 02-workflow-orchestration │ ├── homework.md │ └── solution.md │ ├── 03-data-warehouse │ ├── DLT_upload_to_GCP.ipynb │ ├── homework.md │ └── load_yellow_taxi_data.py │ ├── 04-analytics-engineering │ ├── homework.md │ └── homework_q2.png │ ├── 05-batch │ ├── homework.md │ └── homework │ │ └── solution.ipynb │ ├── 06-streaming │ ├── homework.md │ └── homework │ │ └── homework.ipynb │ ├── README.md │ ├── project.md │ └── workshops │ ├── dlt │ ├── README.md │ ├── data_ingestion_workshop.md │ ├── dlt_homework.md │ └── img │ │ ├── Rest_API.png │ │ ├── dlt.png │ │ └── pipes.jpg │ └── dynamic_load_dlt.py ├── dataset.md ├── images ├── architecture │ ├── arch_v3_workshops.jpg │ ├── arch_v4_workshops.jpg │ └── photo1700757552.jpeg ├── aws │ └── iam.png ├── dlthub.png ├── kestra.svg ├── mage.svg ├── piperider.png └── rising-wave.png ├── learning-in-public.md └── projects ├── README.md └── datasets.md /.gitignore: -------------------------------------------------------------------------------- 1 | 2 | .DS_Store 3 | .idea 4 | *.tfstate 5 | *.tfstate.* 6 | **.terraform 7 | **.terraform.lock.* 8 | **google_credentials.json 9 | **logs/ 10 | **.env 11 | **__pycache__/ 12 | .history 13 | **/ny_taxi_postgres_data/* 14 | serving_dir 15 | .ipynb_checkpoints/ 16 | !week_6_stream_processing/avro_example/data/rides.csv 17 | -------------------------------------------------------------------------------- /01-docker-terraform/1_terraform_gcp/1_terraform_overview.md: -------------------------------------------------------------------------------- 1 | ## Terraform Overview 2 | 3 | [Video](https://www.youtube.com/watch?v=18jIzE41fJ4&list=PL3MmuxUbc_hJed7dXYoJw8DoCuVHhGEQb&index=2) 4 | 5 | ### Concepts 6 | 7 | #### Introduction 8 | 9 | 1. What is [Terraform](https://www.terraform.io)? 10 | * open-source tool by [HashiCorp](https://www.hashicorp.com), used for provisioning infrastructure resources 11 | * supports DevOps best practices for change management 12 | * Managing configuration files in source control to maintain an ideal provisioning state 13 | for testing and production environments 14 | 2. What is IaC? 15 | * Infrastructure-as-Code 16 | * build, change, and manage your infrastructure in a safe, consistent, and repeatable way 17 | by defining resource configurations that you can version, reuse, and share. 18 | 3. 
Some advantages 19 | * Infrastructure lifecycle management 20 | * Version control commits 21 | * Very useful for stack-based deployments, and with cloud providers such as AWS, GCP and Azure, as well as Kubernetes 22 | * State-based approach to track resource changes throughout deployments 23 | 24 | 25 | #### Files 26 | 27 | * `main.tf` 28 | * `variables.tf` 29 | * Optional: `resources.tf`, `output.tf` 30 | * `.tfstate` 31 | 32 | #### Declarations 33 | * `terraform`: configure basic Terraform settings to provision your infrastructure 34 | * `required_version`: minimum Terraform version to apply to your configuration 35 | * `backend`: stores Terraform's "state" snapshots, to map real-world resources to your configuration. 36 | * `local`: stores the state file locally as `terraform.tfstate` 37 | * `required_providers`: specifies the providers required by the current module 38 | * `provider`: 39 | * adds a set of resource types and/or data sources that Terraform can manage 40 | * The Terraform Registry is the main directory of publicly available providers from most major infrastructure platforms. 41 | * `resource`: 42 | * blocks to define components of your infrastructure 43 | * Project modules/resources: google_storage_bucket, google_bigquery_dataset, google_bigquery_table 44 | * `variable` & `locals` 45 | * runtime arguments and constants 46 | 47 | 48 | #### Execution steps 49 | 1. `terraform init`: 50 | * Initializes & configures the backend, installs plugins/providers, and checks out an existing configuration from version control 51 | 2. `terraform plan`: 52 | * Matches/previews local changes against a remote state, and proposes an Execution Plan. 53 | 3. `terraform apply`: 54 | * Asks for approval of the proposed plan, and applies the changes to the cloud 55 | 4. `terraform destroy` 56 | * Removes your stack from the cloud 57 | 58 | 59 | ### Terraform Workshop to create GCP Infra 60 | Continue [here](./terraform): `01-docker-terraform/1_terraform_gcp/terraform` 61 | 62 | 63 | ### References 64 | https://learn.hashicorp.com/collections/terraform/gcp-get-started 65 | -------------------------------------------------------------------------------- /01-docker-terraform/1_terraform_gcp/2_gcp_overview.md: -------------------------------------------------------------------------------- 1 | ## GCP Overview 2 | 3 | [Video](https://www.youtube.com/watch?v=18jIzE41fJ4&list=PL3MmuxUbc_hJed7dXYoJw8DoCuVHhGEQb&index=2) 4 | 5 | 6 | ### Project infrastructure modules in GCP: 7 | * Google Cloud Storage (GCS): Data Lake 8 | * BigQuery: Data Warehouse 9 | 10 | (Concepts explained in Week 2 - Data Ingestion) 11 | 12 | ### Initial Setup 13 | 14 | For this course, we'll use the free tier (up to EUR 300 in credits). 15 | 16 | 1. Create an account with your Google email ID 17 | 2. Set up your first [project](https://console.cloud.google.com/) if you haven't already 18 | * e.g. "DTC DE Course", and note down the "Project ID" (we'll use this later when deploying infra with TF) 19 | 3. Set up [service account & authentication](https://cloud.google.com/docs/authentication/getting-started) for this project 20 | * Grant the `Viewer` role to begin with. 21 | * Download the service-account keys (.json) for authentication. 22 | 4. Download the [SDK](https://cloud.google.com/sdk/docs/quickstart) for local setup 23 | 5. 
Set environment variable to point to your downloaded GCP keys: 24 | ```shell 25 | export GOOGLE_APPLICATION_CREDENTIALS=".json" 26 | 27 | # Refresh token/session, and verify authentication 28 | gcloud auth application-default login 29 | ``` 30 | 31 | ### Setup for Access 32 | 33 | 1. [IAM Roles](https://cloud.google.com/storage/docs/access-control/iam-roles) for Service account: 34 | * Go to the *IAM* section of *IAM & Admin* https://console.cloud.google.com/iam-admin/iam 35 | * Click the *Edit principal* icon for your service account. 36 | * Add these roles in addition to *Viewer* : **Storage Admin** + **Storage Object Admin** + **BigQuery Admin** 37 | 38 | 2. Enable these APIs for your project: 39 | * https://console.cloud.google.com/apis/library/iam.googleapis.com 40 | * https://console.cloud.google.com/apis/library/iamcredentials.googleapis.com 41 | 42 | 3. Please ensure `GOOGLE_APPLICATION_CREDENTIALS` env-var is set. 43 | ```shell 44 | export GOOGLE_APPLICATION_CREDENTIALS=".json" 45 | ``` 46 | 47 | ### Terraform Workshop to create GCP Infra 48 | Continue [here](./terraform): `week_1_basics_n_setup/1_terraform_gcp/terraform` 49 | -------------------------------------------------------------------------------- /01-docker-terraform/1_terraform_gcp/README.md: -------------------------------------------------------------------------------- 1 | ## Local Setup for Terraform and GCP 2 | 3 | ### Pre-Requisites 4 | 1. Terraform client installation: https://www.terraform.io/downloads 5 | 2. Cloud Provider account: https://console.cloud.google.com/ 6 | 7 | ### Terraform Concepts 8 | [Terraform Overview](1_terraform_overview.md) 9 | 10 | ### GCP setup 11 | 12 | 1. [Setup for First-time](2_gcp_overview.md#initial-setup) 13 | * [Only for Windows](windows.md) - Steps 4 & 5 14 | 2. [IAM / Access specific to this course](2_gcp_overview.md#setup-for-access) 15 | 16 | ### Terraform Workshop for GCP Infra 17 | Your setup is ready! 18 | Now head to the [terraform](terraform) directory, and perform the execution steps to create your infrastructure. 
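Before running the workshop, you may want to confirm that the prerequisites above are in place. The commands below are a minimal sanity check, assuming you have already downloaded a service-account key (the `.json` path is a placeholder):

```shell
# Verify that the Terraform client and the Google Cloud SDK are installed
terraform -version
gcloud --version

# Point to your service-account key (placeholder path), then refresh the auth token
export GOOGLE_APPLICATION_CREDENTIALS="<path/to/your/service-account-key>.json"
gcloud auth application-default login
```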
19 | -------------------------------------------------------------------------------- /01-docker-terraform/1_terraform_gcp/terraform/README.md: -------------------------------------------------------------------------------- 1 | ### Concepts 2 | * [Terraform_overview](../1_terraform_overview.md) 3 | 4 | ### Execution 5 | 6 | ```shell 7 | # Refresh service-account's auth-token for this session 8 | gcloud auth application-default login 9 | 10 | # Initialize state file (.tfstate) 11 | terraform init 12 | 13 | # Check changes to new infra plan 14 | terraform plan -var="project=" 15 | ``` 16 | 17 | ```shell 18 | # Create new infra 19 | terraform apply -var="project=" 20 | ``` 21 | 22 | ```shell 23 | # Delete infra after your work, to avoid costs on any running services 24 | terraform destroy 25 | ``` 26 | -------------------------------------------------------------------------------- /01-docker-terraform/1_terraform_gcp/terraform/terraform_basic/main.tf: -------------------------------------------------------------------------------- 1 | terraform { 2 | required_providers { 3 | google = { 4 | source = "hashicorp/google" 5 | version = "4.51.0" 6 | } 7 | } 8 | } 9 | 10 | provider "google" { 11 | # Credentials only needs to be set if you do not have the GOOGLE_APPLICATION_CREDENTIALS set 12 | # credentials = 13 | project = "" 14 | region = "us-central1" 15 | } 16 | 17 | 18 | 19 | resource "google_storage_bucket" "data-lake-bucket" { 20 | name = "" 21 | location = "US" 22 | 23 | # Optional, but recommended settings: 24 | storage_class = "STANDARD" 25 | uniform_bucket_level_access = true 26 | 27 | versioning { 28 | enabled = true 29 | } 30 | 31 | lifecycle_rule { 32 | action { 33 | type = "Delete" 34 | } 35 | condition { 36 | age = 30 // days 37 | } 38 | } 39 | 40 | force_destroy = true 41 | } 42 | 43 | 44 | resource "google_bigquery_dataset" "dataset" { 45 | dataset_id = "" 46 | project = "" 47 | location = "US" 48 | } -------------------------------------------------------------------------------- /01-docker-terraform/1_terraform_gcp/terraform/terraform_with_variables/main.tf: -------------------------------------------------------------------------------- 1 | terraform { 2 | required_providers { 3 | google = { 4 | source = "hashicorp/google" 5 | version = "5.6.0" 6 | } 7 | } 8 | } 9 | 10 | provider "google" { 11 | credentials = file(var.credentials) 12 | project = var.project 13 | region = var.region 14 | } 15 | 16 | 17 | resource "google_storage_bucket" "demo-bucket" { 18 | name = var.gcs_bucket_name 19 | location = var.location 20 | force_destroy = true 21 | 22 | 23 | lifecycle_rule { 24 | condition { 25 | age = 1 26 | } 27 | action { 28 | type = "AbortIncompleteMultipartUpload" 29 | } 30 | } 31 | } 32 | 33 | 34 | 35 | resource "google_bigquery_dataset" "demo_dataset" { 36 | dataset_id = var.bq_dataset_name 37 | location = var.location 38 | } -------------------------------------------------------------------------------- /01-docker-terraform/1_terraform_gcp/terraform/terraform_with_variables/variables.tf: -------------------------------------------------------------------------------- 1 | variable "credentials" { 2 | description = "My Credentials" 3 | default = "" 4 | #ex: if you have a directory where this file is called keys with your service account json file 5 | #saved there as my-creds.json you could use default = "./keys/my-creds.json" 6 | } 7 | 8 | 9 | variable "project" { 10 | description = "Project" 11 | default = "" 12 | } 13 | 14 | variable "region" { 15 | description = "Region" 16 
| #Update the below to your desired region 17 | default = "us-central1" 18 | } 19 | 20 | variable "location" { 21 | description = "Project Location" 22 | #Update the below to your desired location 23 | default = "US" 24 | } 25 | 26 | variable "bq_dataset_name" { 27 | description = "My BigQuery Dataset Name" 28 | #Update the below to what you want your dataset to be called 29 | default = "demo_dataset" 30 | } 31 | 32 | variable "gcs_bucket_name" { 33 | description = "My Storage Bucket Name" 34 | #Update the below to a unique bucket name 35 | default = "terraform-demo-terra-bucket" 36 | } 37 | 38 | variable "gcs_storage_class" { 39 | description = "Bucket Storage Class" 40 | default = "STANDARD" 41 | } -------------------------------------------------------------------------------- /01-docker-terraform/2_docker_sql/.gitignore: -------------------------------------------------------------------------------- 1 | .ipynb_checkpoints/ 2 | ny_taxi_postgres_data/ 3 | *.csv -------------------------------------------------------------------------------- /01-docker-terraform/2_docker_sql/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM python:3.9.1 2 | 3 | RUN apt-get install wget 4 | RUN pip install pandas sqlalchemy psycopg2 5 | 6 | WORKDIR /app 7 | COPY ingest_data.py ingest_data.py 8 | 9 | ENTRYPOINT [ "python", "ingest_data.py" ] -------------------------------------------------------------------------------- /01-docker-terraform/2_docker_sql/data-loading-parquet.py: -------------------------------------------------------------------------------- 1 | #Cleaned up version of data-loading.ipynb 2 | import argparse, os, sys 3 | from time import time 4 | import pandas as pd 5 | import pyarrow.parquet as pq 6 | from sqlalchemy import create_engine 7 | 8 | 9 | def main(params): 10 | user = params.user 11 | password = params.password 12 | host = params.host 13 | port = params.port 14 | db = params.db 15 | tb = params.tb 16 | url = params.url 17 | 18 | # Get the name of the file from url 19 | file_name = url.rsplit('/', 1)[-1].strip() 20 | print(f'Downloading {file_name} ...') 21 | # Download file from url 22 | os.system(f'curl {url.strip()} -o {file_name}') 23 | print('\n') 24 | 25 | # Create SQL engine 26 | engine = create_engine(f'postgresql://{user}:{password}@{host}:{port}/{db}') 27 | 28 | # Read file based on csv or parquet 29 | if '.csv' in file_name: 30 | df = pd.read_csv(file_name, nrows=10) 31 | df_iter = pd.read_csv(file_name, iterator=True, chunksize=100000) 32 | elif '.parquet' in file_name: 33 | file = pq.ParquetFile(file_name) 34 | df = next(file.iter_batches(batch_size=10)).to_pandas() 35 | df_iter = file.iter_batches(batch_size=100000) 36 | else: 37 | print('Error. Only .csv or .parquet files allowed.') 38 | sys.exit() 39 | 40 | 41 | # Create the table 42 | df.head(0).to_sql(name=tb, con=engine, if_exists='replace') 43 | 44 | 45 | # Insert values 46 | t_start = time() 47 | count = 0 48 | for batch in df_iter: 49 | count+=1 50 | 51 | if '.parquet' in file_name: 52 | batch_df = batch.to_pandas() 53 | else: 54 | batch_df = batch 55 | 56 | print(f'inserting batch {count}...') 57 | 58 | b_start = time() 59 | batch_df.to_sql(name=tb, con=engine, if_exists='append') 60 | b_end = time() 61 | 62 | print(f'inserted! time taken {b_end-b_start:10.3f} seconds.\n') 63 | 64 | t_end = time() 65 | print(f'Completed! 
Total time taken was {t_end-t_start:10.3f} seconds for {count} batches.') 66 | 67 | 68 | 69 | if __name__ == '__main__': 70 | #Parsing arguments 71 | parser = argparse.ArgumentParser(description='Loading data from .paraquet file link to a Postgres datebase.') 72 | 73 | parser.add_argument('--user', help='Username for Postgres.') 74 | parser.add_argument('--password', help='Password to the username for Postgres.') 75 | parser.add_argument('--host', help='Hostname for Postgres.') 76 | parser.add_argument('--port', help='Port for Postgres connection.') 77 | parser.add_argument('--db', help='Databse name for Postgres') 78 | parser.add_argument('--tb', help='Destination table name for Postgres.') 79 | parser.add_argument('--url', help='URL for .paraquet file.') 80 | 81 | args = parser.parse_args() 82 | main(args) 83 | 84 | 85 | 86 | 87 | -------------------------------------------------------------------------------- /01-docker-terraform/2_docker_sql/docker-compose.yaml: -------------------------------------------------------------------------------- 1 | services: 2 | pgdatabase: 3 | image: postgres:13 4 | environment: 5 | - POSTGRES_USER=root 6 | - POSTGRES_PASSWORD=root 7 | - POSTGRES_DB=ny_taxi 8 | volumes: 9 | - "./ny_taxi_postgres_data:/var/lib/postgresql/data:rw" 10 | ports: 11 | - "5432:5432" 12 | pgadmin: 13 | image: dpage/pgadmin4 14 | environment: 15 | - PGADMIN_DEFAULT_EMAIL=admin@admin.com 16 | - PGADMIN_DEFAULT_PASSWORD=root 17 | ports: 18 | - "8080:80" 19 | -------------------------------------------------------------------------------- /01-docker-terraform/2_docker_sql/ingest_data.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # coding: utf-8 3 | 4 | import os 5 | import argparse 6 | 7 | from time import time 8 | 9 | import pandas as pd 10 | from sqlalchemy import create_engine 11 | 12 | 13 | def main(params): 14 | user = params.user 15 | password = params.password 16 | host = params.host 17 | port = params.port 18 | db = params.db 19 | table_name = params.table_name 20 | url = params.url 21 | 22 | # the backup files are gzipped, and it's important to keep the correct extension 23 | # for pandas to be able to open the file 24 | if url.endswith('.csv.gz'): 25 | csv_name = 'output.csv.gz' 26 | else: 27 | csv_name = 'output.csv' 28 | 29 | os.system(f"wget {url} -O {csv_name}") 30 | 31 | engine = create_engine(f'postgresql://{user}:{password}@{host}:{port}/{db}') 32 | 33 | df_iter = pd.read_csv(csv_name, iterator=True, chunksize=100000) 34 | 35 | df = next(df_iter) 36 | 37 | df.tpep_pickup_datetime = pd.to_datetime(df.tpep_pickup_datetime) 38 | df.tpep_dropoff_datetime = pd.to_datetime(df.tpep_dropoff_datetime) 39 | 40 | df.head(n=0).to_sql(name=table_name, con=engine, if_exists='replace') 41 | 42 | df.to_sql(name=table_name, con=engine, if_exists='append') 43 | 44 | 45 | while True: 46 | 47 | try: 48 | t_start = time() 49 | 50 | df = next(df_iter) 51 | 52 | df.tpep_pickup_datetime = pd.to_datetime(df.tpep_pickup_datetime) 53 | df.tpep_dropoff_datetime = pd.to_datetime(df.tpep_dropoff_datetime) 54 | 55 | df.to_sql(name=table_name, con=engine, if_exists='append') 56 | 57 | t_end = time() 58 | 59 | print('inserted another chunk, took %.3f second' % (t_end - t_start)) 60 | 61 | except StopIteration: 62 | print("Finished ingesting data into the postgres database") 63 | break 64 | 65 | if __name__ == '__main__': 66 | parser = argparse.ArgumentParser(description='Ingest CSV data to Postgres') 67 | 68 | 
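    # Example invocation (a sketch; the values below are placeholders that match the
    # docker-compose.yaml in this folder, so adjust them to your own setup):
    #
    #   python ingest_data.py \
    #     --user=root --password=root --host=localhost --port=5432 \
    #     --db=ny_taxi --table_name=yellow_taxi_trips \
    #     --url=<URL of a yellow taxi CSV or .csv.gz file>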
parser.add_argument('--user', required=True, help='user name for postgres') 69 | parser.add_argument('--password', required=True, help='password for postgres') 70 | parser.add_argument('--host', required=True, help='host for postgres') 71 | parser.add_argument('--port', required=True, help='port for postgres') 72 | parser.add_argument('--db', required=True, help='database name for postgres') 73 | parser.add_argument('--table_name', required=True, help='name of the table where we will write the results to') 74 | parser.add_argument('--url', required=True, help='url of the csv file') 75 | 76 | args = parser.parse_args() 77 | 78 | main(args) 79 | -------------------------------------------------------------------------------- /01-docker-terraform/2_docker_sql/pipeline.py: -------------------------------------------------------------------------------- 1 | import sys 2 | 3 | import pandas as pd 4 | 5 | print(sys.argv) 6 | 7 | day = sys.argv[1] 8 | 9 | # some fancy stuff with pandas 10 | 11 | print(f'job finished successfully for day = {day}') -------------------------------------------------------------------------------- /02-workflow-orchestration/docker/combined/docker-compose.yml: -------------------------------------------------------------------------------- 1 | volumes: 2 | postgres-data: 3 | driver: local 4 | kestra-data: 5 | driver: local 6 | zoomcamp-data: 7 | driver: local 8 | 9 | services: 10 | postgres: 11 | image: postgres 12 | volumes: 13 | - postgres-data:/var/lib/postgresql/data 14 | environment: 15 | POSTGRES_DB: kestra 16 | POSTGRES_USER: kestra 17 | POSTGRES_PASSWORD: k3str4 18 | healthcheck: 19 | test: ["CMD-SHELL", "pg_isready -d $${POSTGRES_DB} -U $${POSTGRES_USER}"] 20 | interval: 30s 21 | timeout: 10s 22 | retries: 10 23 | 24 | kestra: 25 | image: kestra/kestra:v0.20.7 26 | pull_policy: always 27 | # Note that this setup with a root user is intended for development purpose. 
28 | # Our base image runs without root, but the Docker Compose implementation needs root to access the Docker socket 29 | # To run Kestra in a rootless mode in production, see: https://kestra.io/docs/installation/podman-compose 30 | user: "root" 31 | command: server standalone 32 | volumes: 33 | - kestra-data:/app/storage 34 | - /var/run/docker.sock:/var/run/docker.sock 35 | - /tmp/kestra-wd:/tmp/kestra-wd 36 | environment: 37 | KESTRA_CONFIGURATION: | 38 | datasources: 39 | postgres: 40 | url: jdbc:postgresql://postgres:5432/kestra 41 | driverClassName: org.postgresql.Driver 42 | username: kestra 43 | password: k3str4 44 | kestra: 45 | server: 46 | basicAuth: 47 | enabled: false 48 | username: "admin@kestra.io" # it must be a valid email address 49 | password: kestra 50 | repository: 51 | type: postgres 52 | storage: 53 | type: local 54 | local: 55 | basePath: "/app/storage" 56 | queue: 57 | type: postgres 58 | tasks: 59 | tmpDir: 60 | path: /tmp/kestra-wd/tmp 61 | url: http://localhost:8080/ 62 | ports: 63 | - "8080:8080" 64 | - "8081:8081" 65 | depends_on: 66 | postgres: 67 | condition: service_started 68 | 69 | postgres_zoomcamp: 70 | image: postgres 71 | environment: 72 | POSTGRES_USER: kestra 73 | POSTGRES_PASSWORD: k3str4 74 | POSTGRES_DB: postgres-zoomcamp 75 | ports: 76 | - "5432:5432" 77 | volumes: 78 | - zoomcamp-data:/var/lib/postgresql/data 79 | depends_on: 80 | kestra: 81 | condition: service_started 82 | 83 | pgadmin: 84 | image: dpage/pgadmin4 85 | environment: 86 | - PGADMIN_DEFAULT_EMAIL=admin@admin.com 87 | - PGADMIN_DEFAULT_PASSWORD=root 88 | ports: 89 | - "8085:80" 90 | depends_on: 91 | postgres_zoomcamp: 92 | condition: service_started 93 | -------------------------------------------------------------------------------- /02-workflow-orchestration/docker/kestra/docker-compose.yml: -------------------------------------------------------------------------------- 1 | volumes: 2 | postgres-data: 3 | driver: local 4 | kestra-data: 5 | driver: local 6 | 7 | services: 8 | postgres: 9 | image: postgres 10 | volumes: 11 | - postgres-data:/var/lib/postgresql/data 12 | environment: 13 | POSTGRES_DB: kestra 14 | POSTGRES_USER: kestra 15 | POSTGRES_PASSWORD: k3str4 16 | healthcheck: 17 | test: ["CMD-SHELL", "pg_isready -d $${POSTGRES_DB} -U $${POSTGRES_USER}"] 18 | interval: 30s 19 | timeout: 10s 20 | retries: 10 21 | 22 | kestra: 23 | image: kestra/kestra:v0.20.7 24 | pull_policy: always 25 | user: "root" 26 | command: server standalone 27 | volumes: 28 | - kestra-data:/app/storage 29 | - /var/run/docker.sock:/var/run/docker.sock 30 | - /tmp/kestra-wd:/tmp/kestra-wd 31 | environment: 32 | KESTRA_CONFIGURATION: | 33 | datasources: 34 | postgres: 35 | url: jdbc:postgresql://postgres:5432/kestra 36 | driverClassName: org.postgresql.Driver 37 | username: kestra 38 | password: k3str4 39 | kestra: 40 | server: 41 | basicAuth: 42 | enabled: false 43 | username: "admin@kestra.io" # it must be a valid email address 44 | password: kestra 45 | repository: 46 | type: postgres 47 | storage: 48 | type: local 49 | local: 50 | basePath: "/app/storage" 51 | queue: 52 | type: postgres 53 | tasks: 54 | tmpDir: 55 | path: /tmp/kestra-wd/tmp 56 | url: http://localhost:8080/ 57 | ports: 58 | - "8080:8080" 59 | - "8081:8081" 60 | depends_on: 61 | postgres: 62 | condition: service_started 63 | -------------------------------------------------------------------------------- /02-workflow-orchestration/docker/postgres/docker-compose.yml: 
-------------------------------------------------------------------------------- 1 | version: "3.8" 2 | services: 3 | postgres: 4 | image: postgres 5 | container_name: postgres-db 6 | environment: 7 | POSTGRES_USER: kestra 8 | POSTGRES_PASSWORD: k3str4 9 | POSTGRES_DB: postgres-zoomcamp 10 | ports: 11 | - "5432:5432" 12 | volumes: 13 | - postgres-data:/var/lib/postgresql/data 14 | volumes: 15 | postgres-data: -------------------------------------------------------------------------------- /02-workflow-orchestration/flows/01_getting_started_data_pipeline.yaml: -------------------------------------------------------------------------------- 1 | id: 01_getting_started_data_pipeline 2 | namespace: zoomcamp 3 | 4 | inputs: 5 | - id: columns_to_keep 6 | type: ARRAY 7 | itemType: STRING 8 | defaults: 9 | - brand 10 | - price 11 | 12 | tasks: 13 | - id: extract 14 | type: io.kestra.plugin.core.http.Download 15 | uri: https://dummyjson.com/products 16 | 17 | - id: transform 18 | type: io.kestra.plugin.scripts.python.Script 19 | containerImage: python:3.11-alpine 20 | inputFiles: 21 | data.json: "{{outputs.extract.uri}}" 22 | outputFiles: 23 | - "*.json" 24 | env: 25 | COLUMNS_TO_KEEP: "{{inputs.columns_to_keep}}" 26 | script: | 27 | import json 28 | import os 29 | 30 | columns_to_keep_str = os.getenv("COLUMNS_TO_KEEP") 31 | columns_to_keep = json.loads(columns_to_keep_str) 32 | 33 | with open("data.json", "r") as file: 34 | data = json.load(file) 35 | 36 | filtered_data = [ 37 | {column: product.get(column, "N/A") for column in columns_to_keep} 38 | for product in data["products"] 39 | ] 40 | 41 | with open("products.json", "w") as file: 42 | json.dump(filtered_data, file, indent=4) 43 | 44 | - id: query 45 | type: io.kestra.plugin.jdbc.duckdb.Query 46 | inputFiles: 47 | products.json: "{{outputs.transform.outputFiles['products.json']}}" 48 | sql: | 49 | INSTALL json; 50 | LOAD json; 51 | SELECT brand, round(avg(price), 2) as avg_price 52 | FROM read_json_auto('{{workingDir}}/products.json') 53 | GROUP BY brand 54 | ORDER BY avg_price DESC; 55 | fetchType: STORE 56 | -------------------------------------------------------------------------------- /02-workflow-orchestration/flows/03_postgres_dbt.yaml: -------------------------------------------------------------------------------- 1 | id: 03_postgres_dbt 2 | namespace: zoomcamp 3 | inputs: 4 | - id: dbt_command 5 | type: SELECT 6 | allowCustomValue: true 7 | defaults: dbt build 8 | values: 9 | - dbt build 10 | - dbt debug # use when running the first time to validate DB connection 11 | tasks: 12 | - id: sync 13 | type: io.kestra.plugin.git.SyncNamespaceFiles 14 | url: https://github.com/DataTalksClub/data-engineering-zoomcamp 15 | branch: main 16 | namespace: "{{ flow.namespace }}" 17 | gitDirectory: 04-analytics-engineering/taxi_rides_ny 18 | dryRun: false 19 | # disabled: true # this Git Sync is needed only when running it the first time, afterwards the task can be disabled 20 | 21 | - id: dbt-build 22 | type: io.kestra.plugin.dbt.cli.DbtCLI 23 | env: 24 | DBT_DATABASE: postgres-zoomcamp 25 | DBT_SCHEMA: public 26 | namespaceFiles: 27 | enabled: true 28 | containerImage: ghcr.io/kestra-io/dbt-postgres:latest 29 | taskRunner: 30 | type: io.kestra.plugin.scripts.runner.docker.Docker 31 | networkMode: host 32 | commands: 33 | - dbt deps 34 | - "{{ inputs.dbt_command }}" 35 | storeManifest: 36 | key: manifest.json 37 | namespace: "{{ flow.namespace }}" 38 | profiles: | 39 | default: 40 | outputs: 41 | dev: 42 | type: postgres 43 | host: 
host.docker.internal 44 | user: kestra 45 | password: k3str4 46 | port: 5432 47 | dbname: postgres-zoomcamp 48 | schema: public 49 | threads: 8 50 | connect_timeout: 10 51 | priority: interactive 52 | target: dev 53 | description: | 54 | Note that you need to adjust the models/staging/schema.yml file to match your database and schema. Select and edit that Namespace File from the UI. Save and run this flow. Once https://github.com/DataTalksClub/data-engineering-zoomcamp/pull/565/files is merged, you can ignore this note as it will be dynamically adjusted based on env variables. 55 | ```yaml 56 | sources: 57 | - name: staging 58 | database: postgres-zoomcamp 59 | schema: public 60 | ``` 61 | -------------------------------------------------------------------------------- /02-workflow-orchestration/flows/04_gcp_kv.yaml: -------------------------------------------------------------------------------- 1 | id: 04_gcp_kv 2 | namespace: zoomcamp 3 | 4 | tasks: 5 | - id: gcp_project_id 6 | type: io.kestra.plugin.core.kv.Set 7 | key: GCP_PROJECT_ID 8 | kvType: STRING 9 | value: kestra-sandbox # TODO replace with your project id 10 | 11 | - id: gcp_location 12 | type: io.kestra.plugin.core.kv.Set 13 | key: GCP_LOCATION 14 | kvType: STRING 15 | value: europe-west2 16 | 17 | - id: gcp_bucket_name 18 | type: io.kestra.plugin.core.kv.Set 19 | key: GCP_BUCKET_NAME 20 | kvType: STRING 21 | value: your-name-kestra # TODO make sure it's globally unique! 22 | 23 | - id: gcp_dataset 24 | type: io.kestra.plugin.core.kv.Set 25 | key: GCP_DATASET 26 | kvType: STRING 27 | value: zoomcamp 28 | -------------------------------------------------------------------------------- /02-workflow-orchestration/flows/05_gcp_setup.yaml: -------------------------------------------------------------------------------- 1 | id: 05_gcp_setup 2 | namespace: zoomcamp 3 | 4 | tasks: 5 | - id: create_gcs_bucket 6 | type: io.kestra.plugin.gcp.gcs.CreateBucket 7 | ifExists: SKIP 8 | storageClass: REGIONAL 9 | name: "{{kv('GCP_BUCKET_NAME')}}" # make sure it's globally unique! 
10 | 11 | - id: create_bq_dataset 12 | type: io.kestra.plugin.gcp.bigquery.CreateDataset 13 | name: "{{kv('GCP_DATASET')}}" 14 | ifExists: SKIP 15 | 16 | pluginDefaults: 17 | - type: io.kestra.plugin.gcp 18 | values: 19 | serviceAccount: "{{kv('GCP_CREDS')}}" 20 | projectId: "{{kv('GCP_PROJECT_ID')}}" 21 | location: "{{kv('GCP_LOCATION')}}" 22 | bucket: "{{kv('GCP_BUCKET_NAME')}}" 23 | -------------------------------------------------------------------------------- /02-workflow-orchestration/flows/07_gcp_dbt.yaml: -------------------------------------------------------------------------------- 1 | id: 07_gcp_dbt 2 | namespace: zoomcamp 3 | inputs: 4 | - id: dbt_command 5 | type: SELECT 6 | allowCustomValue: true 7 | defaults: dbt build 8 | values: 9 | - dbt build 10 | - dbt debug # use when running the first time to validate DB connection 11 | 12 | tasks: 13 | - id: sync 14 | type: io.kestra.plugin.git.SyncNamespaceFiles 15 | url: https://github.com/DataTalksClub/data-engineering-zoomcamp 16 | branch: main 17 | namespace: "{{flow.namespace}}" 18 | gitDirectory: 04-analytics-engineering/taxi_rides_ny 19 | dryRun: false 20 | # disabled: true # this Git Sync is needed only when running it the first time, afterwards the task can be disabled 21 | 22 | - id: dbt-build 23 | type: io.kestra.plugin.dbt.cli.DbtCLI 24 | env: 25 | DBT_DATABASE: "{{kv('GCP_PROJECT_ID')}}" 26 | DBT_SCHEMA: "{{kv('GCP_DATASET')}}" 27 | namespaceFiles: 28 | enabled: true 29 | containerImage: ghcr.io/kestra-io/dbt-bigquery:latest 30 | taskRunner: 31 | type: io.kestra.plugin.scripts.runner.docker.Docker 32 | inputFiles: 33 | sa.json: "{{kv('GCP_CREDS')}}" 34 | commands: 35 | - dbt deps 36 | - "{{ inputs.dbt_command }}" 37 | storeManifest: 38 | key: manifest.json 39 | namespace: "{{ flow.namespace }}" 40 | profiles: | 41 | default: 42 | outputs: 43 | dev: 44 | type: bigquery 45 | dataset: "{{kv('GCP_DATASET')}}" 46 | project: "{{kv('GCP_PROJECT_ID')}}" 47 | location: "{{kv('GCP_LOCATION')}}" 48 | keyfile: sa.json 49 | method: service-account 50 | priority: interactive 51 | threads: 16 52 | timeout_seconds: 300 53 | fixed_retries: 1 54 | target: dev 55 | description: | 56 | Note that you need to adjust the models/staging/schema.yml file to match your database and schema. Select and edit that Namespace File from the UI. Save and run this flow. Once https://github.com/DataTalksClub/data-engineering-zoomcamp/pull/565/files is merged, you can ignore this note as it will be dynamically adjusted based on env variables. 
57 | ```yaml 58 | sources: 59 | - name: staging 60 | database: kestra-sandbox 61 | schema: zoomcamp 62 | ``` 63 | -------------------------------------------------------------------------------- /02-workflow-orchestration/images/homework.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/DataTalksClub/data-engineering-zoomcamp/76fed2419e917176369a0d3a2eeff6c936f87286/02-workflow-orchestration/images/homework.png -------------------------------------------------------------------------------- /03-data-warehouse/big_query.sql: -------------------------------------------------------------------------------- 1 | -- Query public available table 2 | SELECT station_id, name FROM 3 | bigquery-public-data.new_york_citibike.citibike_stations 4 | LIMIT 100; 5 | 6 | 7 | -- Creating external table referring to gcs path 8 | CREATE OR REPLACE EXTERNAL TABLE `taxi-rides-ny.nytaxi.external_yellow_tripdata` 9 | OPTIONS ( 10 | format = 'CSV', 11 | uris = ['gs://nyc-tl-data/trip data/yellow_tripdata_2019-*.csv', 'gs://nyc-tl-data/trip data/yellow_tripdata_2020-*.csv'] 12 | ); 13 | 14 | -- Check yello trip data 15 | SELECT * FROM taxi-rides-ny.nytaxi.external_yellow_tripdata limit 10; 16 | 17 | -- Create a non partitioned table from external table 18 | CREATE OR REPLACE TABLE taxi-rides-ny.nytaxi.yellow_tripdata_non_partitioned AS 19 | SELECT * FROM taxi-rides-ny.nytaxi.external_yellow_tripdata; 20 | 21 | 22 | -- Create a partitioned table from external table 23 | CREATE OR REPLACE TABLE taxi-rides-ny.nytaxi.yellow_tripdata_partitioned 24 | PARTITION BY 25 | DATE(tpep_pickup_datetime) AS 26 | SELECT * FROM taxi-rides-ny.nytaxi.external_yellow_tripdata; 27 | 28 | -- Impact of partition 29 | -- Scanning 1.6GB of data 30 | SELECT DISTINCT(VendorID) 31 | FROM taxi-rides-ny.nytaxi.yellow_tripdata_non_partitioned 32 | WHERE DATE(tpep_pickup_datetime) BETWEEN '2019-06-01' AND '2019-06-30'; 33 | 34 | -- Scanning ~106 MB of DATA 35 | SELECT DISTINCT(VendorID) 36 | FROM taxi-rides-ny.nytaxi.yellow_tripdata_partitioned 37 | WHERE DATE(tpep_pickup_datetime) BETWEEN '2019-06-01' AND '2019-06-30'; 38 | 39 | -- Let's look into the partitions 40 | SELECT table_name, partition_id, total_rows 41 | FROM `nytaxi.INFORMATION_SCHEMA.PARTITIONS` 42 | WHERE table_name = 'yellow_tripdata_partitioned' 43 | ORDER BY total_rows DESC; 44 | 45 | -- Creating a partition and cluster table 46 | CREATE OR REPLACE TABLE taxi-rides-ny.nytaxi.yellow_tripdata_partitioned_clustered 47 | PARTITION BY DATE(tpep_pickup_datetime) 48 | CLUSTER BY VendorID AS 49 | SELECT * FROM taxi-rides-ny.nytaxi.external_yellow_tripdata; 50 | 51 | -- Query scans 1.1 GB 52 | SELECT count(*) as trips 53 | FROM taxi-rides-ny.nytaxi.yellow_tripdata_partitioned 54 | WHERE DATE(tpep_pickup_datetime) BETWEEN '2019-06-01' AND '2020-12-31' 55 | AND VendorID=1; 56 | 57 | -- Query scans 864.5 MB 58 | SELECT count(*) as trips 59 | FROM taxi-rides-ny.nytaxi.yellow_tripdata_partitioned_clustered 60 | WHERE DATE(tpep_pickup_datetime) BETWEEN '2019-06-01' AND '2020-12-31' 61 | AND VendorID=1; 62 | 63 | -------------------------------------------------------------------------------- /03-data-warehouse/big_query_hw.sql: -------------------------------------------------------------------------------- 1 | CREATE OR REPLACE EXTERNAL TABLE `taxi-rides-ny.nytaxi.fhv_tripdata` 2 | OPTIONS ( 3 | format = 'CSV', 4 | uris = ['gs://nyc-tl-data/trip data/fhv_tripdata_2019-*.csv'] 5 | ); 6 | 7 | 8 | SELECT count(*) FROM 
`taxi-rides-ny.nytaxi.fhv_tripdata`; 9 | 10 | 11 | SELECT COUNT(DISTINCT(dispatching_base_num)) FROM `taxi-rides-ny.nytaxi.fhv_tripdata`; 12 | 13 | 14 | CREATE OR REPLACE TABLE `taxi-rides-ny.nytaxi.fhv_nonpartitioned_tripdata` 15 | AS SELECT * FROM `taxi-rides-ny.nytaxi.fhv_tripdata`; 16 | 17 | CREATE OR REPLACE TABLE `taxi-rides-ny.nytaxi.fhv_partitioned_tripdata` 18 | PARTITION BY DATE(dropoff_datetime) 19 | CLUSTER BY dispatching_base_num AS ( 20 | SELECT * FROM `taxi-rides-ny.nytaxi.fhv_tripdata` 21 | ); 22 | 23 | SELECT count(*) FROM `taxi-rides-ny.nytaxi.fhv_nonpartitioned_tripdata` 24 | WHERE DATE(dropoff_datetime) BETWEEN '2019-01-01' AND '2019-03-31' 25 | AND dispatching_base_num IN ('B00987', 'B02279', 'B02060'); 26 | 27 | 28 | SELECT count(*) FROM `taxi-rides-ny.nytaxi.fhv_partitioned_tripdata` 29 | WHERE DATE(dropoff_datetime) BETWEEN '2019-01-01' AND '2019-03-31' 30 | AND dispatching_base_num IN ('B00987', 'B02279', 'B02060'); 31 | -------------------------------------------------------------------------------- /03-data-warehouse/big_query_ml.sql: -------------------------------------------------------------------------------- 1 | -- SELECT THE COLUMNS INTERESTED FOR YOU 2 | SELECT passenger_count, trip_distance, PULocationID, DOLocationID, payment_type, fare_amount, tolls_amount, tip_amount 3 | FROM `taxi-rides-ny.nytaxi.yellow_tripdata_partitioned` WHERE fare_amount != 0; 4 | 5 | -- CREATE A ML TABLE WITH APPROPRIATE TYPE 6 | CREATE OR REPLACE TABLE `taxi-rides-ny.nytaxi.yellow_tripdata_ml` ( 7 | `passenger_count` INTEGER, 8 | `trip_distance` FLOAT64, 9 | `PULocationID` STRING, 10 | `DOLocationID` STRING, 11 | `payment_type` STRING, 12 | `fare_amount` FLOAT64, 13 | `tolls_amount` FLOAT64, 14 | `tip_amount` FLOAT64 15 | ) AS ( 16 | SELECT passenger_count, trip_distance, cast(PULocationID AS STRING), CAST(DOLocationID AS STRING), 17 | CAST(payment_type AS STRING), fare_amount, tolls_amount, tip_amount 18 | FROM `taxi-rides-ny.nytaxi.yellow_tripdata_partitioned` WHERE fare_amount != 0 19 | ); 20 | 21 | -- CREATE MODEL WITH DEFAULT SETTING 22 | CREATE OR REPLACE MODEL `taxi-rides-ny.nytaxi.tip_model` 23 | OPTIONS 24 | (model_type='linear_reg', 25 | input_label_cols=['tip_amount'], 26 | DATA_SPLIT_METHOD='AUTO_SPLIT') AS 27 | SELECT 28 | * 29 | FROM 30 | `taxi-rides-ny.nytaxi.yellow_tripdata_ml` 31 | WHERE 32 | tip_amount IS NOT NULL; 33 | 34 | -- CHECK FEATURES 35 | SELECT * FROM ML.FEATURE_INFO(MODEL `taxi-rides-ny.nytaxi.tip_model`); 36 | 37 | -- EVALUATE THE MODEL 38 | SELECT 39 | * 40 | FROM 41 | ML.EVALUATE(MODEL `taxi-rides-ny.nytaxi.tip_model`, 42 | ( 43 | SELECT 44 | * 45 | FROM 46 | `taxi-rides-ny.nytaxi.yellow_tripdata_ml` 47 | WHERE 48 | tip_amount IS NOT NULL 49 | )); 50 | 51 | -- PREDICT THE MODEL 52 | SELECT 53 | * 54 | FROM 55 | ML.PREDICT(MODEL `taxi-rides-ny.nytaxi.tip_model`, 56 | ( 57 | SELECT 58 | * 59 | FROM 60 | `taxi-rides-ny.nytaxi.yellow_tripdata_ml` 61 | WHERE 62 | tip_amount IS NOT NULL 63 | )); 64 | 65 | -- PREDICT AND EXPLAIN 66 | SELECT 67 | * 68 | FROM 69 | ML.EXPLAIN_PREDICT(MODEL `taxi-rides-ny.nytaxi.tip_model`, 70 | ( 71 | SELECT 72 | * 73 | FROM 74 | `taxi-rides-ny.nytaxi.yellow_tripdata_ml` 75 | WHERE 76 | tip_amount IS NOT NULL 77 | ), STRUCT(3 as top_k_features)); 78 | 79 | -- HYPER PARAM TUNNING 80 | CREATE OR REPLACE MODEL `taxi-rides-ny.nytaxi.tip_hyperparam_model` 81 | OPTIONS 82 | (model_type='linear_reg', 83 | input_label_cols=['tip_amount'], 84 | DATA_SPLIT_METHOD='AUTO_SPLIT', 85 | num_trials=5, 86 | max_parallel_trials=2, 87 | 
l1_reg=hparam_range(0, 20), 88 | l2_reg=hparam_candidates([0, 0.1, 1, 10])) AS 89 | SELECT 90 | * 91 | FROM 92 | `taxi-rides-ny.nytaxi.yellow_tripdata_ml` 93 | WHERE 94 | tip_amount IS NOT NULL; 95 | 96 | -------------------------------------------------------------------------------- /03-data-warehouse/extract_model.md: -------------------------------------------------------------------------------- 1 | ## Model deployment 2 | [Tutorial](https://cloud.google.com/bigquery-ml/docs/export-model-tutorial) 3 | ### Steps 4 | - gcloud auth login 5 | - bq --project_id taxi-rides-ny extract -m nytaxi.tip_model gs://taxi_ml_model/tip_model 6 | - mkdir /tmp/model 7 | - gsutil cp -r gs://taxi_ml_model/tip_model /tmp/model 8 | - mkdir -p serving_dir/tip_model/1 9 | - cp -r /tmp/model/tip_model/* serving_dir/tip_model/1 10 | - docker pull tensorflow/serving 11 | - docker run -p 8501:8501 --mount type=bind,source=`pwd`/serving_dir/tip_model,target= 12 | /models/tip_model -e MODEL_NAME=tip_model -t tensorflow/serving & 13 | - curl -d '{"instances": [{"passenger_count":1, "trip_distance":12.2, "PULocationID":"193", "DOLocationID":"264", "payment_type":"2","fare_amount":20.4,"tolls_amount":0.0}]}' -X POST http://localhost:8501/v1/models/tip_model:predict 14 | - http://localhost:8501/v1/models/tip_model -------------------------------------------------------------------------------- /03-data-warehouse/extras/README.md: -------------------------------------------------------------------------------- 1 | Quick hack to load files directly to GCS, without Airflow. Downloads csv files from https://nyc-tlc.s3.amazonaws.com/trip+data/ and uploads them to your Cloud Storage Account as parquet files. 2 | 3 | 1. Install pre-reqs (more info in `web_to_gcs.py` script) 4 | 2. Run: `python web_to_gcs.py` 5 | -------------------------------------------------------------------------------- /03-data-warehouse/extras/web_to_gcs.py: -------------------------------------------------------------------------------- 1 | import io 2 | import os 3 | import requests 4 | import pandas as pd 5 | from google.cloud import storage 6 | 7 | """ 8 | Pre-reqs: 9 | 1. `pip install pandas pyarrow google-cloud-storage` 10 | 2. Set GOOGLE_APPLICATION_CREDENTIALS to your project/service-account key 11 | 3. Set GCP_GCS_BUCKET as your bucket or change default value of BUCKET 12 | """ 13 | 14 | # services = ['fhv','green','yellow'] 15 | init_url = 'https://github.com/DataTalksClub/nyc-tlc-data/releases/download/' 16 | # switch out the bucketname 17 | BUCKET = os.environ.get("GCP_GCS_BUCKET", "dtc-data-lake-bucketname") 18 | 19 | 20 | def upload_to_gcs(bucket, object_name, local_file): 21 | """ 22 | Ref: https://cloud.google.com/storage/docs/uploading-objects#storage-upload-object-python 23 | """ 24 | # # WORKAROUND to prevent timeout for files > 6 MB on 800 kbps upload speed. 
25 | # # (Ref: https://github.com/googleapis/python-storage/issues/74) 26 | # storage.blob._MAX_MULTIPART_SIZE = 5 * 1024 * 1024 # 5 MB 27 | # storage.blob._DEFAULT_CHUNKSIZE = 5 * 1024 * 1024 # 5 MB 28 | 29 | client = storage.Client() 30 | bucket = client.bucket(bucket) 31 | blob = bucket.blob(object_name) 32 | blob.upload_from_filename(local_file) 33 | 34 | 35 | def web_to_gcs(year, service): 36 | for i in range(12): 37 | 38 | # sets the month part of the file_name string 39 | month = '0'+str(i+1) 40 | month = month[-2:] 41 | 42 | # csv file_name 43 | file_name = f"{service}_tripdata_{year}-{month}.csv.gz" 44 | 45 | # download it using requests via a pandas df 46 | request_url = f"{init_url}{service}/{file_name}" 47 | r = requests.get(request_url) 48 | open(file_name, 'wb').write(r.content) 49 | print(f"Local: {file_name}") 50 | 51 | # read it back into a parquet file 52 | df = pd.read_csv(file_name, compression='gzip') 53 | file_name = file_name.replace('.csv.gz', '.parquet') 54 | df.to_parquet(file_name, engine='pyarrow') 55 | print(f"Parquet: {file_name}") 56 | 57 | # upload it to gcs 58 | upload_to_gcs(BUCKET, f"{service}/{file_name}", file_name) 59 | print(f"GCS: {service}/{file_name}") 60 | 61 | 62 | web_to_gcs('2019', 'green') 63 | web_to_gcs('2020', 'green') 64 | # web_to_gcs('2019', 'yellow') 65 | # web_to_gcs('2020', 'yellow') 66 | 67 | -------------------------------------------------------------------------------- /04-analytics-engineering/docker_setup/docker-compose.yaml: -------------------------------------------------------------------------------- 1 | version: '3' 2 | services: 3 | dbt-bq-dtc: 4 | build: 5 | context: . 6 | target: dbt-bigquery 7 | image: dbt/bigquery 8 | volumes: 9 | - .:/usr/app 10 | - ~/.dbt/:/root/.dbt/ 11 | - ~/.google/credentials/google_credentials.json:/.google/credentials/google_credentials.json 12 | network_mode: host -------------------------------------------------------------------------------- /04-analytics-engineering/taxi_rides_ny/.gitignore: -------------------------------------------------------------------------------- 1 | # you shouldn't commit these into source control 2 | # these are the default directory names, adjust/add to fit your needs 3 | target/ 4 | dbt_packages/ 5 | logs/ 6 | -------------------------------------------------------------------------------- /04-analytics-engineering/taxi_rides_ny/.gitkeep: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/DataTalksClub/data-engineering-zoomcamp/76fed2419e917176369a0d3a2eeff6c936f87286/04-analytics-engineering/taxi_rides_ny/.gitkeep -------------------------------------------------------------------------------- /04-analytics-engineering/taxi_rides_ny/README.md: -------------------------------------------------------------------------------- 1 | Welcome to your new dbt project! 2 | 3 | ### How to run this project 4 | ### About the project 5 | This project is based in [dbt starter project](https://github.com/dbt-labs/dbt-starter-project) (generated by running `dbt init`) 6 | Try running the following commands: 7 | - dbt run 8 | - dbt test 9 | 10 | A project includes the following files: 11 | - dbt_project.yml: file used to configure the dbt project. 
If you are using dbt locally, make sure the profile here matches the one setup during installation in ~/.dbt/profiles.yml 12 | - *.yml files under folders models, data, macros: documentation files 13 | - csv files in the data folder: these will be our sources, files described above 14 | - Files inside folder models: The sql files contain the scripts to run our models, this will cover staging, core and a datamarts models. At the end, these models will follow this structure: 15 | 16 | ![image](https://user-images.githubusercontent.com/4315804/152691312-e71b56a4-53ff-4884-859c-c9090dbd0db8.png) 17 | 18 | 19 | #### Workflow 20 | ![image](https://user-images.githubusercontent.com/4315804/148699280-964c4e0b-e685-4c0f-a266-4f3e097156c9.png) 21 | 22 | #### Execution 23 | After having installed the required tools and cloning this repo, execute the following commands: 24 | 25 | 1. Change into the project's directory from the command line: `$ cd [..]/taxi_rides_ny` 26 | 2. Load the CSVs into the database. This materializes the CSVs as tables in your target schema: `$ dbt seed` 27 | 3. Run the models: `$ dbt run` 28 | 4. Test your data: `$ dbt test` 29 | _Alternative: use `$ dbt build` to execute with one command the 3 steps above together_ 30 | 5. Generate documentation for the project: `$ dbt docs generate` 31 | 6. View the documentation for the project, this step should open the documentation page on a webserver, but it can also be accessed from http://localhost:8080 : `$ dbt docs serve` 32 | 33 | ### dbt resources: 34 | - Learn more about dbt [in the docs](https://docs.getdbt.com/docs/introduction) 35 | - Check out [Discourse](https://discourse.getdbt.com/) for commonly asked questions and answers 36 | - Join the [chat](http://slack.getdbt.com/) on Slack for live discussions and support 37 | - Find [dbt events](https://events.getdbt.com) near you 38 | - Check out [the blog](https://blog.getdbt.com/) for the latest news on dbt's development and best practices -------------------------------------------------------------------------------- /04-analytics-engineering/taxi_rides_ny/analyses/.gitkeep: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/DataTalksClub/data-engineering-zoomcamp/76fed2419e917176369a0d3a2eeff6c936f87286/04-analytics-engineering/taxi_rides_ny/analyses/.gitkeep -------------------------------------------------------------------------------- /04-analytics-engineering/taxi_rides_ny/analyses/hack-load-data.sql: -------------------------------------------------------------------------------- 1 | -- MAKE SURE YOU REPLACE taxi-rides-ny-339813-412521 WITH THE NAME OF YOUR DATASET! 2 | -- When you run the query, only run 5 of the ALTER TABLE statements at one time (by highlighting only 5). 3 | -- Otherwise BigQuery will say too many alterations to the table are being made. 
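-- Optional sanity check (not part of the original script; `<your-project>` is a placeholder):
-- after the CREATE TABLE / INSERT statements below finish, verify the row counts, e.g.
-- SELECT 'green' AS service, COUNT(*) AS row_count FROM `<your-project>.trips_data_all.green_tripdata`
-- UNION ALL
-- SELECT 'yellow', COUNT(*) FROM `<your-project>.trips_data_all.yellow_tripdata`;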
4 | 5 | CREATE TABLE `taxi-rides-ny-339813-412521.trips_data_all.green_tripdata` as 6 | SELECT * FROM `bigquery-public-data.new_york_taxi_trips.tlc_green_trips_2019`; 7 | 8 | 9 | CREATE TABLE `taxi-rides-ny-339813-412521.trips_data_all.yellow_tripdata` as 10 | SELECT * FROM `bigquery-public-data.new_york_taxi_trips.tlc_yellow_trips_2019`; 11 | 12 | insert into `taxi-rides-ny-339813-412521.trips_data_all.green_tripdata` 13 | SELECT * FROM `bigquery-public-data.new_york_taxi_trips.tlc_green_trips_2020` ; 14 | 15 | 16 | insert into `taxi-rides-ny-339813-412521.trips_data_all.yellow_tripdata` 17 | SELECT * FROM `bigquery-public-data.new_york_taxi_trips.tlc_yellow_trips_2020`; 18 | 19 | -- Fixes yellow table schema 20 | ALTER TABLE `taxi-rides-ny-339813-412521.trips_data_all.yellow_tripdata` 21 | RENAME COLUMN vendor_id TO VendorID; 22 | ALTER TABLE `taxi-rides-ny-339813-412521.trips_data_all.yellow_tripdata` 23 | RENAME COLUMN pickup_datetime TO tpep_pickup_datetime; 24 | ALTER TABLE `taxi-rides-ny-339813-412521.trips_data_all.yellow_tripdata` 25 | RENAME COLUMN dropoff_datetime TO tpep_dropoff_datetime; 26 | ALTER TABLE `taxi-rides-ny-339813-412521.trips_data_all.yellow_tripdata` 27 | RENAME COLUMN rate_code TO RatecodeID; 28 | ALTER TABLE `taxi-rides-ny-339813-412521.trips_data_all.yellow_tripdata` 29 | RENAME COLUMN imp_surcharge TO improvement_surcharge; 30 | ALTER TABLE `taxi-rides-ny-339813-412521.trips_data_all.yellow_tripdata` 31 | RENAME COLUMN pickup_location_id TO PULocationID; 32 | ALTER TABLE `taxi-rides-ny-339813-412521.trips_data_all.yellow_tripdata` 33 | RENAME COLUMN dropoff_location_id TO DOLocationID; 34 | 35 | -- Fixes green table schema 36 | ALTER TABLE `taxi-rides-ny-339813-412521.trips_data_all.green_tripdata` 37 | RENAME COLUMN vendor_id TO VendorID; 38 | ALTER TABLE `taxi-rides-ny-339813-412521.trips_data_all.green_tripdata` 39 | RENAME COLUMN pickup_datetime TO lpep_pickup_datetime; 40 | ALTER TABLE `taxi-rides-ny-339813-412521.trips_data_all.green_tripdata` 41 | RENAME COLUMN dropoff_datetime TO lpep_dropoff_datetime; 42 | ALTER TABLE `taxi-rides-ny-339813-412521.trips_data_all.green_tripdata` 43 | RENAME COLUMN rate_code TO RatecodeID; 44 | ALTER TABLE `taxi-rides-ny-339813-412521.trips_data_all.green_tripdata` 45 | RENAME COLUMN imp_surcharge TO improvement_surcharge; 46 | ALTER TABLE `taxi-rides-ny-339813-412521.trips_data_all.green_tripdata` 47 | RENAME COLUMN pickup_location_id TO PULocationID; 48 | ALTER TABLE `taxi-rides-ny-339813-412521.trips_data_all.green_tripdata` 49 | RENAME COLUMN dropoff_location_id TO DOLocationID; 50 | -------------------------------------------------------------------------------- /04-analytics-engineering/taxi_rides_ny/dbt_project.yml: -------------------------------------------------------------------------------- 1 | 2 | # Name your project! Project names should contain only lowercase characters 3 | # and underscores. A good package name should reflect your organization's 4 | # name or the intended use of these models 5 | name: 'taxi_rides_ny' 6 | version: '1.0.0' 7 | config-version: 2 8 | 9 | # This setting configures which "profile" dbt uses for this project. 10 | profile: 'default' 11 | 12 | # These configurations specify where dbt should look for different types of files. 13 | # The `model-paths` config, for example, states that models in this project can be 14 | # found in the "models/" directory. You probably won't need to change these! 
15 | model-paths: ["models"] 16 | analysis-paths: ["analyses"] 17 | test-paths: ["tests"] 18 | seed-paths: ["seeds"] 19 | macro-paths: ["macros"] 20 | snapshot-paths: ["snapshots"] 21 | 22 | target-path: "target" # directory which will store compiled SQL files 23 | clean-targets: # directories to be removed by `dbt clean` 24 | - "target" 25 | - "dbt_packages" 26 | 27 | 28 | # Configuring models 29 | # Full documentation: https://docs.getdbt.com/docs/configuring-models 30 | 31 | # In dbt, the default materialization for a model is a view. This means, when you run 32 | # dbt run or dbt build, all of your models will be built as a view in your data platform. 33 | # The configuration below will override this setting for models in the example folder to 34 | # instead be materialized as tables. Any models you add to the root of the models folder will 35 | # continue to be built as views. These settings can be overridden in the individual model files 36 | # using the `{{ config(...) }}` macro. 37 | 38 | models: 39 | taxi_rides_ny: 40 | # Applies to all files under models/.../ 41 | staging: 42 | materialized: view 43 | core: 44 | materialized: table 45 | vars: 46 | payment_type_values: [1, 2, 3, 4, 5, 6] 47 | 48 | seeds: 49 | taxi_rides_ny: 50 | taxi_zone_lookup: 51 | +column_types: 52 | locationid: numeric -------------------------------------------------------------------------------- /04-analytics-engineering/taxi_rides_ny/macros/.gitkeep: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/DataTalksClub/data-engineering-zoomcamp/76fed2419e917176369a0d3a2eeff6c936f87286/04-analytics-engineering/taxi_rides_ny/macros/.gitkeep -------------------------------------------------------------------------------- /04-analytics-engineering/taxi_rides_ny/macros/get_payment_type_description.sql: -------------------------------------------------------------------------------- 1 | {# 2 | This macro returns the description of the payment_type 3 | #} 4 | 5 | {% macro get_payment_type_description(payment_type) -%} 6 | 7 | case {{ dbt.safe_cast("payment_type", api.Column.translate_type("integer")) }} 8 | when 1 then 'Credit card' 9 | when 2 then 'Cash' 10 | when 3 then 'No charge' 11 | when 4 then 'Dispute' 12 | when 5 then 'Unknown' 13 | when 6 then 'Voided trip' 14 | else 'EMPTY' 15 | end 16 | 17 | {%- endmacro %} -------------------------------------------------------------------------------- /04-analytics-engineering/taxi_rides_ny/macros/macros_properties.yml: -------------------------------------------------------------------------------- 1 | version: 2 2 | 3 | macros: 4 | - name: get_payment_type_description 5 | description: > 6 | This macro receives a payment_type and returns the corresponding description. 7 | arguments: 8 | - name: payment_type 9 | type: int 10 | description: > 11 | payment_type value. 
12 | Must be one of the accepted values, otherwise the macro will return null -------------------------------------------------------------------------------- /04-analytics-engineering/taxi_rides_ny/models/core/dim_zones.sql: -------------------------------------------------------------------------------- 1 | {{ config(materialized='table') }} 2 | 3 | select 4 | locationid, 5 | borough, 6 | zone, 7 | replace(service_zone,'Boro','Green') as service_zone 8 | from {{ ref('taxi_zone_lookup') }} -------------------------------------------------------------------------------- /04-analytics-engineering/taxi_rides_ny/models/core/dm_monthly_zone_revenue.sql: -------------------------------------------------------------------------------- 1 | {{ config(materialized='table') }} 2 | 3 | with trips_data as ( 4 | select * from {{ ref('fact_trips') }} 5 | ) 6 | select 7 | -- Revenue grouping 8 | pickup_zone as revenue_zone, 9 | {{ dbt.date_trunc("month", "pickup_datetime") }} as revenue_month, 10 | 11 | service_type, 12 | 13 | -- Revenue calculation 14 | sum(fare_amount) as revenue_monthly_fare, 15 | sum(extra) as revenue_monthly_extra, 16 | sum(mta_tax) as revenue_monthly_mta_tax, 17 | sum(tip_amount) as revenue_monthly_tip_amount, 18 | sum(tolls_amount) as revenue_monthly_tolls_amount, 19 | sum(ehail_fee) as revenue_monthly_ehail_fee, 20 | sum(improvement_surcharge) as revenue_monthly_improvement_surcharge, 21 | sum(total_amount) as revenue_monthly_total_amount, 22 | 23 | -- Additional calculations 24 | count(tripid) as total_monthly_trips, 25 | avg(passenger_count) as avg_monthly_passenger_count, 26 | avg(trip_distance) as avg_monthly_trip_distance 27 | 28 | from trips_data 29 | group by 1,2,3 -------------------------------------------------------------------------------- /04-analytics-engineering/taxi_rides_ny/models/core/fact_trips.sql: -------------------------------------------------------------------------------- 1 | {{ 2 | config( 3 | materialized='table' 4 | ) 5 | }} 6 | 7 | with green_tripdata as ( 8 | select *, 9 | 'Green' as service_type 10 | from {{ ref('stg_green_tripdata') }} 11 | ), 12 | yellow_tripdata as ( 13 | select *, 14 | 'Yellow' as service_type 15 | from {{ ref('stg_yellow_tripdata') }} 16 | ), 17 | trips_unioned as ( 18 | select * from green_tripdata 19 | union all 20 | select * from yellow_tripdata 21 | ), 22 | dim_zones as ( 23 | select * from {{ ref('dim_zones') }} 24 | where borough != 'Unknown' 25 | ) 26 | select trips_unioned.tripid, 27 | trips_unioned.vendorid, 28 | trips_unioned.service_type, 29 | trips_unioned.ratecodeid, 30 | trips_unioned.pickup_locationid, 31 | pickup_zone.borough as pickup_borough, 32 | pickup_zone.zone as pickup_zone, 33 | trips_unioned.dropoff_locationid, 34 | dropoff_zone.borough as dropoff_borough, 35 | dropoff_zone.zone as dropoff_zone, 36 | trips_unioned.pickup_datetime, 37 | trips_unioned.dropoff_datetime, 38 | trips_unioned.store_and_fwd_flag, 39 | trips_unioned.passenger_count, 40 | trips_unioned.trip_distance, 41 | trips_unioned.trip_type, 42 | trips_unioned.fare_amount, 43 | trips_unioned.extra, 44 | trips_unioned.mta_tax, 45 | trips_unioned.tip_amount, 46 | trips_unioned.tolls_amount, 47 | trips_unioned.ehail_fee, 48 | trips_unioned.improvement_surcharge, 49 | trips_unioned.total_amount, 50 | trips_unioned.payment_type, 51 | trips_unioned.payment_type_description 52 | from trips_unioned 53 | inner join dim_zones as pickup_zone 54 | on trips_unioned.pickup_locationid = pickup_zone.locationid 55 | inner join dim_zones as dropoff_zone 56 | 
on trips_unioned.dropoff_locationid = dropoff_zone.locationid -------------------------------------------------------------------------------- /04-analytics-engineering/taxi_rides_ny/models/staging/stg_green_tripdata.sql: -------------------------------------------------------------------------------- 1 | {{ 2 | config( 3 | materialized='view' 4 | ) 5 | }} 6 | 7 | with tripdata as 8 | ( 9 | select *, 10 | row_number() over(partition by vendorid, lpep_pickup_datetime) as rn 11 | from {{ source('staging','green_tripdata') }} 12 | where vendorid is not null 13 | ) 14 | select 15 | -- identifiers 16 | {{ dbt_utils.generate_surrogate_key(['vendorid', 'lpep_pickup_datetime']) }} as tripid, 17 | {{ dbt.safe_cast("vendorid", api.Column.translate_type("integer")) }} as vendorid, 18 | {{ dbt.safe_cast("ratecodeid", api.Column.translate_type("integer")) }} as ratecodeid, 19 | {{ dbt.safe_cast("pulocationid", api.Column.translate_type("integer")) }} as pickup_locationid, 20 | {{ dbt.safe_cast("dolocationid", api.Column.translate_type("integer")) }} as dropoff_locationid, 21 | 22 | -- timestamps 23 | cast(lpep_pickup_datetime as timestamp) as pickup_datetime, 24 | cast(lpep_dropoff_datetime as timestamp) as dropoff_datetime, 25 | 26 | -- trip info 27 | store_and_fwd_flag, 28 | {{ dbt.safe_cast("passenger_count", api.Column.translate_type("integer")) }} as passenger_count, 29 | cast(trip_distance as numeric) as trip_distance, 30 | {{ dbt.safe_cast("trip_type", api.Column.translate_type("integer")) }} as trip_type, 31 | 32 | -- payment info 33 | cast(fare_amount as numeric) as fare_amount, 34 | cast(extra as numeric) as extra, 35 | cast(mta_tax as numeric) as mta_tax, 36 | cast(tip_amount as numeric) as tip_amount, 37 | cast(tolls_amount as numeric) as tolls_amount, 38 | cast(ehail_fee as numeric) as ehail_fee, 39 | cast(improvement_surcharge as numeric) as improvement_surcharge, 40 | cast(total_amount as numeric) as total_amount, 41 | coalesce({{ dbt.safe_cast("payment_type", api.Column.translate_type("integer")) }},0) as payment_type, 42 | {{ get_payment_type_description("payment_type") }} as payment_type_description 43 | from tripdata 44 | where rn = 1 45 | 46 | 47 | -- dbt build --select --vars '{'is_test_run': 'false'}' 48 | {% if var('is_test_run', default=true) %} 49 | 50 | limit 100 51 | 52 | {% endif %} -------------------------------------------------------------------------------- /04-analytics-engineering/taxi_rides_ny/models/staging/stg_yellow_tripdata.sql: -------------------------------------------------------------------------------- 1 | {{ config(materialized='view') }} 2 | 3 | with tripdata as 4 | ( 5 | select *, 6 | row_number() over(partition by vendorid, tpep_pickup_datetime) as rn 7 | from {{ source('staging','yellow_tripdata') }} 8 | where vendorid is not null 9 | ) 10 | select 11 | -- identifiers 12 | {{ dbt_utils.generate_surrogate_key(['vendorid', 'tpep_pickup_datetime']) }} as tripid, 13 | {{ dbt.safe_cast("vendorid", api.Column.translate_type("integer")) }} as vendorid, 14 | {{ dbt.safe_cast("ratecodeid", api.Column.translate_type("integer")) }} as ratecodeid, 15 | {{ dbt.safe_cast("pulocationid", api.Column.translate_type("integer")) }} as pickup_locationid, 16 | {{ dbt.safe_cast("dolocationid", api.Column.translate_type("integer")) }} as dropoff_locationid, 17 | 18 | -- timestamps 19 | cast(tpep_pickup_datetime as timestamp) as pickup_datetime, 20 | cast(tpep_dropoff_datetime as timestamp) as dropoff_datetime, 21 | 22 | -- trip info 23 | store_and_fwd_flag, 24 | {{ 
dbt.safe_cast("passenger_count", api.Column.translate_type("integer")) }} as passenger_count, 25 | cast(trip_distance as numeric) as trip_distance, 26 | -- yellow cabs are always street-hail 27 | 1 as trip_type, 28 | 29 | -- payment info 30 | cast(fare_amount as numeric) as fare_amount, 31 | cast(extra as numeric) as extra, 32 | cast(mta_tax as numeric) as mta_tax, 33 | cast(tip_amount as numeric) as tip_amount, 34 | cast(tolls_amount as numeric) as tolls_amount, 35 | cast(0 as numeric) as ehail_fee, 36 | cast(improvement_surcharge as numeric) as improvement_surcharge, 37 | cast(total_amount as numeric) as total_amount, 38 | coalesce({{ dbt.safe_cast("payment_type", api.Column.translate_type("integer")) }},0) as payment_type, 39 | {{ get_payment_type_description('payment_type') }} as payment_type_description 40 | from tripdata 41 | where rn = 1 42 | 43 | -- dbt build --select --vars '{'is_test_run: false}' 44 | {% if var('is_test_run', default=true) %} 45 | 46 | limit 100 47 | 48 | {% endif %} -------------------------------------------------------------------------------- /04-analytics-engineering/taxi_rides_ny/package-lock.yml: -------------------------------------------------------------------------------- 1 | packages: 2 | - package: dbt-labs/dbt_utils 3 | version: 1.1.1 4 | - package: dbt-labs/codegen 5 | version: 0.12.1 6 | sha1_hash: d974113b0f072cce35300077208f38581075ab40 7 | -------------------------------------------------------------------------------- /04-analytics-engineering/taxi_rides_ny/packages.yml: -------------------------------------------------------------------------------- 1 | packages: 2 | - package: dbt-labs/dbt_utils 3 | version: 1.1.1 4 | - package: dbt-labs/codegen 5 | version: 0.12.1 -------------------------------------------------------------------------------- /04-analytics-engineering/taxi_rides_ny/seeds/.gitkeep: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/DataTalksClub/data-engineering-zoomcamp/76fed2419e917176369a0d3a2eeff6c936f87286/04-analytics-engineering/taxi_rides_ny/seeds/.gitkeep -------------------------------------------------------------------------------- /04-analytics-engineering/taxi_rides_ny/seeds/seeds_properties.yml: -------------------------------------------------------------------------------- 1 | version: 2 2 | 3 | seeds: 4 | - name: taxi_zone_lookup 5 | description: > 6 | Taxi Zones roughly based on NYC Department of City Planning's Neighborhood 7 | Tabulation Areas (NTAs) and are meant to approximate neighborhoods, so you can see which 8 | neighborhood a passenger was picked up in, and which neighborhood they were dropped off in. 
9 | Includes associated service_zone (EWR, Boro Zone, Yellow Zone) -------------------------------------------------------------------------------- /04-analytics-engineering/taxi_rides_ny/snapshots/.gitkeep: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/DataTalksClub/data-engineering-zoomcamp/76fed2419e917176369a0d3a2eeff6c936f87286/04-analytics-engineering/taxi_rides_ny/snapshots/.gitkeep -------------------------------------------------------------------------------- /05-batch/.gitignore: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/DataTalksClub/data-engineering-zoomcamp/76fed2419e917176369a0d3a2eeff6c936f87286/05-batch/.gitignore -------------------------------------------------------------------------------- /05-batch/code/download_data.sh: -------------------------------------------------------------------------------- 1 | 2 | set -e 3 | 4 | TAXI_TYPE=$1 # "yellow" 5 | YEAR=$2 # 2020 6 | 7 | URL_PREFIX="https://github.com/DataTalksClub/nyc-tlc-data/releases/download" 8 | 9 | for MONTH in {1..12}; do 10 | FMONTH=`printf "%02d" ${MONTH}` 11 | 12 | URL="${URL_PREFIX}/${TAXI_TYPE}/${TAXI_TYPE}_tripdata_${YEAR}-${FMONTH}.csv.gz" 13 | 14 | LOCAL_PREFIX="data/raw/${TAXI_TYPE}/${YEAR}/${FMONTH}" 15 | LOCAL_FILE="${TAXI_TYPE}_tripdata_${YEAR}_${FMONTH}.csv.gz" 16 | LOCAL_PATH="${LOCAL_PREFIX}/${LOCAL_FILE}" 17 | 18 | echo "downloading ${URL} to ${LOCAL_PATH}" 19 | mkdir -p ${LOCAL_PREFIX} 20 | wget ${URL} -O ${LOCAL_PATH} 21 | 22 | done 23 | -------------------------------------------------------------------------------- /05-batch/setup/config/core-site.xml: -------------------------------------------------------------------------------- 1 | <?xml version="1.0" encoding="UTF-8"?> 2 | <?xml-stylesheet type="text/xsl" href="configuration.xsl"?> 3 | 4 | <configuration> 5 | <property> 6 | <name>fs.AbstractFileSystem.gs.impl</name> 7 | <value>com.google.cloud.hadoop.fs.gcs.GoogleHadoopFS</value> 8 | </property> 9 | <property> 10 | <name>fs.gs.impl</name> 11 | <value>com.google.cloud.hadoop.fs.gcs.GoogleHadoopFileSystem</value> 12 | </property> 13 | <property> 14 | <name>fs.gs.auth.service.account.json.keyfile</name> 15 | <value>/home/alexey/.google/credentials/google_credentials.json</value> 16 | </property> 17 | <property> 18 | <name>fs.gs.auth.service.account.enable</name> 19 | <value>true</value> 20 | </property> 21 | </configuration> -------------------------------------------------------------------------------- /05-batch/setup/config/spark-defaults.conf: -------------------------------------------------------------------------------- 1 | spark.master yarn 2 | spark.hadoop.google.cloud.auth.service.account.enable true 3 | spark.hadoop.google.cloud.auth.service.account.json.keyfile /home/alexey 4 | -------------------------------------------------------------------------------- /05-batch/setup/config/spark.dockerfile: -------------------------------------------------------------------------------- 1 | FROM library/openjdk:11 -------------------------------------------------------------------------------- /05-batch/setup/linux.md: -------------------------------------------------------------------------------- 1 | 2 | ## Linux 3 | 4 | Here we'll show you how to install Spark 3.3.2 for Linux. 5 | We tested it on Ubuntu 20.04 (also WSL), but it should work 6 | for other Linux distros as well 7 | 8 | 9 | ### Installing Java 10 | 11 | Download OpenJDK 11 or Oracle JDK 11 (it's important that the version is 11 - Spark requires 8 or 11) 12 | 13 | We'll use [OpenJDK](https://jdk.java.net/archive/) 14 | 15 | Download it (e.g.
to `~/spark`): 16 | 17 | ``` 18 | wget https://download.java.net/java/GA/jdk11/9/GPL/openjdk-11.0.2_linux-x64_bin.tar.gz 19 | ``` 20 | 21 | Unpack it: 22 | 23 | ```bash 24 | tar xzfv openjdk-11.0.2_linux-x64_bin.tar.gz 25 | ``` 26 | 27 | define `JAVA_HOME` and add it to `PATH`: 28 | 29 | ```bash 30 | export JAVA_HOME="${HOME}/spark/jdk-11.0.2" 31 | export PATH="${JAVA_HOME}/bin:${PATH}" 32 | ``` 33 | 34 | check that it works: 35 | 36 | ```bash 37 | java --version 38 | ``` 39 | 40 | Output: 41 | 42 | ``` 43 | openjdk 11.0.2 2019-01-15 44 | OpenJDK Runtime Environment 18.9 (build 11.0.2+9) 45 | OpenJDK 64-Bit Server VM 18.9 (build 11.0.2+9, mixed mode) 46 | ``` 47 | 48 | Remove the archive: 49 | 50 | ```bash 51 | rm openjdk-11.0.2_linux-x64_bin.tar.gz 52 | ``` 53 | 54 | ### Installing Spark 55 | 56 | 57 | Download Spark. Use 3.3.2 version: 58 | 59 | ```bash 60 | wget https://archive.apache.org/dist/spark/spark-3.3.2/spark-3.3.2-bin-hadoop3.tgz 61 | ``` 62 | 63 | Unpack: 64 | 65 | ```bash 66 | tar xzfv spark-3.3.2-bin-hadoop3.tgz 67 | ``` 68 | 69 | Remove the archive: 70 | 71 | ```bash 72 | rm spark-3.3.2-bin-hadoop3.tgz 73 | ``` 74 | 75 | Add it to `PATH`: 76 | 77 | ```bash 78 | export SPARK_HOME="${HOME}/spark/spark-3.3.2-bin-hadoop3" 79 | export PATH="${SPARK_HOME}/bin:${PATH}" 80 | ``` 81 | 82 | ### Testing Spark 83 | 84 | Execute `spark-shell` and run the following: 85 | 86 | ```scala 87 | val data = 1 to 10000 88 | val distData = sc.parallelize(data) 89 | distData.filter(_ < 10).collect() 90 | ``` 91 | 92 | ### PySpark 93 | 94 | It's the same for all platforms. Go to [pyspark.md](pyspark.md). 95 | -------------------------------------------------------------------------------- /05-batch/setup/pyspark.md: -------------------------------------------------------------------------------- 1 | 2 | ## PySpark 3 | 4 | This document assumes you already have python. 5 | 6 | To run PySpark, we first need to add it to `PYTHONPATH`: 7 | 8 | ```bash 9 | export PYTHONPATH="${SPARK_HOME}/python/:$PYTHONPATH" 10 | export PYTHONPATH="${SPARK_HOME}/python/lib/py4j-0.10.9.5-src.zip:$PYTHONPATH" 11 | ``` 12 | 13 | Make sure that the version under `${SPARK_HOME}/python/lib/` matches the filename of py4j or you will 14 | encounter `ModuleNotFoundError: No module named 'py4j'` while executing `import pyspark`. 15 | 16 | For example, if the file under `${SPARK_HOME}/python/lib/` is `py4j-0.10.9.3-src.zip`, then the 17 | `export PYTHONPATH` statement above should be changed to 18 | 19 | ```bash 20 | export PYTHONPATH="${SPARK_HOME}/python/lib/py4j-0.10.9.3-src.zip:$PYTHONPATH" 21 | ``` 22 | 23 | On Windows, you may have to do path conversion from unix-style to windows-style: 24 | 25 | ```bash 26 | SPARK_WIN=`cygpath -w ${SPARK_HOME}` 27 | 28 | export PYTHONPATH="${SPARK_WIN}\\python\\" 29 | export PYTHONPATH="${SPARK_WIN}\\python\\lib\\py4j-0.10.9-src.zip;$PYTHONPATH" 30 | ``` 31 | 32 | Now you can run Jupyter or IPython to test if things work. Go to some other directory, e.g. `~/tmp`. 
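Before running the test below, it can help to confirm which py4j version your Spark distribution actually ships with (a quick check, assuming `SPARK_HOME` is already exported as described above):

```bash
ls "${SPARK_HOME}/python/lib/" | grep py4j
```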
33 | 34 | Download a CSV file that we'll use for testing: 35 | 36 | ```bash 37 | wget https://d37ci6vzurychx.cloudfront.net/misc/taxi_zone_lookup.csv 38 | ``` 39 | 40 | Now let's run `ipython` (or `jupyter notebook`) and execute: 41 | 42 | ```python 43 | import pyspark 44 | from pyspark.sql import SparkSession 45 | 46 | spark = SparkSession.builder \ 47 | .master("local[*]") \ 48 | .appName('test') \ 49 | .getOrCreate() 50 | 51 | df = spark.read \ 52 | .option("header", "true") \ 53 | .csv('taxi_zone_lookup.csv') 54 | 55 | df.show() 56 | ``` 57 | 58 | Test that writing works as well: 59 | 60 | ```python 61 | df.write.parquet('zones') 62 | ``` 63 | -------------------------------------------------------------------------------- /06-streaming/.gitignore: -------------------------------------------------------------------------------- 1 | week6_venv -------------------------------------------------------------------------------- /06-streaming/java/kafka_examples/.gitignore: -------------------------------------------------------------------------------- 1 | .gradle 2 | bin 3 | !src/main/resources/rides.csv 4 | 5 | build/classes 6 | build/generated 7 | build/libs 8 | build/reports 9 | build/resources 10 | build/test-results 11 | build/tmp 12 | -------------------------------------------------------------------------------- /06-streaming/java/kafka_examples/build.gradle: -------------------------------------------------------------------------------- 1 | plugins { 2 | id 'java' 3 | id "com.github.davidmc24.gradle.plugin.avro" version "1.5.0" 4 | } 5 | 6 | 7 | group 'org.example' 8 | version '1.0-SNAPSHOT' 9 | 10 | repositories { 11 | mavenCentral() 12 | maven { 13 | url "https://packages.confluent.io/maven" 14 | } 15 | } 16 | 17 | dependencies { 18 | implementation 'org.apache.kafka:kafka-clients:3.3.1' 19 | implementation 'com.opencsv:opencsv:5.7.1' 20 | implementation 'io.confluent:kafka-json-serializer:7.3.1' 21 | implementation 'org.apache.kafka:kafka-streams:3.3.1' 22 | implementation 'io.confluent:kafka-avro-serializer:7.3.1' 23 | implementation 'io.confluent:kafka-schema-registry-client:7.3.1' 24 | implementation 'io.confluent:kafka-streams-avro-serde:7.3.1' 25 | implementation "org.apache.avro:avro:1.11.0" 26 | testImplementation 'org.junit.jupiter:junit-jupiter-api:5.8.1' 27 | testRuntimeOnly 'org.junit.jupiter:junit-jupiter-engine:5.8.1' 28 | testImplementation 'org.apache.kafka:kafka-streams-test-utils:3.3.1' 29 | } 30 | 31 | sourceSets.main.java.srcDirs = ['build/generated-main-avro-java','src/main/java'] 32 | 33 | test { 34 | useJUnitPlatform() 35 | } 36 | 37 | -------------------------------------------------------------------------------- /06-streaming/java/kafka_examples/gradle/wrapper/gradle-wrapper.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/DataTalksClub/data-engineering-zoomcamp/76fed2419e917176369a0d3a2eeff6c936f87286/06-streaming/java/kafka_examples/gradle/wrapper/gradle-wrapper.jar -------------------------------------------------------------------------------- /06-streaming/java/kafka_examples/gradle/wrapper/gradle-wrapper.properties: -------------------------------------------------------------------------------- 1 | distributionBase=GRADLE_USER_HOME 2 | distributionPath=wrapper/dists 3 | distributionUrl=https\://services.gradle.org/distributions/gradle-7.5.1-bin.zip 4 | zipStoreBase=GRADLE_USER_HOME 5 | zipStorePath=wrapper/dists 6 | 
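With the wrapper configured as above, the examples can be built without a local Gradle installation. A typical invocation might look like the following (standard `java` plugin tasks; the Avro classes should be generated automatically as part of the build):

```bash
./gradlew build   # compiles the sources and generates the Avro classes under build/generated-main-avro-java
./gradlew test    # runs the Kafka Streams topology tests
```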
-------------------------------------------------------------------------------- /06-streaming/java/kafka_examples/settings.gradle: -------------------------------------------------------------------------------- 1 | pluginManagement { 2 | repositories { 3 | gradlePluginPortal() 4 | mavenCentral() 5 | } 6 | } 7 | rootProject.name = 'kafka_examples' -------------------------------------------------------------------------------- /06-streaming/java/kafka_examples/src/main/avro/rides.avsc: -------------------------------------------------------------------------------- 1 | { 2 | "type": "record", 3 | "name":"RideRecord", 4 | "namespace": "schemaregistry", 5 | "fields":[ 6 | {"name":"vendor_id","type":"string"}, 7 | {"name":"passenger_count","type":"int"}, 8 | {"name":"trip_distance","type":"double"} 9 | ] 10 | } -------------------------------------------------------------------------------- /06-streaming/java/kafka_examples/src/main/avro/rides_compatible.avsc: -------------------------------------------------------------------------------- 1 | { 2 | "type": "record", 3 | "name":"RideRecordCompatible", 4 | "namespace": "schemaregistry", 5 | "fields":[ 6 | {"name":"vendorId","type":"string"}, 7 | {"name":"passenger_count","type":"int"}, 8 | {"name":"trip_distance","type":"double"}, 9 | {"name":"pu_location_id", "type": [ "null", "long" ], "default": null} 10 | ] 11 | } -------------------------------------------------------------------------------- /06-streaming/java/kafka_examples/src/main/avro/rides_non_compatible.avsc: -------------------------------------------------------------------------------- 1 | { 2 | "type": "record", 3 | "name":"RideRecordNoneCompatible", 4 | "namespace": "schemaregistry", 5 | "fields":[ 6 | {"name":"vendorId","type":"int"}, 7 | {"name":"passenger_count","type":"int"}, 8 | {"name":"trip_distance","type":"double"} 9 | ] 10 | } -------------------------------------------------------------------------------- /06-streaming/java/kafka_examples/src/main/java/org/example/JsonConsumer.java: -------------------------------------------------------------------------------- 1 | package org.example; 2 | 3 | import org.apache.kafka.clients.consumer.ConsumerConfig; 4 | import org.apache.kafka.clients.consumer.ConsumerRecord; 5 | import org.apache.kafka.clients.consumer.KafkaConsumer; 6 | import org.apache.kafka.clients.producer.ProducerConfig; 7 | import org.example.data.Ride; 8 | 9 | import java.time.Duration; 10 | import java.time.temporal.ChronoUnit; 11 | import java.time.temporal.TemporalUnit; 12 | import java.util.List; 13 | import java.util.Properties; 14 | import io.confluent.kafka.serializers.KafkaJsonDeserializerConfig; 15 | public class JsonConsumer { 16 | 17 | private Properties props = new Properties(); 18 | private KafkaConsumer consumer; 19 | public JsonConsumer() { 20 | props.put(ConsumerConfig.BOOTSTRAP_SERVERS_CONFIG, "pkc-75m1o.europe-west3.gcp.confluent.cloud:9092"); 21 | props.put("security.protocol", "SASL_SSL"); 22 | props.put("sasl.jaas.config", "org.apache.kafka.common.security.plain.PlainLoginModule required username='"+Secrets.KAFKA_CLUSTER_KEY+"' password='"+Secrets.KAFKA_CLUSTER_SECRET+"';"); 23 | props.put("sasl.mechanism", "PLAIN"); 24 | props.put("client.dns.lookup", "use_all_dns_ips"); 25 | props.put("session.timeout.ms", "45000"); 26 | props.put(ConsumerConfig.KEY_DESERIALIZER_CLASS_CONFIG, "org.apache.kafka.common.serialization.StringDeserializer"); 27 | props.put(ConsumerConfig.VALUE_DESERIALIZER_CLASS_CONFIG, 
"io.confluent.kafka.serializers.KafkaJsonDeserializer"); 28 | props.put(ConsumerConfig.GROUP_ID_CONFIG, "kafka_tutorial_example.jsonconsumer.v2"); 29 | props.put(ConsumerConfig.AUTO_OFFSET_RESET_CONFIG, "earliest"); 30 | props.put(KafkaJsonDeserializerConfig.JSON_VALUE_TYPE, Ride.class); 31 | consumer = new KafkaConsumer(props); 32 | consumer.subscribe(List.of("rides")); 33 | 34 | } 35 | 36 | public void consumeFromKafka() { 37 | System.out.println("Consuming form kafka started"); 38 | var results = consumer.poll(Duration.of(1, ChronoUnit.SECONDS)); 39 | var i = 0; 40 | do { 41 | 42 | for(ConsumerRecord result: results) { 43 | System.out.println(result.value().DOLocationID); 44 | } 45 | results = consumer.poll(Duration.of(1, ChronoUnit.SECONDS)); 46 | System.out.println("RESULTS:::" + results.count()); 47 | i++; 48 | } 49 | while(!results.isEmpty() || i < 10); 50 | } 51 | 52 | public static void main(String[] args) { 53 | JsonConsumer jsonConsumer = new JsonConsumer(); 54 | jsonConsumer.consumeFromKafka(); 55 | } 56 | } 57 | -------------------------------------------------------------------------------- /06-streaming/java/kafka_examples/src/main/java/org/example/JsonKStream.java: -------------------------------------------------------------------------------- 1 | package org.example; 2 | 3 | import org.apache.kafka.clients.consumer.ConsumerConfig; 4 | import org.apache.kafka.common.serialization.Serdes; 5 | import org.apache.kafka.streams.KafkaStreams; 6 | import org.apache.kafka.streams.StreamsBuilder; 7 | import org.apache.kafka.streams.StreamsConfig; 8 | import org.apache.kafka.streams.Topology; 9 | import org.apache.kafka.streams.kstream.Consumed; 10 | import org.apache.kafka.streams.kstream.Produced; 11 | import org.example.customserdes.CustomSerdes; 12 | import org.example.data.Ride; 13 | 14 | import java.util.Properties; 15 | 16 | public class JsonKStream { 17 | private Properties props = new Properties(); 18 | 19 | public JsonKStream() { 20 | props.put(StreamsConfig.BOOTSTRAP_SERVERS_CONFIG, "pkc-75m1o.europe-west3.gcp.confluent.cloud:9092"); 21 | props.put("security.protocol", "SASL_SSL"); 22 | props.put("sasl.jaas.config", "org.apache.kafka.common.security.plain.PlainLoginModule required username='"+Secrets.KAFKA_CLUSTER_KEY+"' password='"+Secrets.KAFKA_CLUSTER_SECRET+"';"); 23 | props.put("sasl.mechanism", "PLAIN"); 24 | props.put("client.dns.lookup", "use_all_dns_ips"); 25 | props.put("session.timeout.ms", "45000"); 26 | props.put(StreamsConfig.APPLICATION_ID_CONFIG, "kafka_tutorial.kstream.count.plocation.v1"); 27 | props.put(ConsumerConfig.AUTO_OFFSET_RESET_CONFIG, "latest"); 28 | props.put(StreamsConfig.CACHE_MAX_BYTES_BUFFERING_CONFIG, 0); 29 | 30 | } 31 | 32 | public Topology createTopology() { 33 | StreamsBuilder streamsBuilder = new StreamsBuilder(); 34 | var ridesStream = streamsBuilder.stream("rides", Consumed.with(Serdes.String(), CustomSerdes.getSerde(Ride.class))); 35 | var puLocationCount = ridesStream.groupByKey().count().toStream(); 36 | puLocationCount.to("rides-pulocation-count", Produced.with(Serdes.String(), Serdes.Long())); 37 | return streamsBuilder.build(); 38 | } 39 | 40 | public void countPLocation() throws InterruptedException { 41 | var topology = createTopology(); 42 | var kStreams = new KafkaStreams(topology, props); 43 | kStreams.start(); 44 | while (kStreams.state() != KafkaStreams.State.RUNNING) { 45 | System.out.println(kStreams.state()); 46 | Thread.sleep(1000); 47 | } 48 | System.out.println(kStreams.state()); 49 | 
Runtime.getRuntime().addShutdownHook(new Thread(kStreams::close)); 50 | } 51 | 52 | public static void main(String[] args) throws InterruptedException { 53 | var object = new JsonKStream(); 54 | object.countPLocation(); 55 | } 56 | } 57 | -------------------------------------------------------------------------------- /06-streaming/java/kafka_examples/src/main/java/org/example/JsonKStreamWindow.java: -------------------------------------------------------------------------------- 1 | package org.example; 2 | 3 | import org.apache.kafka.clients.consumer.ConsumerConfig; 4 | import org.apache.kafka.common.serialization.Serdes; 5 | import org.apache.kafka.streams.KafkaStreams; 6 | import org.apache.kafka.streams.StreamsBuilder; 7 | import org.apache.kafka.streams.StreamsConfig; 8 | import org.apache.kafka.streams.Topology; 9 | import org.apache.kafka.streams.kstream.Consumed; 10 | import org.apache.kafka.streams.kstream.Produced; 11 | import org.apache.kafka.streams.kstream.TimeWindows; 12 | import org.apache.kafka.streams.kstream.WindowedSerdes; 13 | import org.example.customserdes.CustomSerdes; 14 | import org.example.data.Ride; 15 | 16 | import java.time.Duration; 17 | import java.time.temporal.ChronoUnit; 18 | import java.util.Properties; 19 | 20 | public class JsonKStreamWindow { 21 | private Properties props = new Properties(); 22 | 23 | public JsonKStreamWindow() { 24 | props.put(StreamsConfig.BOOTSTRAP_SERVERS_CONFIG, "pkc-75m1o.europe-west3.gcp.confluent.cloud:9092"); 25 | props.put("security.protocol", "SASL_SSL"); 26 | props.put("sasl.jaas.config", "org.apache.kafka.common.security.plain.PlainLoginModule required username='"+Secrets.KAFKA_CLUSTER_KEY+"' password='"+Secrets.KAFKA_CLUSTER_SECRET+"';"); 27 | props.put("sasl.mechanism", "PLAIN"); 28 | props.put("client.dns.lookup", "use_all_dns_ips"); 29 | props.put("session.timeout.ms", "45000"); 30 | props.put(StreamsConfig.APPLICATION_ID_CONFIG, "kafka_tutorial.kstream.count.plocation.v1"); 31 | props.put(ConsumerConfig.AUTO_OFFSET_RESET_CONFIG, "latest"); 32 | props.put(StreamsConfig.CACHE_MAX_BYTES_BUFFERING_CONFIG, 0); 33 | 34 | } 35 | 36 | public Topology createTopology() { 37 | StreamsBuilder streamsBuilder = new StreamsBuilder(); 38 | var ridesStream = streamsBuilder.stream("rides", Consumed.with(Serdes.String(), CustomSerdes.getSerde(Ride.class))); 39 | var puLocationCount = ridesStream.groupByKey() 40 | .windowedBy(TimeWindows.ofSizeAndGrace(Duration.ofSeconds(10), Duration.ofSeconds(5))) 41 | .count().toStream(); 42 | var windowSerde = WindowedSerdes.timeWindowedSerdeFrom(String.class, 10*1000); 43 | 44 | puLocationCount.to("rides-pulocation-window-count", Produced.with(windowSerde, Serdes.Long())); 45 | return streamsBuilder.build(); 46 | } 47 | 48 | public void countPLocationWindowed() { 49 | var topology = createTopology(); 50 | var kStreams = new KafkaStreams(topology, props); 51 | kStreams.start(); 52 | 53 | Runtime.getRuntime().addShutdownHook(new Thread(kStreams::close)); 54 | } 55 | 56 | public static void main(String[] args) { 57 | var object = new JsonKStreamWindow(); 58 | object.countPLocationWindowed(); 59 | } 60 | } 61 | -------------------------------------------------------------------------------- /06-streaming/java/kafka_examples/src/main/java/org/example/JsonProducerPickupLocation.java: -------------------------------------------------------------------------------- 1 | package org.example; 2 | 3 | import com.opencsv.exceptions.CsvException; 4 | import org.apache.kafka.clients.producer.KafkaProducer; 5 | 
import org.apache.kafka.clients.producer.ProducerConfig; 6 | import org.apache.kafka.clients.producer.ProducerRecord; 7 | import org.example.data.PickupLocation; 8 | 9 | import java.io.IOException; 10 | import java.time.LocalDateTime; 11 | import java.util.Properties; 12 | import java.util.concurrent.ExecutionException; 13 | 14 | public class JsonProducerPickupLocation { 15 | private Properties props = new Properties(); 16 | 17 | public JsonProducerPickupLocation() { 18 | props.put(ProducerConfig.BOOTSTRAP_SERVERS_CONFIG, "pkc-75m1o.europe-west3.gcp.confluent.cloud:9092"); 19 | props.put("security.protocol", "SASL_SSL"); 20 | props.put("sasl.jaas.config", "org.apache.kafka.common.security.plain.PlainLoginModule required username='"+Secrets.KAFKA_CLUSTER_KEY+"' password='"+Secrets.KAFKA_CLUSTER_SECRET+"';"); 21 | props.put("sasl.mechanism", "PLAIN"); 22 | props.put("client.dns.lookup", "use_all_dns_ips"); 23 | props.put("session.timeout.ms", "45000"); 24 | props.put(ProducerConfig.ACKS_CONFIG, "all"); 25 | props.put(ProducerConfig.KEY_SERIALIZER_CLASS_CONFIG, "org.apache.kafka.common.serialization.StringSerializer"); 26 | props.put(ProducerConfig.VALUE_SERIALIZER_CLASS_CONFIG, "io.confluent.kafka.serializers.KafkaJsonSerializer"); 27 | } 28 | 29 | public void publish(PickupLocation pickupLocation) throws ExecutionException, InterruptedException { 30 | KafkaProducer kafkaProducer = new KafkaProducer(props); 31 | var record = kafkaProducer.send(new ProducerRecord<>("rides_location", String.valueOf(pickupLocation.PULocationID), pickupLocation), (metadata, exception) -> { 32 | if (exception != null) { 33 | System.out.println(exception.getMessage()); 34 | } 35 | }); 36 | System.out.println(record.get().offset()); 37 | } 38 | 39 | 40 | public static void main(String[] args) throws IOException, CsvException, ExecutionException, InterruptedException { 41 | var producer = new JsonProducerPickupLocation(); 42 | producer.publish(new PickupLocation(186, LocalDateTime.now())); 43 | } 44 | } 45 | -------------------------------------------------------------------------------- /06-streaming/java/kafka_examples/src/main/java/org/example/Secrets.java: -------------------------------------------------------------------------------- 1 | package org.example; 2 | 3 | public class Secrets { 4 | public static final String KAFKA_CLUSTER_KEY = "REPLACE_WITH_YOUR_KAFKA_CLUSTER_KEY"; 5 | public static final String KAFKA_CLUSTER_SECRET = "REPLACE_WITH_YOUR_KAFKA_CLUSTER_SECRET"; 6 | 7 | public static final String SCHEMA_REGISTRY_KEY = "REPLACE_WITH_SCHEMA_REGISTRY_KEY"; 8 | public static final String SCHEMA_REGISTRY_SECRET = "REPLACE_WITH_SCHEMA_REGISTRY_SECRET"; 9 | 10 | } 11 | -------------------------------------------------------------------------------- /06-streaming/java/kafka_examples/src/main/java/org/example/Topics.java: -------------------------------------------------------------------------------- 1 | package org.example; 2 | 3 | public class Topics { 4 | public static final String INPUT_RIDE_TOPIC = "rides"; 5 | public static final String INPUT_RIDE_LOCATION_TOPIC = "rides_location"; 6 | public static final String OUTPUT_TOPIC = "vendor_info"; 7 | } 8 | -------------------------------------------------------------------------------- /06-streaming/java/kafka_examples/src/main/java/org/example/customserdes/CustomSerdes.java: -------------------------------------------------------------------------------- 1 | package org.example.customserdes; 2 | 3 | import 
io.confluent.kafka.serializers.AbstractKafkaAvroSerDeConfig; 4 | import io.confluent.kafka.serializers.KafkaJsonDeserializer; 5 | import io.confluent.kafka.serializers.KafkaJsonSerializer; 6 | import io.confluent.kafka.streams.serdes.avro.SpecificAvroSerde; 7 | import org.apache.avro.specific.SpecificRecordBase; 8 | import org.apache.kafka.common.serialization.Deserializer; 9 | import org.apache.kafka.common.serialization.Serde; 10 | import org.apache.kafka.common.serialization.Serdes; 11 | import org.apache.kafka.common.serialization.Serializer; 12 | import org.example.data.PickupLocation; 13 | import org.example.data.Ride; 14 | import org.example.data.VendorInfo; 15 | 16 | import java.util.HashMap; 17 | import java.util.Map; 18 | 19 | public class CustomSerdes { 20 | 21 | public static Serde getSerde(Class classOf) { 22 | Map serdeProps = new HashMap<>(); 23 | serdeProps.put("json.value.type", classOf); 24 | final Serializer mySerializer = new KafkaJsonSerializer<>(); 25 | mySerializer.configure(serdeProps, false); 26 | 27 | final Deserializer myDeserializer = new KafkaJsonDeserializer<>(); 28 | myDeserializer.configure(serdeProps, false); 29 | return Serdes.serdeFrom(mySerializer, myDeserializer); 30 | } 31 | 32 | public static SpecificAvroSerde getAvroSerde(boolean isKey, String schemaRegistryUrl) { 33 | var serde = new SpecificAvroSerde(); 34 | 35 | Map serdeProps = new HashMap<>(); 36 | serdeProps.put(AbstractKafkaAvroSerDeConfig.SCHEMA_REGISTRY_URL_CONFIG, schemaRegistryUrl); 37 | serde.configure(serdeProps, isKey); 38 | return serde; 39 | } 40 | 41 | 42 | } 43 | -------------------------------------------------------------------------------- /06-streaming/java/kafka_examples/src/main/java/org/example/data/PickupLocation.java: -------------------------------------------------------------------------------- 1 | package org.example.data; 2 | 3 | import java.time.LocalDateTime; 4 | 5 | public class PickupLocation { 6 | public PickupLocation(long PULocationID, LocalDateTime tpep_pickup_datetime) { 7 | this.PULocationID = PULocationID; 8 | this.tpep_pickup_datetime = tpep_pickup_datetime; 9 | } 10 | 11 | public PickupLocation() { 12 | } 13 | 14 | public long PULocationID; 15 | public LocalDateTime tpep_pickup_datetime; 16 | } 17 | -------------------------------------------------------------------------------- /06-streaming/java/kafka_examples/src/main/java/org/example/data/Ride.java: -------------------------------------------------------------------------------- 1 | package org.example.data; 2 | 3 | import java.nio.DoubleBuffer; 4 | import java.time.LocalDate; 5 | import java.time.LocalDateTime; 6 | import java.time.format.DateTimeFormatter; 7 | 8 | public class Ride { 9 | public Ride(String[] arr) { 10 | VendorID = arr[0]; 11 | tpep_pickup_datetime = LocalDateTime.parse(arr[1], DateTimeFormatter.ofPattern("yyyy-MM-dd HH:mm:ss")); 12 | tpep_dropoff_datetime = LocalDateTime.parse(arr[2], DateTimeFormatter.ofPattern("yyyy-MM-dd HH:mm:ss")); 13 | passenger_count = Integer.parseInt(arr[3]); 14 | trip_distance = Double.parseDouble(arr[4]); 15 | RatecodeID = Long.parseLong(arr[5]); 16 | store_and_fwd_flag = arr[6]; 17 | PULocationID = Long.parseLong(arr[7]); 18 | DOLocationID = Long.parseLong(arr[8]); 19 | payment_type = arr[9]; 20 | fare_amount = Double.parseDouble(arr[10]); 21 | extra = Double.parseDouble(arr[11]); 22 | mta_tax = Double.parseDouble(arr[12]); 23 | tip_amount = Double.parseDouble(arr[13]); 24 | tolls_amount = Double.parseDouble(arr[14]); 25 | improvement_surcharge = 
Double.parseDouble(arr[15]); 26 | total_amount = Double.parseDouble(arr[16]); 27 | congestion_surcharge = Double.parseDouble(arr[17]); 28 | } 29 | public Ride(){} 30 | public String VendorID; 31 | public LocalDateTime tpep_pickup_datetime; 32 | public LocalDateTime tpep_dropoff_datetime; 33 | public int passenger_count; 34 | public double trip_distance; 35 | public long RatecodeID; 36 | public String store_and_fwd_flag; 37 | public long PULocationID; 38 | public long DOLocationID; 39 | public String payment_type; 40 | public double fare_amount; 41 | public double extra; 42 | public double mta_tax; 43 | public double tip_amount; 44 | public double tolls_amount; 45 | public double improvement_surcharge; 46 | public double total_amount; 47 | public double congestion_surcharge; 48 | 49 | } 50 | -------------------------------------------------------------------------------- /06-streaming/java/kafka_examples/src/main/java/org/example/data/VendorInfo.java: -------------------------------------------------------------------------------- 1 | package org.example.data; 2 | 3 | import java.time.LocalDateTime; 4 | 5 | public class VendorInfo { 6 | 7 | public VendorInfo(String vendorID, long PULocationID, LocalDateTime pickupTime, LocalDateTime lastDropoffTime) { 8 | VendorID = vendorID; 9 | this.PULocationID = PULocationID; 10 | this.pickupTime = pickupTime; 11 | this.lastDropoffTime = lastDropoffTime; 12 | } 13 | 14 | public VendorInfo() { 15 | } 16 | 17 | public String VendorID; 18 | public long PULocationID; 19 | public LocalDateTime pickupTime; 20 | public LocalDateTime lastDropoffTime; 21 | } 22 | -------------------------------------------------------------------------------- /06-streaming/java/kafka_examples/src/test/java/org/example/helper/DataGeneratorHelper.java: -------------------------------------------------------------------------------- 1 | package org.example.helper; 2 | 3 | import org.example.data.PickupLocation; 4 | import org.example.data.Ride; 5 | import org.example.data.VendorInfo; 6 | 7 | import java.time.LocalDateTime; 8 | import java.time.format.DateTimeFormatter; 9 | import java.util.List; 10 | 11 | public class DataGeneratorHelper { 12 | public static Ride generateRide() { 13 | var arrivalTime = LocalDateTime.now().format(DateTimeFormatter.ofPattern("yyyy-MM-dd HH:mm:ss")); 14 | var departureTime = LocalDateTime.now().minusMinutes(30).format(DateTimeFormatter.ofPattern("yyyy-MM-dd HH:mm:ss")); 15 | return new Ride(new String[]{"1", departureTime, arrivalTime,"1","1.50","1","N","238","75","2","8","0.5","0.5","0","0","0.3","9.3","0"}); 16 | } 17 | 18 | public static PickupLocation generatePickUpLocation(long pickupLocationId) { 19 | return new PickupLocation(pickupLocationId, LocalDateTime.now()); 20 | } 21 | } 22 | -------------------------------------------------------------------------------- /06-streaming/ksqldb/commands.md: -------------------------------------------------------------------------------- 1 | ## KSQL DB Examples 2 | ### Create streams 3 | ```sql 4 | CREATE STREAM ride_streams ( 5 | VendorId varchar, 6 | trip_distance double, 7 | payment_type varchar 8 | ) WITH (KAFKA_TOPIC='rides', 9 | VALUE_FORMAT='JSON'); 10 | ``` 11 | 12 | ### Query stream 13 | ```sql 14 | select * from RIDE_STREAMS 15 | EMIT CHANGES; 16 | ``` 17 | 18 | ### Query stream count 19 | ```sql 20 | SELECT VENDORID, count(*) FROM RIDE_STREAMS 21 | GROUP BY VENDORID 22 | EMIT CHANGES; 23 | ``` 24 | 25 | ### Query stream with filters 26 | ```sql 27 | SELECT payment_type, count(*) FROM 
RIDE_STREAMS 28 | WHERE payment_type IN ('1', '2') 29 | GROUP BY payment_type 30 | EMIT CHANGES; 31 | ``` 32 | 33 | ### Query stream with window functions 34 | ```sql 35 | CREATE TABLE payment_type_sessions AS 36 | SELECT payment_type, 37 | count(*) 38 | FROM RIDE_STREAMS 39 | WINDOW SESSION (60 SECONDS) 40 | GROUP BY payment_type 41 | EMIT CHANGES; 42 | ``` 43 | 44 | ## KSQL documentation for details 45 | [KSQL DB Documentation](https://docs.ksqldb.io/en/latest/developer-guide/ksqldb-reference/quick-reference/) 46 | 47 | [KSQL DB Java client](https://docs.ksqldb.io/en/latest/developer-guide/ksqldb-clients/java-client/) -------------------------------------------------------------------------------- /06-streaming/pyflink/.gitignore: -------------------------------------------------------------------------------- 1 | data/ 2 | postgres-data 3 | # Byte-compiled / optimized / DLL files 4 | __pycache__/ 5 | *.py[cod] 6 | *$py.class 7 | 8 | # C extensions 9 | *.so 10 | 11 | # Distribution / packaging 12 | .Python 13 | build/ 14 | develop-eggs/ 15 | dist/ 16 | downloads/ 17 | eggs/ 18 | .eggs/ 19 | lib/ 20 | lib64/ 21 | parts/ 22 | sdist/ 23 | var/ 24 | wheels/ 25 | pip-wheel-metadata/ 26 | share/python-wheels/ 27 | *.egg-info/ 28 | .installed.cfg 29 | *.egg 30 | MANIFEST 31 | 32 | # PyInstaller 33 | # Usually these files are written by a python script from a template 34 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 35 | *.manifest 36 | *.spec 37 | 38 | # Installer logs 39 | pip-log.txt 40 | pip-delete-this-directory.txt 41 | 42 | # Unit test / coverage reports 43 | htmlcov/ 44 | .tox/ 45 | .nox/ 46 | .coverage 47 | .coverage.* 48 | .cache 49 | nosetests.xml 50 | coverage.xml 51 | *.cover 52 | *.py,cover 53 | .hypothesis/ 54 | .pytest_cache/ 55 | 56 | # Translations 57 | *.mo 58 | *.pot 59 | 60 | # Django stuff: 61 | *.log 62 | local_settings.py 63 | db.sqlite3 64 | db.sqlite3-journal 65 | 66 | # Flask stuff: 67 | instance/ 68 | .webassets-cache 69 | 70 | # Scrapy stuff: 71 | .scrapy 72 | 73 | # Sphinx documentation 74 | docs/_build/ 75 | 76 | # PyBuilder 77 | target/ 78 | 79 | # Jupyter Notebook 80 | .ipynb_checkpoints 81 | 82 | # IPython 83 | profile_default/ 84 | ipython_config.py 85 | 86 | # pyenv 87 | .python-version 88 | 89 | # pipenv 90 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 91 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 92 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 93 | # install all needed dependencies. 94 | #Pipfile.lock 95 | 96 | # PEP 582; used by e.g. 
github.com/David-OConnor/pyflow 97 | __pypackages__/ 98 | 99 | # Celery stuff 100 | celerybeat-schedule 101 | celerybeat.pid 102 | 103 | # SageMath parsed files 104 | *.sage.py 105 | 106 | # Environments 107 | .env 108 | .venv 109 | env/ 110 | venv/ 111 | ENV/ 112 | env.bak/ 113 | venv.bak/ 114 | 115 | # Spyder project settings 116 | .spyderproject 117 | .spyproject 118 | 119 | # Rope project settings 120 | .ropeproject 121 | 122 | # mkdocs documentation 123 | /site 124 | 125 | # mypy 126 | .mypy_cache/ 127 | .dmypy.json 128 | dmypy.json 129 | 130 | # Pyre type checker 131 | .pyre/ 132 | 133 | dump.sql 134 | 135 | # Personal workspace files 136 | .idea/* 137 | .vscode/* -------------------------------------------------------------------------------- /06-streaming/pyflink/Dockerfile.flink: -------------------------------------------------------------------------------- 1 | FROM --platform=linux/amd64 flink:1.16.0-scala_2.12-java8 2 | 3 | # install python3: it has updated Python to 3.9 in Debian 11 and so install Python 3.7 from source 4 | # it currently only supports Python 3.6, 3.7 and 3.8 in PyFlink officially. 5 | 6 | # ref: https://nightlies.apache.org/flink/flink-docs-release-1.16/docs/deployment/resource-providers/standalone/docker/#using-flink-python-on-docker 7 | 8 | RUN apt-get update -y && \ 9 | apt-get install -y build-essential libssl-dev zlib1g-dev libbz2-dev libffi-dev liblzma-dev && \ 10 | wget https://www.python.org/ftp/python/3.7.9/Python-3.7.9.tgz && \ 11 | tar -xvf Python-3.7.9.tgz && \ 12 | cd Python-3.7.9 && \ 13 | ./configure --without-tests --enable-shared && \ 14 | make -j6 && \ 15 | make install && \ 16 | ldconfig /usr/local/lib && \ 17 | cd .. && rm -f Python-3.7.9.tgz && rm -rf Python-3.7.9 && \ 18 | ln -s /usr/local/bin/python3 /usr/local/bin/python && \ 19 | apt-get clean && \ 20 | rm -rf /var/lib/apt/lists/* 21 | 22 | # install PyFlink 23 | COPY requirements.txt . 
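# requirements.txt pins apache-flink==1.16.0 so the Python API matches the Flink 1.16.0 base image above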
24 | RUN python -m pip install --upgrade pip; \ 25 | pip3 install --upgrade google-api-python-client; \ 26 | pip3 install -r requirements.txt --no-cache-dir; 27 | 28 | # Download connector libraries 29 | RUN wget -P /opt/flink/lib/ https://repo.maven.apache.org/maven2/org/apache/flink/flink-json/1.16.0/flink-json-1.16.0.jar; \ 30 | wget -P /opt/flink/lib/ https://repo.maven.apache.org/maven2/org/apache/flink/flink-sql-connector-kafka/1.16.0/flink-sql-connector-kafka-1.16.0.jar; \ 31 | wget -P /opt/flink/lib/ https://repo.maven.apache.org/maven2/org/apache/flink/flink-connector-jdbc/1.16.0/flink-connector-jdbc-1.16.0.jar; \ 32 | wget -P /opt/flink/lib/ https://repo1.maven.org/maven2/org/postgresql/postgresql/42.2.24/postgresql-42.2.24.jar; 33 | 34 | RUN echo "taskmanager.memory.jvm-metaspace.size: 512m" >> /opt/flink/conf/flink-conf.yaml; 35 | 36 | WORKDIR /opt/flink 37 | -------------------------------------------------------------------------------- /06-streaming/pyflink/LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2025 Sreela Das, Julie Scherer, Zach Wilson 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /06-streaming/pyflink/Makefile: -------------------------------------------------------------------------------- 1 | PLATFORM ?= linux/amd64 2 | 3 | # COLORS 4 | GREEN := $(shell tput -Txterm setaf 2) 5 | YELLOW := $(shell tput -Txterm setaf 3) 6 | WHITE := $(shell tput -Txterm setaf 7) 7 | RESET := $(shell tput -Txterm sgr0) 8 | 9 | 10 | TARGET_MAX_CHAR_NUM=20 11 | 12 | ## Show help with `make help` 13 | help: 14 | @echo '' 15 | @echo 'Usage:' 16 | @echo ' ${YELLOW}make${RESET} ${GREEN}${RESET}' 17 | @echo '' 18 | @echo 'Targets:' 19 | @awk '/^[a-zA-Z\-\_0-9]+:/ { \ 20 | helpMessage = match(lastLine, /^## (.*)/); \ 21 | if (helpMessage) { \ 22 | helpCommand = substr($$1, 0, index($$1, ":")-1); \ 23 | helpMessage = substr(lastLine, RSTART + 3, RLENGTH); \ 24 | printf " ${YELLOW}%-$(TARGET_MAX_CHAR_NUM)s${RESET} ${GREEN}%s${RESET}\n", helpCommand, helpMessage; \ 25 | } \ 26 | } \ 27 | { lastLine = $$0 }' $(MAKEFILE_LIST) 28 | 29 | .PHONY: build 30 | ## Builds the Flink base image with pyFlink and connectors installed 31 | build: 32 | docker build . 
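# Typical workflow with these targets: `make up` to start the cluster, `make job` to submit the PyFlink job, `make down` to stop everything.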
33 | 34 | .PHONY: up 35 | ## Builds the base Docker image and starts Flink cluster 36 | up: 37 | docker compose up --build --remove-orphans -d 38 | 39 | .PHONY: down 40 | ## Shuts down the Flink cluster 41 | down: 42 | docker compose down --remove-orphans 43 | 44 | .PHONY: job 45 | ## Submit the Flink job 46 | job: 47 | docker compose exec jobmanager ./bin/flink run -py /opt/src/job/start_job.py --pyFiles /opt/src -d 48 | 49 | aggregation_job: 50 | docker compose exec jobmanager ./bin/flink run -py /opt/src/job/aggregation_job.py --pyFiles /opt/src -d 51 | 52 | .PHONY: stop 53 | ## Stops all services in Docker compose 54 | stop: 55 | docker compose stop 56 | 57 | .PHONY: start 58 | ## Starts all services in Docker compose 59 | start: 60 | docker compose start 61 | -------------------------------------------------------------------------------- /06-streaming/pyflink/requirements.txt: -------------------------------------------------------------------------------- 1 | apache-flink==1.16.0 2 | psycopg2-binary==2.9.1 3 | requests 4 | kafka-python -------------------------------------------------------------------------------- /06-streaming/pyflink/src/job/start_job.py: -------------------------------------------------------------------------------- 1 | from pyflink.datastream import StreamExecutionEnvironment 2 | from pyflink.table import EnvironmentSettings, DataTypes, TableEnvironment, StreamTableEnvironment 3 | 4 | 5 | def create_processed_events_sink_postgres(t_env): 6 | table_name = 'processed_events' 7 | sink_ddl = f""" 8 | CREATE TABLE {table_name} ( 9 | test_data INTEGER, 10 | event_timestamp TIMESTAMP 11 | ) WITH ( 12 | 'connector' = 'jdbc', 13 | 'url' = 'jdbc:postgresql://postgres:5432/postgres', 14 | 'table-name' = '{table_name}', 15 | 'username' = 'postgres', 16 | 'password' = 'postgres', 17 | 'driver' = 'org.postgresql.Driver' 18 | ); 19 | """ 20 | t_env.execute_sql(sink_ddl) 21 | return table_name 22 | 23 | 24 | def create_events_source_kafka(t_env): 25 | table_name = "events" 26 | pattern = "yyyy-MM-dd HH:mm:ss.SSS" 27 | source_ddl = f""" 28 | CREATE TABLE {table_name} ( 29 | test_data INTEGER, 30 | event_timestamp BIGINT, 31 | event_watermark AS TO_TIMESTAMP_LTZ(event_timestamp, 3), 32 | WATERMARK for event_watermark as event_watermark - INTERVAL '5' SECOND 33 | ) WITH ( 34 | 'connector' = 'kafka', 35 | 'properties.bootstrap.servers' = 'redpanda-1:29092', 36 | 'topic' = 'test-topic', 37 | 'scan.startup.mode' = 'latest-offset', 38 | 'properties.auto.offset.reset' = 'latest', 39 | 'format' = 'json' 40 | ); 41 | """ 42 | t_env.execute_sql(source_ddl) 43 | return table_name 44 | 45 | def log_processing(): 46 | # Set up the execution environment 47 | env = StreamExecutionEnvironment.get_execution_environment() 48 | env.enable_checkpointing(10 * 1000) 49 | # env.set_parallelism(1) 50 | 51 | # Set up the table environment 52 | settings = EnvironmentSettings.new_instance().in_streaming_mode().build() 53 | t_env = StreamTableEnvironment.create(env, environment_settings=settings) 54 | try: 55 | # Create Kafka table 56 | source_table = create_events_source_kafka(t_env) 57 | postgres_sink = create_processed_events_sink_postgres(t_env) 58 | # write records to postgres too! 
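# the INSERT below reads from the Kafka-backed `events` table and converts the epoch-millis BIGINT into a TIMESTAMP for the JDBC sink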
59 | t_env.execute_sql( 60 | f""" 61 | INSERT INTO {postgres_sink} 62 | SELECT 63 | test_data, 64 | TO_TIMESTAMP_LTZ(event_timestamp, 3) as event_timestamp 65 | FROM {source_table} 66 | """ 67 | ).wait() 68 | 69 | except Exception as e: 70 | print("Writing records from Kafka to JDBC failed:", str(e)) 71 | 72 | 73 | if __name__ == '__main__': 74 | log_processing() 75 | -------------------------------------------------------------------------------- /06-streaming/pyflink/src/producers/load_taxi_data.py: -------------------------------------------------------------------------------- 1 | import csv 2 | import json 3 | from kafka import KafkaProducer 4 | 5 | def main(): 6 | # Create a Kafka producer 7 | producer = KafkaProducer( 8 | bootstrap_servers='localhost:9092', 9 | value_serializer=lambda v: json.dumps(v).encode('utf-8') 10 | ) 11 | 12 | csv_file = 'data/green_tripdata_2019-10.csv' # change to your CSV file path if needed 13 | 14 | with open(csv_file, 'r', newline='', encoding='utf-8') as file: 15 | reader = csv.DictReader(file) 16 | 17 | for row in reader: 18 | # Each row will be a dictionary keyed by the CSV headers 19 | # Send data to Kafka topic "green-data" 20 | producer.send('green-data', value=row) 21 | 22 | # Make sure any remaining messages are delivered 23 | producer.flush() 24 | producer.close() 25 | 26 | 27 | if __name__ == "__main__": 28 | main() -------------------------------------------------------------------------------- /06-streaming/pyflink/src/producers/producer.py: -------------------------------------------------------------------------------- 1 | import json 2 | import time 3 | from kafka import KafkaProducer 4 | 5 | def json_serializer(data): 6 | return json.dumps(data).encode('utf-8') 7 | 8 | server = 'localhost:9092' 9 | 10 | producer = KafkaProducer( 11 | bootstrap_servers=[server], 12 | value_serializer=json_serializer 13 | ) 14 | t0 = time.time() 15 | 16 | topic_name = 'test-topic' 17 | 18 | for i in range(10, 1000): 19 | message = {'test_data': i, 'event_timestamp': time.time() * 1000} 20 | producer.send(topic_name, value=message) 21 | print(f"Sent: {message}") 22 | time.sleep(0.05) 23 | 24 | producer.flush() 25 | 26 | t1 = time.time() 27 | print(f'took {(t1 - t0):.2f} seconds') -------------------------------------------------------------------------------- /06-streaming/python/README.md: -------------------------------------------------------------------------------- 1 | ### Stream-Processing with Python 2 | 3 | In this document, you will find information about stream processing 4 | using different Python libraries (`kafka-python`,`confluent-kafka`,`pyspark`, `faust`). 5 | 6 | This Python module can be separated into the following submodules. 7 | 8 | #### 1. Docker 9 | The Docker module includes Dockerfiles and docker-compose definitions 10 | to run Kafka and Spark in Docker containers. Setting up the required services is 11 | the prerequisite step for running the following modules. 12 | 13 | #### 2.
Kafka Producer - Consumer Examples 14 | - [Json Producer-Consumer Example](json_example) using the `kafka-python` library 15 | - [Avro Producer-Consumer Example](avro_example) using the `confluent-kafka` library 16 | 17 | Both of these examples require up-and-running Kafka services, so please first complete the 18 | steps in the [docker-README](docker/README.md) 19 | 20 | To run the producer-consumer examples, run the following commands in the respective example folder: 21 | ```bash 22 | # Start producer script 23 | python3 producer.py 24 | # Start consumer script 25 | python3 consumer.py 26 | ``` 27 | 28 | 29 | 30 | 31 | 32 | -------------------------------------------------------------------------------- /06-streaming/python/avro_example/ride_record.py: -------------------------------------------------------------------------------- 1 | from typing import List, Dict 2 | 3 | 4 | class RideRecord: 5 | 6 | def __init__(self, arr: List[str]): 7 | self.vendor_id = int(arr[0]) 8 | self.passenger_count = int(arr[1]) 9 | self.trip_distance = float(arr[2]) 10 | self.payment_type = int(arr[3]) 11 | self.total_amount = float(arr[4]) 12 | 13 | @classmethod 14 | def from_dict(cls, d: Dict): 15 | return cls(arr=[ 16 | d['vendor_id'], 17 | d['passenger_count'], 18 | d['trip_distance'], 19 | d['payment_type'], 20 | d['total_amount'] 21 | ] 22 | ) 23 | 24 | def __repr__(self): 25 | return f'{self.__class__.__name__}: {self.__dict__}' 26 | 27 | 28 | def dict_to_ride_record(obj, ctx): 29 | if obj is None: 30 | return None 31 | 32 | return RideRecord.from_dict(obj) 33 | 34 | 35 | def ride_record_to_dict(ride_record: RideRecord, ctx): 36 | return ride_record.__dict__ 37 | -------------------------------------------------------------------------------- /06-streaming/python/avro_example/ride_record_key.py: -------------------------------------------------------------------------------- 1 | from typing import Dict 2 | 3 | 4 | class RideRecordKey: 5 | def __init__(self, vendor_id): 6 | self.vendor_id = vendor_id 7 | 8 | @classmethod 9 | def from_dict(cls, d: Dict): 10 | return cls(vendor_id=d['vendor_id']) 11 | 12 | def __repr__(self): 13 | return f'{self.__class__.__name__}: {self.__dict__}' 14 | 15 | 16 | def dict_to_ride_record_key(obj, ctx): 17 | if obj is None: 18 | return None 19 | 20 | return RideRecordKey.from_dict(obj) 21 | 22 | 23 | def ride_record_key_to_dict(ride_record_key: RideRecordKey, ctx): 24 | return ride_record_key.__dict__ 25 | -------------------------------------------------------------------------------- /06-streaming/python/avro_example/settings.py: -------------------------------------------------------------------------------- 1 | INPUT_DATA_PATH = '../resources/rides.csv' 2 | 3 | RIDE_KEY_SCHEMA_PATH = '../resources/schemas/taxi_ride_key.avsc' 4 | RIDE_VALUE_SCHEMA_PATH = '../resources/schemas/taxi_ride_value.avsc' 5 | 6 | SCHEMA_REGISTRY_URL = 'http://localhost:8081' 7 | BOOTSTRAP_SERVERS = 'localhost:9092' 8 | KAFKA_TOPIC = 'rides_avro' 9 | -------------------------------------------------------------------------------- /06-streaming/python/docker/README.md: -------------------------------------------------------------------------------- 1 | 2 | # Running Spark and Kafka Clusters on Docker 3 | 4 | ### 1.
Build Required Images for running Spark 5 | 6 | The details of how the Spark images are built in different layers can be read in 7 | the blog post written by André Perez on [Medium blog - Towards Data Science](https://towardsdatascience.com/apache-spark-cluster-on-docker-ft-a-juyterlab-interface-418383c95445) 8 | 9 | ```bash 10 | # Build Spark Images 11 | ./build.sh 12 | ``` 13 | 14 | ### 2. Create Docker Network & Volume 15 | 16 | ```bash 17 | # Create Network 18 | docker network create kafka-spark-network 19 | 20 | # Create Volume 21 | docker volume create --name=hadoop-distributed-file-system 22 | ``` 23 | 24 | ### 3. Run Services on Docker 25 | ```bash 26 | # Start Docker-Compose (within the kafka and spark folders) 27 | docker compose up -d 28 | ``` 29 | In-depth explanation of [Kafka Listeners](https://www.confluent.io/blog/kafka-listeners-explained/) 30 | 31 | 32 | 33 | ### 4. Stop Services on Docker 34 | ```bash 35 | # Stop Docker-Compose (within the kafka and spark folders) 36 | docker compose down 37 | ``` 38 | 39 | ### 5. Helpful Commands 40 | ```bash 41 | # Delete all Containers 42 | docker rm -f $(docker ps -a -q) 43 | 44 | # Delete all volumes 45 | docker volume rm $(docker volume ls -q) 46 | ``` 47 | 48 | -------------------------------------------------------------------------------- /06-streaming/python/docker/spark/build.sh: -------------------------------------------------------------------------------- 1 | # -- Software Stack Version 2 | 3 | SPARK_VERSION="3.3.1" 4 | HADOOP_VERSION="3" 5 | JUPYTERLAB_VERSION="3.6.1" 6 | 7 | # -- Building the Images 8 | 9 | docker build \ 10 | -f cluster-base.Dockerfile \ 11 | -t cluster-base . 12 | 13 | docker build \ 14 | --build-arg spark_version="${SPARK_VERSION}" \ 15 | --build-arg hadoop_version="${HADOOP_VERSION}" \ 16 | -f spark-base.Dockerfile \ 17 | -t spark-base . 18 | 19 | docker build \ 20 | -f spark-master.Dockerfile \ 21 | -t spark-master . 22 | 23 | docker build \ 24 | -f spark-worker.Dockerfile \ 25 | -t spark-worker . 26 | 27 | docker build \ 28 | --build-arg spark_version="${SPARK_VERSION}" \ 29 | --build-arg jupyterlab_version="${JUPYTERLAB_VERSION}" \ 30 | -f jupyterlab.Dockerfile \ 31 | -t jupyterlab .
32 | -------------------------------------------------------------------------------- /06-streaming/python/docker/spark/cluster-base.Dockerfile: -------------------------------------------------------------------------------- 1 | # Reference from offical Apache Spark repository Dockerfile for Kubernetes 2 | # https://github.com/apache/spark/blob/master/resource-managers/kubernetes/docker/src/main/dockerfiles/spark/Dockerfile 3 | ARG java_image_tag=17-jre 4 | FROM eclipse-temurin:${java_image_tag} 5 | 6 | # -- Layer: OS + Python 7 | 8 | ARG shared_workspace=/opt/workspace 9 | 10 | RUN mkdir -p ${shared_workspace} && \ 11 | apt-get update -y && \ 12 | apt-get install -y python3 && \ 13 | ln -s /usr/bin/python3 /usr/bin/python && \ 14 | rm -rf /var/lib/apt/lists/* 15 | 16 | ENV SHARED_WORKSPACE=${shared_workspace} 17 | 18 | # -- Runtime 19 | 20 | VOLUME ${shared_workspace} 21 | CMD ["bash"] -------------------------------------------------------------------------------- /06-streaming/python/docker/spark/docker-compose.yml: -------------------------------------------------------------------------------- 1 | version: "3.6" 2 | volumes: 3 | shared-workspace: 4 | name: "hadoop-distributed-file-system" 5 | driver: local 6 | networks: 7 | default: 8 | name: kafka-spark-network 9 | external: true 10 | 11 | services: 12 | jupyterlab: 13 | image: jupyterlab 14 | container_name: jupyterlab 15 | ports: 16 | - 8888:8888 17 | volumes: 18 | - shared-workspace:/opt/workspace 19 | spark-master: 20 | image: spark-master 21 | container_name: spark-master 22 | environment: 23 | SPARK_LOCAL_IP: 'spark-master' 24 | ports: 25 | - 8080:8080 26 | - 7077:7077 27 | volumes: 28 | - shared-workspace:/opt/workspace 29 | spark-worker-1: 30 | image: spark-worker 31 | container_name: spark-worker-1 32 | environment: 33 | - SPARK_WORKER_CORES=1 34 | - SPARK_WORKER_MEMORY=4g 35 | ports: 36 | - 8083:8081 37 | volumes: 38 | - shared-workspace:/opt/workspace 39 | depends_on: 40 | - spark-master 41 | spark-worker-2: 42 | image: spark-worker 43 | container_name: spark-worker-2 44 | environment: 45 | - SPARK_WORKER_CORES=1 46 | - SPARK_WORKER_MEMORY=4g 47 | ports: 48 | - 8084:8081 49 | volumes: 50 | - shared-workspace:/opt/workspace 51 | depends_on: 52 | - spark-master 53 | -------------------------------------------------------------------------------- /06-streaming/python/docker/spark/jupyterlab.Dockerfile: -------------------------------------------------------------------------------- 1 | FROM cluster-base 2 | 3 | # -- Layer: JupyterLab 4 | 5 | ARG spark_version=3.3.1 6 | ARG jupyterlab_version=3.6.1 7 | 8 | RUN apt-get update -y && \ 9 | apt-get install -y python3-pip && \ 10 | pip3 install wget pyspark==${spark_version} jupyterlab==${jupyterlab_version} 11 | 12 | # -- Runtime 13 | 14 | EXPOSE 8888 15 | WORKDIR ${SHARED_WORKSPACE} 16 | CMD jupyter lab --ip=0.0.0.0 --port=8888 --no-browser --allow-root --NotebookApp.token= 17 | -------------------------------------------------------------------------------- /06-streaming/python/docker/spark/spark-base.Dockerfile: -------------------------------------------------------------------------------- 1 | FROM cluster-base 2 | 3 | # -- Layer: Apache Spark 4 | 5 | ARG spark_version=3.3.1 6 | ARG hadoop_version=3 7 | 8 | RUN apt-get update -y && \ 9 | apt-get install -y curl && \ 10 | curl https://archive.apache.org/dist/spark/spark-${spark_version}/spark-${spark_version}-bin-hadoop${hadoop_version}.tgz -o spark.tgz && \ 11 | tar -xf spark.tgz && \ 12 | mv 
spark-${spark_version}-bin-hadoop${hadoop_version} /usr/bin/ && \ 13 | mkdir /usr/bin/spark-${spark_version}-bin-hadoop${hadoop_version}/logs && \ 14 | rm spark.tgz 15 | 16 | ENV SPARK_HOME /usr/bin/spark-${spark_version}-bin-hadoop${hadoop_version} 17 | ENV SPARK_MASTER_HOST spark-master 18 | ENV SPARK_MASTER_PORT 7077 19 | ENV PYSPARK_PYTHON python3 20 | 21 | # -- Runtime 22 | 23 | WORKDIR ${SPARK_HOME} -------------------------------------------------------------------------------- /06-streaming/python/docker/spark/spark-master.Dockerfile: -------------------------------------------------------------------------------- 1 | FROM spark-base 2 | 3 | # -- Runtime 4 | 5 | ARG spark_master_web_ui=8080 6 | 7 | EXPOSE ${spark_master_web_ui} ${SPARK_MASTER_PORT} 8 | CMD bin/spark-class org.apache.spark.deploy.master.Master >> logs/spark-master.out -------------------------------------------------------------------------------- /06-streaming/python/docker/spark/spark-worker.Dockerfile: -------------------------------------------------------------------------------- 1 | FROM spark-base 2 | 3 | # -- Runtime 4 | 5 | ARG spark_worker_web_ui=8081 6 | 7 | EXPOSE ${spark_worker_web_ui} 8 | CMD bin/spark-class org.apache.spark.deploy.worker.Worker spark://${SPARK_MASTER_HOST}:${SPARK_MASTER_PORT} >> logs/spark-worker.out 9 | -------------------------------------------------------------------------------- /06-streaming/python/json_example/consumer.py: -------------------------------------------------------------------------------- 1 | from typing import Dict, List 2 | from json import loads 3 | from kafka import KafkaConsumer 4 | 5 | from ride import Ride 6 | from settings import BOOTSTRAP_SERVERS, KAFKA_TOPIC 7 | 8 | 9 | class JsonConsumer: 10 | def __init__(self, props: Dict): 11 | self.consumer = KafkaConsumer(**props) 12 | 13 | def consume_from_kafka(self, topics: List[str]): 14 | self.consumer.subscribe(topics) 15 | print('Consuming from Kafka started') 16 | print('Available topics to consume: ', self.consumer.subscription()) 17 | while True: 18 | try: 19 | # SIGINT can't be handled when polling, limit timeout to 1 second. 
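# (note) kafka-python's poll() returns a dict of {TopicPartition: [ConsumerRecord, ...]}, which is why the nested loops below iterate over partitions first and then over their records.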
20 | message = self.consumer.poll(1.0) 21 | if message is None or message == {}: 22 | continue 23 | for message_key, message_value in message.items(): 24 | for msg_val in message_value: 25 | print(msg_val.key, msg_val.value) 26 | except KeyboardInterrupt: 27 | break 28 | 29 | self.consumer.close() 30 | 31 | 32 | if __name__ == '__main__': 33 | config = { 34 | 'bootstrap_servers': BOOTSTRAP_SERVERS, 35 | 'auto_offset_reset': 'earliest', 36 | 'enable_auto_commit': True, 37 | 'key_deserializer': lambda key: int(key.decode('utf-8')), 38 | 'value_deserializer': lambda x: loads(x.decode('utf-8'), object_hook=lambda d: Ride.from_dict(d)), 39 | 'group_id': 'consumer.group.id.json-example.1', 40 | } 41 | 42 | json_consumer = JsonConsumer(props=config) 43 | json_consumer.consume_from_kafka(topics=[KAFKA_TOPIC]) 44 | -------------------------------------------------------------------------------- /06-streaming/python/json_example/producer.py: -------------------------------------------------------------------------------- 1 | import csv 2 | import json 3 | from typing import List, Dict 4 | from kafka import KafkaProducer 5 | from kafka.errors import KafkaTimeoutError 6 | 7 | from ride import Ride 8 | from settings import BOOTSTRAP_SERVERS, INPUT_DATA_PATH, KAFKA_TOPIC 9 | 10 | 11 | class JsonProducer(KafkaProducer): 12 | def __init__(self, props: Dict): 13 | self.producer = KafkaProducer(**props) 14 | 15 | @staticmethod 16 | def read_records(resource_path: str): 17 | records = [] 18 | with open(resource_path, 'r') as f: 19 | reader = csv.reader(f) 20 | header = next(reader) # skip the header row 21 | for row in reader: 22 | records.append(Ride(arr=row)) 23 | return records 24 | 25 | def publish_rides(self, topic: str, messages: List[Ride]): 26 | for ride in messages: 27 | try: 28 | record = self.producer.send(topic=topic, key=ride.pu_location_id, value=ride) 29 | print('Record {} successfully produced at offset {}'.format(ride.pu_location_id, record.get().offset)) 30 | except KafkaTimeoutError as e: 31 | print(e.__str__()) 32 | 33 | 34 | if __name__ == '__main__': 35 | # Config Should match with the KafkaProducer expectation 36 | config = { 37 | 'bootstrap_servers': BOOTSTRAP_SERVERS, 38 | 'key_serializer': lambda key: str(key).encode(), 39 | 'value_serializer': lambda x: json.dumps(x.__dict__, default=str).encode('utf-8') 40 | } 41 | producer = JsonProducer(props=config) 42 | rides = producer.read_records(resource_path=INPUT_DATA_PATH) 43 | producer.publish_rides(topic=KAFKA_TOPIC, messages=rides) 44 | -------------------------------------------------------------------------------- /06-streaming/python/json_example/ride.py: -------------------------------------------------------------------------------- 1 | from typing import List, Dict 2 | from decimal import Decimal 3 | from datetime import datetime 4 | 5 | 6 | class Ride: 7 | def __init__(self, arr: List[str]): 8 | self.vendor_id = arr[0] 9 | self.tpep_pickup_datetime = datetime.strptime(arr[1], "%Y-%m-%d %H:%M:%S"), 10 | self.tpep_dropoff_datetime = datetime.strptime(arr[2], "%Y-%m-%d %H:%M:%S"), 11 | self.passenger_count = int(arr[3]) 12 | self.trip_distance = Decimal(arr[4]) 13 | self.rate_code_id = int(arr[5]) 14 | self.store_and_fwd_flag = arr[6] 15 | self.pu_location_id = int(arr[7]) 16 | self.do_location_id = int(arr[8]) 17 | self.payment_type = arr[9] 18 | self.fare_amount = Decimal(arr[10]) 19 | self.extra = Decimal(arr[11]) 20 | self.mta_tax = Decimal(arr[12]) 21 | self.tip_amount = Decimal(arr[13]) 22 | self.tolls_amount = 
Decimal(arr[14]) 23 | self.improvement_surcharge = Decimal(arr[15]) 24 | self.total_amount = Decimal(arr[16]) 25 | self.congestion_surcharge = Decimal(arr[17]) 26 | 27 | @classmethod 28 | def from_dict(cls, d: Dict): 29 | return cls(arr=[ 30 | d['vendor_id'], 31 | d['tpep_pickup_datetime'][0], 32 | d['tpep_dropoff_datetime'][0], 33 | d['passenger_count'], 34 | d['trip_distance'], 35 | d['rate_code_id'], 36 | d['store_and_fwd_flag'], 37 | d['pu_location_id'], 38 | d['do_location_id'], 39 | d['payment_type'], 40 | d['fare_amount'], 41 | d['extra'], 42 | d['mta_tax'], 43 | d['tip_amount'], 44 | d['tolls_amount'], 45 | d['improvement_surcharge'], 46 | d['total_amount'], 47 | d['congestion_surcharge'], 48 | ] 49 | ) 50 | 51 | def __repr__(self): 52 | return f'{self.__class__.__name__}: {self.__dict__}' 53 | -------------------------------------------------------------------------------- /06-streaming/python/json_example/settings.py: -------------------------------------------------------------------------------- 1 | INPUT_DATA_PATH = '../resources/rides.csv' 2 | 3 | BOOTSTRAP_SERVERS = ['localhost:9092'] 4 | KAFKA_TOPIC = 'rides_json' 5 | -------------------------------------------------------------------------------- /06-streaming/python/redpanda_example/consumer.py: -------------------------------------------------------------------------------- 1 | import os 2 | from typing import Dict, List 3 | from json import loads 4 | from kafka import KafkaConsumer 5 | 6 | from ride import Ride 7 | from settings import BOOTSTRAP_SERVERS, KAFKA_TOPIC 8 | 9 | 10 | class JsonConsumer: 11 | def __init__(self, props: Dict): 12 | self.consumer = KafkaConsumer(**props) 13 | 14 | def consume_from_kafka(self, topics: List[str]): 15 | self.consumer.subscribe(topics) 16 | print('Consuming from Kafka started') 17 | print('Available topics to consume: ', self.consumer.subscription()) 18 | while True: 19 | try: 20 | # SIGINT can't be handled when polling, limit timeout to 1 second. 21 | message = self.consumer.poll(1.0) 22 | if message is None or message == {}: 23 | continue 24 | for message_key, message_value in message.items(): 25 | for msg_val in message_value: 26 | print(msg_val.key, msg_val.value) 27 | except KeyboardInterrupt: 28 | break 29 | 30 | self.consumer.close() 31 | 32 | 33 | if __name__ == '__main__': 34 | config = { 35 | 'bootstrap_servers': BOOTSTRAP_SERVERS, 36 | 'auto_offset_reset': 'earliest', 37 | 'enable_auto_commit': True, 38 | 'key_deserializer': lambda key: int(key.decode('utf-8')), 39 | 'value_deserializer': lambda x: loads(x.decode('utf-8'), object_hook=lambda d: Ride.from_dict(d)), 40 | 'group_id': 'consumer.group.id.json-example.1', 41 | } 42 | 43 | json_consumer = JsonConsumer(props=config) 44 | json_consumer.consume_from_kafka(topics=[KAFKA_TOPIC]) 45 | 46 | 47 | # There's no schema in JSON, so if the schema changes (a column is removed, a new one is added, or a data type changes), the Ride class would still work and producing and consuming messages would still run without a hitch. 48 | # But the issue surfaces in the downstream analytics: the dataset would no longer have that column, the dashboards built on it would fail, and the trust in our data and processes would erode.
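# A minimal illustration of this risk (hypothetical payloads, not part of the original example):
#
#   import json
#   old_msg = json.loads('{"vendor_id": 1, "total_amount": 9.5}')
#   new_msg = json.loads('{"vendor_id": 1}')            # a column silently dropped by the producer
#   print(old_msg.get("total_amount"))                  # 9.5
#   print(new_msg.get("total_amount"))                  # None -> downstream aggregations quietly break
#
# A format with a schema registry and compatibility checks (see the Avro example in avro_example/)
# can catch such breaking changes when the schema is registered, instead of letting them surface
# in the analytics layer.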
-------------------------------------------------------------------------------- /06-streaming/python/redpanda_example/docker-compose.yaml: -------------------------------------------------------------------------------- 1 | version: '3.7' 2 | services: 3 | # Redpanda cluster 4 | redpanda-1: 5 | image: docker.redpanda.com/redpandadata/redpanda:v23.2.26 6 | container_name: redpanda-1 7 | command: 8 | - redpanda 9 | - start 10 | - --smp 11 | - '1' 12 | - --reserve-memory 13 | - 0M 14 | - --overprovisioned 15 | - --node-id 16 | - '1' 17 | - --kafka-addr 18 | - PLAINTEXT://0.0.0.0:29092,OUTSIDE://0.0.0.0:9092 19 | - --advertise-kafka-addr 20 | - PLAINTEXT://redpanda-1:29092,OUTSIDE://localhost:9092 21 | - --pandaproxy-addr 22 | - PLAINTEXT://0.0.0.0:28082,OUTSIDE://0.0.0.0:8082 23 | - --advertise-pandaproxy-addr 24 | - PLAINTEXT://redpanda-1:28082,OUTSIDE://localhost:8082 25 | - --rpc-addr 26 | - 0.0.0.0:33145 27 | - --advertise-rpc-addr 28 | - redpanda-1:33145 29 | ports: 30 | # - 8081:8081 31 | - 8082:8082 32 | - 9092:9092 33 | - 9644:9644 34 | - 28082:28082 35 | - 29092:29092 36 | 37 | # Want a two node Redpanda cluster? Uncomment this block :) 38 | # redpanda-2: 39 | # image: docker.redpanda.com/redpandadata/redpanda:v23.1.1 40 | # container_name: redpanda-2 41 | # command: 42 | # - redpanda 43 | # - start 44 | # - --smp 45 | # - '1' 46 | # - --reserve-memory 47 | # - 0M 48 | # - --overprovisioned 49 | # - --node-id 50 | # - '2' 51 | # - --seeds 52 | # - redpanda-1:33145 53 | # - --kafka-addr 54 | # - PLAINTEXT://0.0.0.0:29093,OUTSIDE://0.0.0.0:9093 55 | # - --advertise-kafka-addr 56 | # - PLAINTEXT://redpanda-2:29093,OUTSIDE://localhost:9093 57 | # - --pandaproxy-addr 58 | # - PLAINTEXT://0.0.0.0:28083,OUTSIDE://0.0.0.0:8083 59 | # - --advertise-pandaproxy-addr 60 | # - PLAINTEXT://redpanda-2:28083,OUTSIDE://localhost:8083 61 | # - --rpc-addr 62 | # - 0.0.0.0:33146 63 | # - --advertise-rpc-addr 64 | # - redpanda-2:33146 65 | # ports: 66 | # - 8083:8083 67 | # - 9093:9093 68 | 69 | redpanda-console: 70 | image: docker.redpanda.com/redpandadata/console:v2.2.2 71 | container_name: redpanda-console 72 | entrypoint: /bin/sh 73 | command: -c "echo \"$$CONSOLE_CONFIG_FILE\" > /tmp/config.yml; /app/console" 74 | environment: 75 | CONFIG_FILEPATH: /tmp/config.yml 76 | CONSOLE_CONFIG_FILE: | 77 | kafka: 78 | brokers: ["redpanda-1:29092"] 79 | schemaRegistry: 80 | enabled: false 81 | redpanda: 82 | adminApi: 83 | enabled: true 84 | urls: ["http://redpanda-1:9644"] 85 | connect: 86 | enabled: false 87 | ports: 88 | - 8080:8080 89 | depends_on: 90 | - redpanda-1 91 | -------------------------------------------------------------------------------- /06-streaming/python/redpanda_example/producer.py: -------------------------------------------------------------------------------- 1 | import csv 2 | import json 3 | from typing import List, Dict 4 | from kafka import KafkaProducer 5 | from kafka.errors import KafkaTimeoutError 6 | 7 | from ride import Ride 8 | from settings import BOOTSTRAP_SERVERS, INPUT_DATA_PATH, KAFKA_TOPIC 9 | 10 | 11 | class JsonProducer(KafkaProducer): 12 | def __init__(self, props: Dict): 13 | self.producer = KafkaProducer(**props) 14 | 15 | @staticmethod 16 | def read_records(resource_path: str): 17 | records = [] 18 | with open(resource_path, 'r') as f: 19 | reader = csv.reader(f) 20 | header = next(reader) # skip the header row 21 | for row in reader: 22 | records.append(Ride(arr=row)) 23 | return records 24 | 25 | def publish_rides(self, topic: str, messages: List[Ride]): 26 | 
for ride in messages: 27 | try: 28 | record = self.producer.send(topic=topic, key=ride.pu_location_id, value=ride) 29 | print('Record {} successfully produced at offset {}'.format(ride.pu_location_id, record.get().offset)) 30 | except KafkaTimeoutError as e: 31 | print(e.__str__()) 32 | 33 | 34 | if __name__ == '__main__': 35 | # Config Should match with the KafkaProducer expectation 36 | # kafka expects binary format for the key-value pair 37 | config = { 38 | 'bootstrap_servers': BOOTSTRAP_SERVERS, 39 | 'key_serializer': lambda key: str(key).encode(), 40 | 'value_serializer': lambda x: json.dumps(x.__dict__, default=str).encode('utf-8') 41 | } 42 | producer = JsonProducer(props=config) 43 | rides = producer.read_records(resource_path=INPUT_DATA_PATH) 44 | producer.publish_rides(topic=KAFKA_TOPIC, messages=rides) 45 | -------------------------------------------------------------------------------- /06-streaming/python/redpanda_example/ride.py: -------------------------------------------------------------------------------- 1 | from typing import List, Dict 2 | from decimal import Decimal 3 | from datetime import datetime 4 | 5 | 6 | class Ride: 7 | def __init__(self, arr: List[str]): 8 | self.vendor_id = arr[0] 9 | self.tpep_pickup_datetime = datetime.strptime(arr[1], "%Y-%m-%d %H:%M:%S"), 10 | self.tpep_dropoff_datetime = datetime.strptime(arr[2], "%Y-%m-%d %H:%M:%S"), 11 | self.passenger_count = int(arr[3]) 12 | self.trip_distance = Decimal(arr[4]) 13 | self.rate_code_id = int(arr[5]) 14 | self.store_and_fwd_flag = arr[6] 15 | self.pu_location_id = int(arr[7]) 16 | self.do_location_id = int(arr[8]) 17 | self.payment_type = arr[9] 18 | self.fare_amount = Decimal(arr[10]) 19 | self.extra = Decimal(arr[11]) 20 | self.mta_tax = Decimal(arr[12]) 21 | self.tip_amount = Decimal(arr[13]) 22 | self.tolls_amount = Decimal(arr[14]) 23 | self.improvement_surcharge = Decimal(arr[15]) 24 | self.total_amount = Decimal(arr[16]) 25 | self.congestion_surcharge = Decimal(arr[17]) 26 | 27 | @classmethod 28 | def from_dict(cls, d: Dict): 29 | return cls(arr=[ 30 | d['vendor_id'], 31 | d['tpep_pickup_datetime'][0], 32 | d['tpep_dropoff_datetime'][0], 33 | d['passenger_count'], 34 | d['trip_distance'], 35 | d['rate_code_id'], 36 | d['store_and_fwd_flag'], 37 | d['pu_location_id'], 38 | d['do_location_id'], 39 | d['payment_type'], 40 | d['fare_amount'], 41 | d['extra'], 42 | d['mta_tax'], 43 | d['tip_amount'], 44 | d['tolls_amount'], 45 | d['improvement_surcharge'], 46 | d['total_amount'], 47 | d['congestion_surcharge'], 48 | ] 49 | ) 50 | 51 | def __repr__(self): 52 | return f'{self.__class__.__name__}: {self.__dict__}' 53 | -------------------------------------------------------------------------------- /06-streaming/python/redpanda_example/settings.py: -------------------------------------------------------------------------------- 1 | INPUT_DATA_PATH = '../resources/rides.csv' 2 | 3 | BOOTSTRAP_SERVERS = ['localhost:9092'] 4 | KAFKA_TOPIC = 'rides_json' 5 | -------------------------------------------------------------------------------- /06-streaming/python/requirements.txt: -------------------------------------------------------------------------------- 1 | kafka-python==1.4.6 2 | confluent_kafka 3 | requests 4 | avro 5 | faust 6 | fastavro 7 | -------------------------------------------------------------------------------- /06-streaming/python/resources/schemas/taxi_ride_key.avsc: -------------------------------------------------------------------------------- 1 | { 2 | "namespace": 
"com.datatalksclub.taxi", 3 | "type": "record", 4 | "name": "RideRecordKey", 5 | "fields": [ 6 | { 7 | "name": "vendor_id", 8 | "type": "int" 9 | } 10 | ] 11 | } -------------------------------------------------------------------------------- /06-streaming/python/resources/schemas/taxi_ride_value.avsc: -------------------------------------------------------------------------------- 1 | { 2 | "namespace": "com.datatalksclub.taxi", 3 | "type": "record", 4 | "name": "RideRecord", 5 | "fields": [ 6 | { 7 | "name": "vendor_id", 8 | "type": "int" 9 | }, 10 | { 11 | "name": "passenger_count", 12 | "type": "int" 13 | }, 14 | { 15 | "name": "trip_distance", 16 | "type": "float" 17 | }, 18 | { 19 | "name": "payment_type", 20 | "type": "int" 21 | }, 22 | { 23 | "name": "total_amount", 24 | "type": "float" 25 | } 26 | ] 27 | } -------------------------------------------------------------------------------- /06-streaming/python/streams-example/faust/branch_price.py: -------------------------------------------------------------------------------- 1 | import faust 2 | from taxi_rides import TaxiRide 3 | from faust import current_event 4 | 5 | app = faust.App('datatalksclub.stream.v3', broker='kafka://localhost:9092', consumer_auto_offset_reset="earliest") 6 | topic = app.topic('datatalkclub.yellow_taxi_ride.json', value_type=TaxiRide) 7 | 8 | high_amount_rides = app.topic('datatalks.yellow_taxi_rides.high_amount') 9 | low_amount_rides = app.topic('datatalks.yellow_taxi_rides.low_amount') 10 | 11 | 12 | @app.agent(topic) 13 | async def process(stream): 14 | async for event in stream: 15 | if event.total_amount >= 40.0: 16 | await current_event().forward(high_amount_rides) 17 | else: 18 | await current_event().forward(low_amount_rides) 19 | 20 | if __name__ == '__main__': 21 | app.main() 22 | -------------------------------------------------------------------------------- /06-streaming/python/streams-example/faust/producer_taxi_json.py: -------------------------------------------------------------------------------- 1 | import csv 2 | from json import dumps 3 | from kafka import KafkaProducer 4 | from time import sleep 5 | 6 | 7 | producer = KafkaProducer(bootstrap_servers=['localhost:9092'], 8 | key_serializer=lambda x: dumps(x).encode('utf-8'), 9 | value_serializer=lambda x: dumps(x).encode('utf-8')) 10 | 11 | file = open('../../resources/rides.csv') 12 | 13 | csvreader = csv.reader(file) 14 | header = next(csvreader) 15 | for row in csvreader: 16 | key = {"vendorId": int(row[0])} 17 | value = {"vendorId": int(row[0]), "passenger_count": int(row[3]), "trip_distance": float(row[4]), "payment_type": int(row[9]), "total_amount": float(row[16])} 18 | producer.send('datatalkclub.yellow_taxi_ride.json', value=value, key=key) 19 | print("producing") 20 | sleep(1) -------------------------------------------------------------------------------- /06-streaming/python/streams-example/faust/stream.py: -------------------------------------------------------------------------------- 1 | import faust 2 | from taxi_rides import TaxiRide 3 | 4 | 5 | app = faust.App('datatalksclub.stream.v2', broker='kafka://localhost:9092') 6 | topic = app.topic('datatalkclub.yellow_taxi_ride.json', value_type=TaxiRide) 7 | 8 | 9 | @app.agent(topic) 10 | async def start_reading(records): 11 | async for record in records: 12 | print(record) 13 | 14 | 15 | if __name__ == '__main__': 16 | app.main() 17 | -------------------------------------------------------------------------------- 
/06-streaming/python/streams-example/faust/stream_count_vendor_trips.py: -------------------------------------------------------------------------------- 1 | import faust 2 | from taxi_rides import TaxiRide 3 | 4 | 5 | app = faust.App('datatalksclub.stream.v2', broker='kafka://localhost:9092') 6 | topic = app.topic('datatalkclub.yellow_taxi_ride.json', value_type=TaxiRide) 7 | 8 | vendor_rides = app.Table('vendor_rides', default=int) 9 | 10 | 11 | @app.agent(topic) 12 | async def process(stream): 13 | async for event in stream.group_by(TaxiRide.vendorId): 14 | vendor_rides[event.vendorId] += 1 15 | 16 | if __name__ == '__main__': 17 | app.main() 18 | -------------------------------------------------------------------------------- /06-streaming/python/streams-example/faust/taxi_rides.py: -------------------------------------------------------------------------------- 1 | import faust 2 | 3 | 4 | class TaxiRide(faust.Record, validation=True): 5 | vendorId: str 6 | passenger_count: int 7 | trip_distance: float 8 | payment_type: int 9 | total_amount: float 10 | -------------------------------------------------------------------------------- /06-streaming/python/streams-example/faust/windowing.py: -------------------------------------------------------------------------------- 1 | from datetime import timedelta 2 | import faust 3 | from taxi_rides import TaxiRide 4 | 5 | 6 | app = faust.App('datatalksclub.stream.v2', broker='kafka://localhost:9092') 7 | topic = app.topic('datatalkclub.yellow_taxi_ride.json', value_type=TaxiRide) 8 | 9 | vendor_rides = app.Table('vendor_rides_windowed', default=int).tumbling( 10 | timedelta(minutes=1), 11 | expires=timedelta(hours=1), 12 | ) 13 | 14 | 15 | @app.agent(topic) 16 | async def process(stream): 17 | async for event in stream.group_by(TaxiRide.vendorId): 18 | vendor_rides[event.vendorId] += 1 19 | 20 | 21 | if __name__ == '__main__': 22 | app.main() 23 | -------------------------------------------------------------------------------- /06-streaming/python/streams-example/pyspark/README.md: -------------------------------------------------------------------------------- 1 | 2 | # Running PySpark Streaming 3 | 4 | #### Prerequisite 5 | 6 | Ensure your Kafka and Spark services are up and running by following the [docker setup readme](./../../docker/README.md). 7 | It is important to create the network and volume as described in that document.
Therefore, please ensure your volume and network are created correctly: 8 | 9 | ```bash 10 | docker volume ls # should list hadoop-distributed-file-system 11 | docker network ls # should list kafka-spark-network 12 | ``` 13 | 14 | 15 | ### Running Producer and Consumer 16 | ```bash 17 | # Run producer 18 | python3 producer.py 19 | 20 | # Run consumer with default settings 21 | python3 consumer.py 22 | # Run consumer for a specific topic 23 | python3 consumer.py --topic <topic_name> 24 | ``` 25 | 26 | ### Running Streaming Script 27 | 28 | The spark-submit.sh script ensures the necessary JARs are installed before running streaming.py 29 | 30 | ```bash 31 | ./spark-submit.sh streaming.py 32 | ``` 33 | 34 | ### Additional Resources 35 | - [Structured Streaming Programming Guide](https://spark.apache.org/docs/latest/structured-streaming-programming-guide.html#structured-streaming-programming-guide) 36 | - [Structured Streaming + Kafka Integration](https://spark.apache.org/docs/latest/structured-streaming-kafka-integration.html#structured-streaming-kafka-integration-guide-kafka-broker-versio) 37 | -------------------------------------------------------------------------------- /06-streaming/python/streams-example/pyspark/consumer.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | from typing import Dict, List 3 | from kafka import KafkaConsumer 4 | 5 | from settings import BOOTSTRAP_SERVERS, CONSUME_TOPIC_RIDES_CSV 6 | 7 | 8 | class RideCSVConsumer: 9 | def __init__(self, props: Dict): 10 | self.consumer = KafkaConsumer(**props) 11 | 12 | def consume_from_kafka(self, topics: List[str]): 13 | self.consumer.subscribe(topics=topics) 14 | print('Consuming from Kafka started') 15 | print('Available topics to consume: ', self.consumer.subscription()) 16 | while True: 17 | try: 18 | # SIGINT can't be handled when polling, limit timeout to 1 second.
19 | msg = self.consumer.poll(1.0) 20 | if msg is None or msg == {}: 21 | continue 22 | for msg_key, msg_values in msg.items(): 23 | for msg_val in msg_values: 24 | print(f'Key:{msg_val.key}-type({type(msg_val.key)}), ' 25 | f'Value:{msg_val.value}-type({type(msg_val.value)})') 26 | except KeyboardInterrupt: 27 | break 28 | 29 | self.consumer.close() 30 | 31 | 32 | if __name__ == '__main__': 33 | parser = argparse.ArgumentParser(description='Kafka Consumer') 34 | parser.add_argument('--topic', type=str, default=CONSUME_TOPIC_RIDES_CSV) 35 | args = parser.parse_args() 36 | 37 | topic = args.topic 38 | config = { 39 | 'bootstrap_servers': [BOOTSTRAP_SERVERS], 40 | 'auto_offset_reset': 'earliest', 41 | 'enable_auto_commit': True, 42 | 'key_deserializer': lambda key: int(key.decode('utf-8')), 43 | 'value_deserializer': lambda value: value.decode('utf-8'), 44 | 'group_id': 'consumer.group.id.csv-example.1', 45 | } 46 | csv_consumer = RideCSVConsumer(props=config) 47 | csv_consumer.consume_from_kafka(topics=[topic]) 48 | -------------------------------------------------------------------------------- /06-streaming/python/streams-example/pyspark/producer.py: -------------------------------------------------------------------------------- 1 | import csv 2 | from time import sleep 3 | from typing import Dict 4 | from kafka import KafkaProducer 5 | 6 | from settings import BOOTSTRAP_SERVERS, INPUT_DATA_PATH, PRODUCE_TOPIC_RIDES_CSV 7 | 8 | 9 | def delivery_report(err, msg): 10 | if err is not None: 11 | print("Delivery failed for record {}: {}".format(msg.key(), err)) 12 | return 13 | print('Record {} successfully produced to {} [{}] at offset {}'.format( 14 | msg.key(), msg.topic(), msg.partition(), msg.offset())) 15 | 16 | 17 | class RideCSVProducer: 18 | def __init__(self, props: Dict): 19 | self.producer = KafkaProducer(**props) 20 | # self.producer = Producer(producer_props) 21 | 22 | @staticmethod 23 | def read_records(resource_path: str): 24 | records, ride_keys = [], [] 25 | i = 0 26 | with open(resource_path, 'r') as f: 27 | reader = csv.reader(f) 28 | header = next(reader) # skip the header 29 | for row in reader: 30 | # vendor_id, passenger_count, trip_distance, payment_type, total_amount 31 | records.append(f'{row[0]}, {row[1]}, {row[2]}, {row[3]}, {row[4]}, {row[9]}, {row[16]}') 32 | ride_keys.append(str(row[0])) 33 | i += 1 34 | if i == 5: 35 | break 36 | return zip(ride_keys, records) 37 | 38 | def publish(self, topic: str, records: [str, str]): 39 | for key_value in records: 40 | key, value = key_value 41 | try: 42 | self.producer.send(topic=topic, key=key, value=value) 43 | print(f"Producing record for ") 44 | except KeyboardInterrupt: 45 | break 46 | except Exception as e: 47 | print(f"Exception while producing record - {value}: {e}") 48 | 49 | self.producer.flush() 50 | sleep(1) 51 | 52 | 53 | if __name__ == "__main__": 54 | config = { 55 | 'bootstrap_servers': [BOOTSTRAP_SERVERS], 56 | 'key_serializer': lambda x: x.encode('utf-8'), 57 | 'value_serializer': lambda x: x.encode('utf-8') 58 | } 59 | producer = RideCSVProducer(props=config) 60 | ride_records = producer.read_records(resource_path=INPUT_DATA_PATH) 61 | print(ride_records) 62 | producer.publish(topic=PRODUCE_TOPIC_RIDES_CSV, records=ride_records) 63 | -------------------------------------------------------------------------------- /06-streaming/python/streams-example/pyspark/settings.py: -------------------------------------------------------------------------------- 1 | import pyspark.sql.types as T 2 | 3 | 
INPUT_DATA_PATH = '../../resources/rides.csv' 4 | BOOTSTRAP_SERVERS = 'localhost:9092' 5 | 6 | TOPIC_WINDOWED_VENDOR_ID_COUNT = 'vendor_counts_windowed' 7 | 8 | PRODUCE_TOPIC_RIDES_CSV = CONSUME_TOPIC_RIDES_CSV = 'rides_csv' 9 | 10 | RIDE_SCHEMA = T.StructType( 11 | [T.StructField("vendor_id", T.IntegerType()), 12 | T.StructField('tpep_pickup_datetime', T.TimestampType()), 13 | T.StructField('tpep_dropoff_datetime', T.TimestampType()), 14 | T.StructField("passenger_count", T.IntegerType()), 15 | T.StructField("trip_distance", T.FloatType()), 16 | T.StructField("payment_type", T.IntegerType()), 17 | T.StructField("total_amount", T.FloatType()), 18 | ]) 19 | -------------------------------------------------------------------------------- /06-streaming/python/streams-example/pyspark/spark-submit.sh: -------------------------------------------------------------------------------- 1 | # Submit Python code to SparkMaster 2 | 3 | if [ $# -lt 1 ] 4 | then 5 | echo "Usage: $0 [ executor-memory ]" 6 | echo "(specify memory in string format such as \"512M\" or \"2G\")" 7 | exit 1 8 | fi 9 | PYTHON_JOB=$1 10 | 11 | if [ -z $2 ] 12 | then 13 | EXEC_MEM="1G" 14 | else 15 | EXEC_MEM=$2 16 | fi 17 | spark-submit --master spark://localhost:7077 --num-executors 2 \ 18 | --executor-memory $EXEC_MEM --executor-cores 1 \ 19 | --packages org.apache.spark:spark-sql-kafka-0-10_2.12:3.3.1,org.apache.spark:spark-avro_2.12:3.3.1,org.apache.spark:spark-streaming-kafka-0-10_2.12:3.3.1 \ 20 | $PYTHON_JOB -------------------------------------------------------------------------------- /06-streaming/python/streams-example/redpanda/README.md: -------------------------------------------------------------------------------- 1 | 2 | # Running PySpark Streaming with Redpanda 3 | 4 | ### 1. Prerequisite 5 | 6 | It is important to create the network and volume as described in the document. Therefore, please ensure your volume and network are created correctly. 7 | 8 | ```bash 9 | docker volume ls # should list hadoop-distributed-file-system 10 | docker network ls # should list kafka-spark-network 11 | ``` 12 | 13 | ### 2. Create Docker Network & Volume 14 | 15 | If you have not followed any other examples and the `ls` steps above show no output, create them now.
16 | 17 | ```bash 18 | # Create Network 19 | docker network create kafka-spark-network 20 | 21 | # Create Volume 22 | docker volume create --name=hadoop-distributed-file-system 23 | ``` 24 | 25 | ### Running Producer and Consumer 26 | ```bash 27 | # Run producer 28 | python producer.py 29 | 30 | # Run consumer with default settings 31 | python consumer.py 32 | # Run consumer for a specific topic 33 | python consumer.py --topic <topic_name> 34 | ``` 35 | 36 | ### Running Streaming Script 37 | 38 | The spark-submit.sh script ensures the necessary JARs are installed before running streaming.py 39 | 40 | ```bash 41 | ./spark-submit.sh streaming.py 42 | ``` 43 | 44 | ### Additional Resources 45 | - [Structured Streaming Programming Guide](https://spark.apache.org/docs/latest/structured-streaming-programming-guide.html#structured-streaming-programming-guide) 46 | - [Structured Streaming + Kafka Integration](https://spark.apache.org/docs/latest/structured-streaming-kafka-integration.html#structured-streaming-kafka-integration-guide-kafka-broker-versio) 47 | -------------------------------------------------------------------------------- /06-streaming/python/streams-example/redpanda/consumer.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | from typing import Dict, List 3 | from kafka import KafkaConsumer 4 | 5 | from settings import BOOTSTRAP_SERVERS, CONSUME_TOPIC_RIDES_CSV 6 | 7 | 8 | class RideCSVConsumer: 9 | def __init__(self, props: Dict): 10 | self.consumer = KafkaConsumer(**props) 11 | 12 | def consume_from_kafka(self, topics: List[str]): 13 | self.consumer.subscribe(topics=topics) 14 | print('Consuming from Kafka started') 15 | print('Available topics to consume: ', self.consumer.subscription()) 16 | while True: 17 | try: 18 | # SIGINT can't be handled when polling, limit timeout to 1 second.
19 | msg = self.consumer.poll(1.0) 20 | if msg is None or msg == {}: 21 | continue 22 | for msg_key, msg_values in msg.items(): 23 | for msg_val in msg_values: 24 | print(f'Key:{msg_val.key}-type({type(msg_val.key)}), ' 25 | f'Value:{msg_val.value}-type({type(msg_val.value)})') 26 | except KeyboardInterrupt: 27 | break 28 | 29 | self.consumer.close() 30 | 31 | 32 | if __name__ == '__main__': 33 | parser = argparse.ArgumentParser(description='Kafka Consumer') 34 | parser.add_argument('--topic', type=str, default=CONSUME_TOPIC_RIDES_CSV) 35 | args = parser.parse_args() 36 | 37 | topic = args.topic 38 | config = { 39 | 'bootstrap_servers': [BOOTSTRAP_SERVERS], 40 | 'auto_offset_reset': 'earliest', 41 | 'enable_auto_commit': True, 42 | 'key_deserializer': lambda key: int(key.decode('utf-8')), 43 | 'value_deserializer': lambda value: value.decode('utf-8'), 44 | 'group_id': 'consumer.group.id.csv-example.1', 45 | } 46 | csv_consumer = RideCSVConsumer(props=config) 47 | csv_consumer.consume_from_kafka(topics=[topic]) 48 | -------------------------------------------------------------------------------- /06-streaming/python/streams-example/redpanda/producer.py: -------------------------------------------------------------------------------- 1 | import csv 2 | from time import sleep 3 | from typing import Dict 4 | from kafka import KafkaProducer 5 | 6 | from settings import BOOTSTRAP_SERVERS, INPUT_DATA_PATH, PRODUCE_TOPIC_RIDES_CSV 7 | 8 | 9 | def delivery_report(err, msg): 10 | if err is not None: 11 | print("Delivery failed for record {}: {}".format(msg.key(), err)) 12 | return 13 | print('Record {} successfully produced to {} [{}] at offset {}'.format( 14 | msg.key(), msg.topic(), msg.partition(), msg.offset())) 15 | 16 | 17 | class RideCSVProducer: 18 | def __init__(self, props: Dict): 19 | self.producer = KafkaProducer(**props) 20 | # self.producer = Producer(producer_props) 21 | 22 | @staticmethod 23 | def read_records(resource_path: str): 24 | records, ride_keys = [], [] 25 | i = 0 26 | with open(resource_path, 'r') as f: 27 | reader = csv.reader(f) 28 | header = next(reader) # skip the header 29 | for row in reader: 30 | # vendor_id, passenger_count, trip_distance, payment_type, total_amount 31 | records.append(f'{row[0]}, {row[1]}, {row[2]}, {row[3]}, {row[4]}, {row[9]}, {row[16]}') 32 | ride_keys.append(str(row[0])) 33 | i += 1 34 | if i == 5: 35 | break 36 | return zip(ride_keys, records) 37 | 38 | def publish(self, topic: str, records: [str, str]): 39 | for key_value in records: 40 | key, value = key_value 41 | try: 42 | self.producer.send(topic=topic, key=key, value=value) 43 | print(f"Producing record for ") 44 | except KeyboardInterrupt: 45 | break 46 | except Exception as e: 47 | print(f"Exception while producing record - {value}: {e}") 48 | 49 | self.producer.flush() 50 | sleep(1) 51 | 52 | 53 | if __name__ == "__main__": 54 | config = { 55 | 'bootstrap_servers': [BOOTSTRAP_SERVERS], 56 | 'key_serializer': lambda x: x.encode('utf-8'), 57 | 'value_serializer': lambda x: x.encode('utf-8') 58 | } 59 | producer = RideCSVProducer(props=config) 60 | ride_records = producer.read_records(resource_path=INPUT_DATA_PATH) 61 | print(ride_records) 62 | producer.publish(topic=PRODUCE_TOPIC_RIDES_CSV, records=ride_records) 63 | -------------------------------------------------------------------------------- /06-streaming/python/streams-example/redpanda/settings.py: -------------------------------------------------------------------------------- 1 | import pyspark.sql.types as T 2 | 3 | 
INPUT_DATA_PATH = '../../resources/rides.csv' 4 | BOOTSTRAP_SERVERS = 'localhost:9092' 5 | 6 | TOPIC_WINDOWED_VENDOR_ID_COUNT = 'vendor_counts_windowed' 7 | 8 | PRODUCE_TOPIC_RIDES_CSV = CONSUME_TOPIC_RIDES_CSV = 'rides_csv' 9 | 10 | RIDE_SCHEMA = T.StructType( 11 | [T.StructField("vendor_id", T.IntegerType()), 12 | T.StructField('tpep_pickup_datetime', T.TimestampType()), 13 | T.StructField('tpep_dropoff_datetime', T.TimestampType()), 14 | T.StructField("passenger_count", T.IntegerType()), 15 | T.StructField("trip_distance", T.FloatType()), 16 | T.StructField("payment_type", T.IntegerType()), 17 | T.StructField("total_amount", T.FloatType()), 18 | ]) 19 | -------------------------------------------------------------------------------- /06-streaming/python/streams-example/redpanda/spark-submit.sh: -------------------------------------------------------------------------------- 1 | # Submit Python code to SparkMaster 2 | 3 | if [ $# -lt 1 ] 4 | then 5 | echo "Usage: $0 [ executor-memory ]" 6 | echo "(specify memory in string format such as \"512M\" or \"2G\")" 7 | exit 1 8 | fi 9 | PYTHON_JOB=$1 10 | 11 | if [ -z $2 ] 12 | then 13 | EXEC_MEM="1G" 14 | else 15 | EXEC_MEM=$2 16 | fi 17 | spark-submit --master spark://localhost:7077 --num-executors 2 \ 18 | --executor-memory $EXEC_MEM --executor-cores 1 \ 19 | --packages org.apache.spark:spark-sql-kafka-0-10_2.12:3.5.1,org.apache.spark:spark-avro_2.12:3.5.1,org.apache.spark:spark-streaming-kafka-0-10_2.12:3.5.1 \ 20 | $PYTHON_JOB 21 | -------------------------------------------------------------------------------- /after-sign-up.md: -------------------------------------------------------------------------------- 1 | ## Thank you! 2 | 3 | Thanks for signing up for the course. 4 | 5 | The process of adding you to the mailing list is not automated yet, 6 | but you will hear from us closer to the course start. 7 | 8 | To make sure you don't miss any announcements 9 | 10 | - Register in [DataTalks.Club's Slack](https://datatalks.club/slack.html) and 11 | join the [`#course-data-engineering`](https://app.slack.com/client/T01ATQK62F8/C01FABYF2RG) channel 12 | - Join the [course Telegram channel with announcements](https://t.me/dezoomcamp) 13 | - Subscribe to [DataTalks.Club's YouTube channel](https://www.youtube.com/c/DataTalksClub) and check 14 | [the course playlist](https://www.youtube.com/playlist?list=PL3MmuxUbc_hJed7dXYoJw8DoCuVHhGEQb) 15 | - Subscribe to our [public Google Calendar](https://calendar.google.com/calendar/?cid=ZXIxcjA1M3ZlYjJpcXU0dTFmaG02MzVxMG9AZ3JvdXAuY2FsZW5kYXIuZ29vZ2xlLmNvbQ) (it works from Desktop only) 16 | 17 | See you in January! 18 | -------------------------------------------------------------------------------- /certificates.md: -------------------------------------------------------------------------------- 1 | ## Getting your certificate 2 | 3 | Congratulations on finishing the course! 4 | 5 | You can find your certificate in your enrollment profile (you need to be logged in): 6 | 7 | * For the 2025 edition, it's https://courses.datatalks.club/de-zoomcamp-2025/enrollment 8 | 9 | If you can't find a certificate in your profile, it means you didn't pass the project. 10 | If you believe it's a mistake, write in the course channel in Slack. 11 | 12 | 13 | ## Adding to LinkedIn 14 | 15 | You can add your certificate to LinkedIn: 16 | 17 | * Log in to your LinkedIn account, then go to your profile. 
18 | * On the right, in the "Add profile" section dropdown, choose "Background" and then select the drop-down triangle next to "Licenses & Certifications". 19 | * In "Name", enter "Data Engineering Zoomcamp". 20 | * In "Issuing Organization", enter "DataTalksClub". 21 | * (Optional) In "Issue Date", enter the time when the certificate was created. 22 | * (Optional) Select the checkbox This certification does not expire. 23 | * Put your certificate ID. 24 | * In "Certification URL", enter the URL for your certificate. 25 | 26 | [Adapted from here](https://support.edx.org/hc/en-us/articles/206501938-How-can-I-add-my-certificate-to-my-LinkedIn-profile-) 27 | -------------------------------------------------------------------------------- /cohorts/2022/README.md: -------------------------------------------------------------------------------- 1 | 2 | ### 2022 Cohort 3 | 4 | * **Start**: 17 January 2022 5 | * **Registration link**: https://airtable.com/shr6oVXeQvSI5HuWD 6 | * [Leaderboard](https://docs.google.com/spreadsheets/d/e/2PACX-1vR9oQiYnAVvzL4dagnhvp0sngqagF0AceD0FGjhS-dnzMTBzNQIal3-hOgkTibVQvfuqbQ69b0fvRnf/pubhtml) 7 | * Subscribe to our [public Google Calendar](https://calendar.google.com/calendar/?cid=ZXIxcjA1M3ZlYjJpcXU0dTFmaG02MzVxMG9AZ3JvdXAuY2FsZW5kYXIuZ29vZ2xlLmNvbQ) (it works from Desktop only) 8 | -------------------------------------------------------------------------------- /cohorts/2022/project.md: -------------------------------------------------------------------------------- 1 | ## Course Project 2 | 3 | The goal of this project is to apply everything we learned 4 | in this course and build an end-to-end data pipeline. 5 | 6 | Remember that to pass the project, you must evaluate 3 peers. If you don't do that, your project can't be considered compelete. 7 | 8 | 9 | ### Submitting 10 | 11 | #### Project Cohort #2 12 | 13 | Project: 14 | 15 | * Form: https://forms.gle/JECXB9jYQ1vBXbsw6 16 | * Deadline: 2 May, 22:00 CET 17 | 18 | Peer reviewing: 19 | 20 | * Peer review assignments: [link](https://docs.google.com/spreadsheets/d/e/2PACX-1vShnv8T4iY_5NA8h0nySIS8Wzr-DZGGigEikIW4ZMSi9HlvhaEB4RhwmepVIuIUGaQHS90r5iHR2YXV/pubhtml?gid=964123374&single=true) 21 | * Form: https://forms.gle/Pb2fBwYLQ3GGFsaK6 22 | * Deadline: 9 May, 22:00 CET 23 | 24 | 25 | #### Project Cohort #1 26 | 27 | Project: 28 | 29 | * Form: https://forms.gle/6aeVcEVJipqR2BqC8 30 | * Deadline: 4 April, 22:00 CET 31 | 32 | Peer reviewing: 33 | 34 | * Peer review assignments: [link](https://docs.google.com/spreadsheets/d/e/2PACX-1vShnv8T4iY_5NA8h0nySIS8Wzr-DZGGigEikIW4ZMSi9HlvhaEB4RhwmepVIuIUGaQHS90r5iHR2YXV/pubhtml) 35 | * Form: https://forms.gle/AZ62bXMp4SGcVUmK7 36 | * Deadline: 11 April, 22:00 CET 37 | 38 | Project feedback: [link](https://docs.google.com/spreadsheets/d/e/2PACX-1vRcVCkO-jes5mbPAcikn9X_s2laJ1KhsO8aibHYQxxKqdCUYMVTEJLJQdM8C5aAUWKFl_0SJW4rme7H/pubhtml) 39 | -------------------------------------------------------------------------------- /cohorts/2022/week_1_basics_n_setup/homework.md: -------------------------------------------------------------------------------- 1 | ## Week 1 Homework 2 | 3 | In this homework we'll prepare the environment 4 | and practice with terraform and SQL 5 | 6 | 7 | ## Question 1. Google Cloud SDK 8 | 9 | Install Google Cloud SDK. What's the version you have? 10 | 11 | To get the version, run `gcloud --version` 12 | 13 | ## Google Cloud account 14 | 15 | Create an account in Google Cloud and create a project. 16 | 17 | 18 | ## Question 2. 
Terraform 19 | 20 | Now install terraform and go to the terraform directory (`week_1_basics_n_setup/1_terraform_gcp/terraform`) 21 | 22 | After that, run 23 | 24 | * `terraform init` 25 | * `terraform plan` 26 | * `terraform apply` 27 | 28 | Apply the plan and copy the output (after running `apply`) to the form. 29 | 30 | It should be the entire output - from the moment you typed `terraform init` to the very end. 31 | 32 | ## Prepare Postgres 33 | 34 | Run Postgres and load data as shown in the videos 35 | 36 | We'll use the yellow taxi trips from January 2021: 37 | 38 | ```bash 39 | wget https://s3.amazonaws.com/nyc-tlc/trip+data/yellow_tripdata_2021-01.csv 40 | ``` 41 | 42 | You will also need the dataset with zones: 43 | 44 | ```bash 45 | wget https://s3.amazonaws.com/nyc-tlc/misc/taxi+_zone_lookup.csv 46 | ``` 47 | 48 | Download this data and put it to Postgres 49 | 50 | ## Question 3. Count records 51 | 52 | How many taxi trips were there on January 15? 53 | 54 | Consider only trips that started on January 15. 55 | 56 | 57 | ## Question 4. Largest tip for each day 58 | 59 | Find the largest tip for each day. 60 | On which day it was the largest tip in January? 61 | 62 | Use the pick up time for your calculations. 63 | 64 | (note: it's not a typo, it's "tip", not "trip") 65 | 66 | 67 | ## Question 5. Most popular destination 68 | 69 | What was the most popular destination for passengers picked up 70 | in central park on January 14? 71 | 72 | Use the pick up time for your calculations. 73 | 74 | Enter the zone name (not id). If the zone name is unknown (missing), write "Unknown" 75 | 76 | 77 | ## Question 6. Most expensive locations 78 | 79 | What's the pickup-dropoff pair with the largest 80 | average price for a ride (calculated based on `total_amount`)? 81 | 82 | Enter two zone names separated by a slash 83 | 84 | For example: 85 | 86 | "Jamaica Bay / Clinton East" 87 | 88 | If any of the zone names are unknown (missing), write "Unknown". For example, "Unknown / Clinton East". 89 | 90 | 91 | ## Submitting the solutions 92 | 93 | * Form for submitting: https://forms.gle/yGQrkgRdVbiFs8Vd7 94 | * You can submit your homework multiple times. In this case, only the last submission will be used. 
95 | 96 | Deadline: 26 January (Wednesday), 22:00 CET 97 | 98 | 99 | ## Solution 100 | 101 | Here is the solution to questions 3-6: [video](https://www.youtube.com/watch?v=HxHqH2ARfxM&list=PL3MmuxUbc_hJed7dXYoJw8DoCuVHhGEQb) 102 | 103 | -------------------------------------------------------------------------------- /cohorts/2022/week_2_data_ingestion/airflow/.env_example: -------------------------------------------------------------------------------- 1 | # Custom 2 | COMPOSE_PROJECT_NAME=dtc-de 3 | GOOGLE_APPLICATION_CREDENTIALS=/.google/credentials/google_credentials.json 4 | AIRFLOW_CONN_GOOGLE_CLOUD_DEFAULT=google-cloud-platform://?extra__google_cloud_platform__key_path=/.google/credentials/google_credentials.json 5 | # AIRFLOW_UID= 6 | GCP_PROJECT_ID= 7 | GCP_GCS_BUCKET= 8 | 9 | # Postgres 10 | POSTGRES_USER=airflow 11 | POSTGRES_PASSWORD=airflow 12 | POSTGRES_DB=airflow 13 | 14 | # Airflow 15 | AIRFLOW__CORE__EXECUTOR=LocalExecutor 16 | AIRFLOW__SCHEDULER__SCHEDULER_HEARTBEAT_SEC=10 17 | 18 | AIRFLOW__CORE__SQL_ALCHEMY_CONN=postgresql+psycopg2://${POSTGRES_USER}:${POSTGRES_PASSWORD}@postgres:5432/${POSTGRES_DB} 19 | AIRFLOW_CONN_METADATA_DB=postgres+psycopg2://airflow:airflow@postgres:5432/airflow 20 | AIRFLOW_VAR__METADATA_DB_SCHEMA=airflow 21 | 22 | _AIRFLOW_WWW_USER_CREATE=True 23 | _AIRFLOW_WWW_USER_USERNAME=${_AIRFLOW_WWW_USER_USERNAME:airflow} 24 | _AIRFLOW_WWW_USER_PASSWORD=${_AIRFLOW_WWW_USER_PASSWORD:airflow} 25 | 26 | AIRFLOW__CORE__DAGS_ARE_PAUSED_AT_CREATION=True 27 | AIRFLOW__CORE__LOAD_EXAMPLES=False 28 | -------------------------------------------------------------------------------- /cohorts/2022/week_2_data_ingestion/airflow/Dockerfile: -------------------------------------------------------------------------------- 1 | # First-time build can take upto 10 mins. 2 | 3 | FROM apache/airflow:2.2.3 4 | 5 | ENV AIRFLOW_HOME=/opt/airflow 6 | 7 | USER root 8 | RUN apt-get update -qq && apt-get install vim -qqq 9 | # git gcc g++ -qqq 10 | 11 | COPY requirements.txt . 
12 | RUN pip install --no-cache-dir -r requirements.txt 13 | 14 | # Ref: https://airflow.apache.org/docs/docker-stack/recipes.html 15 | 16 | SHELL ["/bin/bash", "-o", "pipefail", "-e", "-u", "-x", "-c"] 17 | 18 | ARG CLOUD_SDK_VERSION=322.0.0 19 | ENV GCLOUD_HOME=/home/google-cloud-sdk 20 | 21 | ENV PATH="${GCLOUD_HOME}/bin/:${PATH}" 22 | 23 | RUN DOWNLOAD_URL="https://dl.google.com/dl/cloudsdk/channels/rapid/downloads/google-cloud-sdk-${CLOUD_SDK_VERSION}-linux-x86_64.tar.gz" \ 24 | && TMP_DIR="$(mktemp -d)" \ 25 | && curl -fL "${DOWNLOAD_URL}" --output "${TMP_DIR}/google-cloud-sdk.tar.gz" \ 26 | && mkdir -p "${GCLOUD_HOME}" \ 27 | && tar xzf "${TMP_DIR}/google-cloud-sdk.tar.gz" -C "${GCLOUD_HOME}" --strip-components=1 \ 28 | && "${GCLOUD_HOME}/install.sh" \ 29 | --bash-completion=false \ 30 | --path-update=false \ 31 | --usage-reporting=false \ 32 | --quiet \ 33 | && rm -rf "${TMP_DIR}" \ 34 | && gcloud --version 35 | 36 | WORKDIR $AIRFLOW_HOME 37 | 38 | COPY scripts scripts 39 | RUN chmod +x scripts 40 | 41 | USER $AIRFLOW_UID 42 | -------------------------------------------------------------------------------- /cohorts/2022/week_2_data_ingestion/airflow/dags_local/data_ingestion_local.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | from datetime import datetime 4 | 5 | from airflow import DAG 6 | 7 | from airflow.operators.bash import BashOperator 8 | from airflow.operators.python import PythonOperator 9 | 10 | from ingest_script import ingest_callable 11 | 12 | 13 | AIRFLOW_HOME = os.environ.get("AIRFLOW_HOME", "/opt/airflow/") 14 | 15 | 16 | PG_HOST = os.getenv('PG_HOST') 17 | PG_USER = os.getenv('PG_USER') 18 | PG_PASSWORD = os.getenv('PG_PASSWORD') 19 | PG_PORT = os.getenv('PG_PORT') 20 | PG_DATABASE = os.getenv('PG_DATABASE') 21 | 22 | 23 | local_workflow = DAG( 24 | "LocalIngestionDag", 25 | schedule_interval="0 6 2 * *", 26 | start_date=datetime(2021, 1, 1) 27 | ) 28 | 29 | 30 | URL_PREFIX = 'https://s3.amazonaws.com/nyc-tlc/trip+data' 31 | URL_TEMPLATE = URL_PREFIX + '/yellow_tripdata_{{ execution_date.strftime(\'%Y-%m\') }}.csv' 32 | OUTPUT_FILE_TEMPLATE = AIRFLOW_HOME + '/output_{{ execution_date.strftime(\'%Y-%m\') }}.csv' 33 | TABLE_NAME_TEMPLATE = 'yellow_taxi_{{ execution_date.strftime(\'%Y_%m\') }}' 34 | 35 | with local_workflow: 36 | wget_task = BashOperator( 37 | task_id='wget', 38 | bash_command=f'curl -sSL {URL_TEMPLATE} > {OUTPUT_FILE_TEMPLATE}' 39 | ) 40 | 41 | ingest_task = PythonOperator( 42 | task_id="ingest", 43 | python_callable=ingest_callable, 44 | op_kwargs=dict( 45 | user=PG_USER, 46 | password=PG_PASSWORD, 47 | host=PG_HOST, 48 | port=PG_PORT, 49 | db=PG_DATABASE, 50 | table_name=TABLE_NAME_TEMPLATE, 51 | csv_file=OUTPUT_FILE_TEMPLATE 52 | ), 53 | ) 54 | 55 | wget_task >> ingest_task -------------------------------------------------------------------------------- /cohorts/2022/week_2_data_ingestion/airflow/dags_local/ingest_script.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | from time import time 4 | 5 | import pandas as pd 6 | from sqlalchemy import create_engine 7 | 8 | 9 | def ingest_callable(user, password, host, port, db, table_name, csv_file, execution_date): 10 | print(table_name, csv_file, execution_date) 11 | 12 | engine = create_engine(f'postgresql://{user}:{password}@{host}:{port}/{db}') 13 | engine.connect() 14 | 15 | print('connection established successfully, inserting data...') 16 | 17 | t_start = time() 18 | df_iter = 
pd.read_csv(csv_file, iterator=True, chunksize=100000) 19 | 20 | df = next(df_iter) 21 | 22 | df.tpep_pickup_datetime = pd.to_datetime(df.tpep_pickup_datetime) 23 | df.tpep_dropoff_datetime = pd.to_datetime(df.tpep_dropoff_datetime) 24 | 25 | df.head(n=0).to_sql(name=table_name, con=engine, if_exists='replace') 26 | 27 | df.to_sql(name=table_name, con=engine, if_exists='append') 28 | 29 | t_end = time() 30 | print('inserted the first chunk, took %.3f second' % (t_end - t_start)) 31 | 32 | while True: 33 | t_start = time() 34 | 35 | try: 36 | df = next(df_iter) 37 | except StopIteration: 38 | print("completed") 39 | break 40 | 41 | df.tpep_pickup_datetime = pd.to_datetime(df.tpep_pickup_datetime) 42 | df.tpep_dropoff_datetime = pd.to_datetime(df.tpep_dropoff_datetime) 43 | 44 | df.to_sql(name=table_name, con=engine, if_exists='append') 45 | 46 | t_end = time() 47 | 48 | print('inserted another chunk, took %.3f second' % (t_end - t_start)) 49 | -------------------------------------------------------------------------------- /cohorts/2022/week_2_data_ingestion/airflow/docker-compose-nofrills.yml: -------------------------------------------------------------------------------- 1 | version: '3' 2 | services: 3 | postgres: 4 | image: postgres:13 5 | env_file: 6 | - .env 7 | volumes: 8 | - postgres-db-volume:/var/lib/postgresql/data 9 | healthcheck: 10 | test: ["CMD", "pg_isready", "-U", "airflow"] 11 | interval: 5s 12 | retries: 5 13 | restart: always 14 | 15 | scheduler: 16 | build: . 17 | command: scheduler 18 | restart: on-failure 19 | depends_on: 20 | - postgres 21 | env_file: 22 | - .env 23 | volumes: 24 | - ./dags:/opt/airflow/dags 25 | - ./logs:/opt/airflow/logs 26 | - ./plugins:/opt/airflow/plugins 27 | - ./scripts:/opt/airflow/scripts 28 | - ~/.google/credentials/:/.google/credentials 29 | 30 | 31 | webserver: 32 | build: . 33 | entrypoint: ./scripts/entrypoint.sh 34 | restart: on-failure 35 | depends_on: 36 | - postgres 37 | - scheduler 38 | env_file: 39 | - .env 40 | volumes: 41 | - ./dags:/opt/airflow/dags 42 | - ./logs:/opt/airflow/logs 43 | - ./plugins:/opt/airflow/plugins 44 | - ~/.google/credentials/:/.google/credentials:ro 45 | - ./scripts:/opt/airflow/scripts 46 | 47 | user: "${AIRFLOW_UID:-50000}:0" 48 | ports: 49 | - "8080:8080" 50 | healthcheck: 51 | test: [ "CMD-SHELL", "[ -f /home/airflow/airflow-webserver.pid ]" ] 52 | interval: 30s 53 | timeout: 30s 54 | retries: 3 55 | 56 | volumes: 57 | postgres-db-volume: -------------------------------------------------------------------------------- /cohorts/2022/week_2_data_ingestion/airflow/docs/1_concepts.md: -------------------------------------------------------------------------------- 1 | ## Airflow concepts 2 | 3 | 4 | ### Airflow architecture 5 | ![](arch-diag-airflow.png) 6 | 7 | Ref: https://airflow.apache.org/docs/apache-airflow/stable/concepts/overview.html 8 | 9 | * **Web server**: 10 | GUI to inspect, trigger and debug the behaviour of DAGs and tasks. 11 | Available at http://localhost:8080. 12 | 13 | * **Scheduler**: 14 | Responsible for scheduling jobs. Handles both triggering & scheduled workflows, submits Tasks to the executor to run, monitors all tasks and DAGs, and 15 | then triggers the task instances once their dependencies are complete. 16 | 17 | * **Worker**: 18 | This component executes the tasks given by the scheduler. 19 | 20 | * **Metadata database (postgres)**: 21 | Backend to the Airflow environment. Used by the scheduler, executor and webserver to store state. 
22 | 23 | * **Other components** (seen in docker-compose services): 24 | * `redis`: Message broker that forwards messages from scheduler to worker. 25 | * `flower`: The flower app for monitoring the environment. It is available at http://localhost:5555. 26 | * `airflow-init`: initialization service (customized as per this design) 27 | 28 | All these services allow you to run Airflow with CeleryExecutor. 29 | For more information, see [Architecture Overview](https://airflow.apache.org/docs/apache-airflow/stable/concepts/overview.html). 30 | 31 | 32 | ### Project Structure: 33 | 34 | * `./dags` - `DAG_FOLDER` for DAG files (use `./dags_local` for the local ingestion DAG) 35 | * `./logs` - contains logs from task execution and scheduler. 36 | * `./plugins` - for custom plugins 37 | 38 | 39 | ### Workflow components 40 | 41 | * `DAG`: Directed acyclic graph, specifies the dependencies between a set of tasks with explicit execution order, and has a beginning as well as an end. (Hence, “acyclic”) 42 | * `DAG Structure`: DAG Definition, Tasks (eg. Operators), Task Dependencies (control flow: `>>` or `<<` ) 43 | 44 | * `Task`: a defined unit of work (aka, operators in Airflow). The Tasks themselves describe what to do, be it fetching data, running analysis, triggering other systems, or more. 45 | * Common Types: Operators (used in this workshop), Sensors, TaskFlow decorators 46 | * Sub-classes of Airflow's BaseOperator 47 | 48 | * `DAG Run`: individual execution/run of a DAG 49 | * scheduled or triggered 50 | 51 | * `Task Instance`: an individual run of a single task. Task instances also have an indicative state, which could be “running”, “success”, “failed”, “skipped”, “up for retry”, etc. 52 | * Ideally, a task should flow from `none`, to `scheduled`, to `queued`, to `running`, and finally to `success`. 
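To tie these terms together, here is a minimal, illustrative DAG — a sketch only, not part of the workshop code (the real local-ingestion DAG in `dags_local/data_ingestion_local.py` follows the same pattern); the names `example_dag`, `download`, `process` and `say_hello` are made up for illustration:

```python
from datetime import datetime

from airflow import DAG
from airflow.operators.bash import BashOperator
from airflow.operators.python import PythonOperator


def say_hello():
    # Python callable executed by the second task
    print("hello from the second task")


# DAG definition: name, schedule and start date
with DAG("example_dag", schedule_interval="@daily", start_date=datetime(2021, 1, 1)) as dag:

    # Task 1: an operator that runs a shell command
    download = BashOperator(task_id="download", bash_command="echo downloading...")

    # Task 2: an operator that calls a Python function
    process = PythonOperator(task_id="process", python_callable=say_hello)

    # Task dependency (control flow): download runs before process
    download >> process
```

Each scheduled execution of this DAG is a DAG Run, and the `download` and `process` tasks inside that run are its Task Instances.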
53 | 54 | 55 | ### References 56 | 57 | https://airflow.apache.org/docs/apache-airflow/stable/concepts/dags.html 58 | 59 | https://airflow.apache.org/docs/apache-airflow/stable/concepts/tasks.html 60 | 61 | -------------------------------------------------------------------------------- /cohorts/2022/week_2_data_ingestion/airflow/docs/arch-diag-airflow.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/DataTalksClub/data-engineering-zoomcamp/76fed2419e917176369a0d3a2eeff6c936f87286/cohorts/2022/week_2_data_ingestion/airflow/docs/arch-diag-airflow.png -------------------------------------------------------------------------------- /cohorts/2022/week_2_data_ingestion/airflow/docs/gcs_ingestion_dag.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/DataTalksClub/data-engineering-zoomcamp/76fed2419e917176369a0d3a2eeff6c936f87286/cohorts/2022/week_2_data_ingestion/airflow/docs/gcs_ingestion_dag.png -------------------------------------------------------------------------------- /cohorts/2022/week_2_data_ingestion/airflow/extras/web_to_gcs.sh: -------------------------------------------------------------------------------- 1 | dataset_url=${dataset_url} 2 | dataset_file=${dataset_file} 3 | path_to_local_file=${path_to_local_file} 4 | path_to_creds=${path_to_creds} 5 | 6 | curl -sS "$dataset_url" > $path_to_local_file/$dataset_file 7 | gcloud auth activate-service-account --key-file=$path_to_creds 8 | gsutil -m cp $path_to_local_file/$dataset_file gs://$BUCKET 9 | -------------------------------------------------------------------------------- /cohorts/2022/week_2_data_ingestion/airflow/requirements.txt: -------------------------------------------------------------------------------- 1 | apache-airflow-providers-google 2 | pyarrow 3 | -------------------------------------------------------------------------------- /cohorts/2022/week_2_data_ingestion/airflow/scripts/entrypoint.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | export GOOGLE_APPLICATION_CREDENTIALS=${GOOGLE_APPLICATION_CREDENTIALS} 3 | export AIRFLOW_CONN_GOOGLE_CLOUD_DEFAULT=${AIRFLOW_CONN_GOOGLE_CLOUD_DEFAULT} 4 | 5 | airflow db upgrade 6 | 7 | airflow users create -r Admin -u admin -p admin -e admin@example.com -f admin -l airflow 8 | # "$_AIRFLOW_WWW_USER_USERNAME" -p "$_AIRFLOW_WWW_USER_PASSWORD" 9 | 10 | airflow webserver 11 | -------------------------------------------------------------------------------- /cohorts/2022/week_2_data_ingestion/transfer_service/README.md: -------------------------------------------------------------------------------- 1 | ## Generate AWS Access key 2 | - Login in to AWS account 3 | - Search for IAM 4 | ![aws iam](../../images/aws/iam.png) 5 | - Click on `Manage access key` 6 | - Click on `Create New Access Key` 7 | - Download the csv, your access key and secret would be in that csv (Please note that once lost secret cannot be recovered) 8 | 9 | ## Transfer service 10 | https://console.cloud.google.com/transfer/cloud/jobs 11 | 12 | 13 | -------------------------------------------------------------------------------- /cohorts/2022/week_3_data_warehouse/airflow/.env_example: -------------------------------------------------------------------------------- 1 | # Custom 2 | COMPOSE_PROJECT_NAME=dtc-de 3 | GOOGLE_APPLICATION_CREDENTIALS=/.google/credentials/google_credentials.json 4 | 
AIRFLOW_CONN_GOOGLE_CLOUD_DEFAULT=google-cloud-platform://?extra__google_cloud_platform__key_path=/.google/credentials/google_credentials.json 5 | # AIRFLOW_UID= 6 | GCP_PROJECT_ID= 7 | GCP_GCS_BUCKET= 8 | 9 | # Postgres 10 | POSTGRES_USER=airflow 11 | POSTGRES_PASSWORD=airflow 12 | POSTGRES_DB=airflow 13 | 14 | # Airflow 15 | AIRFLOW__CORE__EXECUTOR=LocalExecutor 16 | AIRFLOW__SCHEDULER__SCHEDULER_HEARTBEAT_SEC=10 17 | 18 | AIRFLOW__CORE__SQL_ALCHEMY_CONN=postgresql+psycopg2://${POSTGRES_USER}:${POSTGRES_PASSWORD}@postgres:5432/${POSTGRES_DB} 19 | AIRFLOW_CONN_METADATA_DB=postgres+psycopg2://airflow:airflow@postgres:5432/airflow 20 | AIRFLOW_VAR__METADATA_DB_SCHEMA=airflow 21 | 22 | _AIRFLOW_WWW_USER_CREATE=True 23 | _AIRFLOW_WWW_USER_USERNAME=${_AIRFLOW_WWW_USER_USERNAME:airflow} 24 | _AIRFLOW_WWW_USER_PASSWORD=${_AIRFLOW_WWW_USER_PASSWORD:airflow} 25 | 26 | AIRFLOW__CORE__DAGS_ARE_PAUSED_AT_CREATION=True 27 | AIRFLOW__CORE__LOAD_EXAMPLES=False 28 | -------------------------------------------------------------------------------- /cohorts/2022/week_3_data_warehouse/airflow/docker-compose-nofrills.yml: -------------------------------------------------------------------------------- 1 | version: '3' 2 | services: 3 | postgres: 4 | image: postgres:13 5 | env_file: 6 | - .env 7 | volumes: 8 | - postgres-db-volume:/var/lib/postgresql/data 9 | healthcheck: 10 | test: ["CMD", "pg_isready", "-U", "airflow"] 11 | interval: 5s 12 | retries: 5 13 | restart: always 14 | 15 | scheduler: 16 | build: . 17 | command: scheduler 18 | restart: on-failure 19 | depends_on: 20 | - postgres 21 | env_file: 22 | - .env 23 | volumes: 24 | - ./dags:/opt/airflow/dags 25 | - ./logs:/opt/airflow/logs 26 | - ./plugins:/opt/airflow/plugins 27 | - ./scripts:/opt/airflow/scripts 28 | - ~/.google/credentials/:/.google/credentials:ro 29 | 30 | 31 | webserver: 32 | build: . 
33 | entrypoint: ./scripts/entrypoint.sh 34 | restart: on-failure 35 | depends_on: 36 | - postgres 37 | - scheduler 38 | env_file: 39 | - .env 40 | volumes: 41 | - ./dags:/opt/airflow/dags 42 | - ./logs:/opt/airflow/logs 43 | - ./plugins:/opt/airflow/plugins 44 | - ~/.google/credentials/:/.google/credentials:ro 45 | - ./scripts:/opt/airflow/scripts 46 | 47 | user: "${AIRFLOW_UID:-50000}:0" 48 | ports: 49 | - "8080:8080" 50 | healthcheck: 51 | test: [ "CMD-SHELL", "[ -f /home/airflow/airflow-webserver.pid ]" ] 52 | interval: 30s 53 | timeout: 30s 54 | retries: 3 55 | 56 | volumes: 57 | postgres-db-volume: -------------------------------------------------------------------------------- /cohorts/2022/week_3_data_warehouse/airflow/docs/gcs_2_bq_dag_graph_view.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/DataTalksClub/data-engineering-zoomcamp/76fed2419e917176369a0d3a2eeff6c936f87286/cohorts/2022/week_3_data_warehouse/airflow/docs/gcs_2_bq_dag_graph_view.png -------------------------------------------------------------------------------- /cohorts/2022/week_3_data_warehouse/airflow/docs/gcs_2_bq_dag_tree_view.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/DataTalksClub/data-engineering-zoomcamp/76fed2419e917176369a0d3a2eeff6c936f87286/cohorts/2022/week_3_data_warehouse/airflow/docs/gcs_2_bq_dag_tree_view.png -------------------------------------------------------------------------------- /cohorts/2022/week_3_data_warehouse/airflow/scripts/entrypoint.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | export GOOGLE_APPLICATION_CREDENTIALS=${GOOGLE_APPLICATION_CREDENTIALS} 3 | export AIRFLOW_CONN_GOOGLE_CLOUD_DEFAULT=${AIRFLOW_CONN_GOOGLE_CLOUD_DEFAULT} 4 | 5 | airflow db upgrade 6 | 7 | airflow users create -r Admin -u admin -p admin -e admin@example.com -f admin -l airflow 8 | # "$_AIRFLOW_WWW_USER_USERNAME" -p "$_AIRFLOW_WWW_USER_PASSWORD" 9 | 10 | airflow webserver 11 | -------------------------------------------------------------------------------- /cohorts/2022/week_5_batch_processing/homework.md: -------------------------------------------------------------------------------- 1 | ## Week 5 Homework 2 | 3 | In this homework we'll put what we learned about Spark 4 | in practice. 5 | 6 | We'll use high volume for-hire vehicles (HVFHV) dataset for that. 7 | 8 | ## Question 1. Install Spark and PySpark 9 | 10 | * Install Spark 11 | * Run PySpark 12 | * Create a local spark session 13 | * Execute `spark.version` 14 | 15 | What's the output? 16 | 17 | 18 | ## Question 2. HVFHW February 2021 19 | 20 | Download the HVFHV data for february 2021: 21 | 22 | ```bash 23 | wget https://nyc-tlc.s3.amazonaws.com/trip+data/fhvhv_tripdata_2021-02.csv 24 | ``` 25 | 26 | Read it with Spark using the same schema as we did 27 | in the lessons. We will use this dataset for all 28 | the remaining questions. 29 | 30 | Repartition it to 24 partitions and save it to 31 | parquet. 32 | 33 | What's the size of the folder with results (in MB)? 34 | 35 | 36 | ## Question 3. Count records 37 | 38 | How many taxi trips were there on February 15? 39 | 40 | Consider only trips that started on February 15. 41 | 42 | 43 | ## Question 4. Longest trip for each day 44 | 45 | Now calculate the duration for each trip. 46 | 47 | Trip starting on which day was the longest? 48 | 49 | 50 | ## Question 5. 
Most frequent `dispatching_base_num` 51 | 52 | Now find the most frequently occurring `dispatching_base_num` 53 | in this dataset. 54 | 55 | How many stages this spark job has? 56 | 57 | > Note: the answer may depend on how you write the query, 58 | > so there are multiple correct answers. 59 | > Select the one you have. 60 | 61 | 62 | ## Question 6. Most common locations pair 63 | 64 | Find the most common pickup-dropoff pair. 65 | 66 | For example: 67 | 68 | "Jamaica Bay / Clinton East" 69 | 70 | Enter two zone names separated by a slash 71 | 72 | If any of the zone names are unknown (missing), use "Unknown". For example, "Unknown / Clinton East". 73 | 74 | 75 | ## Bonus question. Join type 76 | 77 | (not graded) 78 | 79 | For finding the answer to Q6, you'll need to perform a join. 80 | 81 | What type of join is it? 82 | 83 | And how many stages your spark job has? 84 | 85 | 86 | ## Submitting the solutions 87 | 88 | * Form for submitting: https://forms.gle/dBkVK9yT8cSMDwuw7 89 | * You can submit your homework multiple times. In this case, only the last submission will be used. 90 | 91 | Deadline: 07 March (Monday), 22:00 CET 92 | -------------------------------------------------------------------------------- /cohorts/2022/week_6_stream_processing/homework.md: -------------------------------------------------------------------------------- 1 | ## Week 6 Homework 2 | [Form](https://forms.gle/mSzfpPCXskWCabeu5) 3 | 4 | The homework is mostly theoretical. In the last question you have to provide working code link, please keep in mind that this 5 | question is not scored. 6 | 7 | Deadline: 14 March, 22:00 CET -------------------------------------------------------------------------------- /cohorts/2023/README.md: -------------------------------------------------------------------------------- 1 | ## Data Engineering Zoomcamp 2023 Cohort 2 | 3 | * [Launch stream with course overview](https://www.youtube.com/watch?v=-zpVha7bw5A) 4 | * [Course Google calendar](https://calendar.google.com/calendar/?cid=ZXIxcjA1M3ZlYjJpcXU0dTFmaG02MzVxMG9AZ3JvdXAuY2FsZW5kYXIuZ29vZ2xlLmNvbQ) 5 | * [FAQ](https://docs.google.com/document/d/19bnYs80DwuUimHM65UV3sylsCn2j1vziPOwzBwQrebw/edit?usp=sharing) 6 | * [Public Leaderboard](leaderboard.md) and [Private Leaderboard](https://docs.google.com/spreadsheets/d/e/2PACX-1vTbL00GcdQp0bJt9wf1ROltMq7s3qyxl-NYF7Pvk79Jfxgwfn9dNWmPD_yJHTDq_Wzvps8EIr6cOKWm/pubhtml) 7 | * [Course Playlist: Only 2023 Live videos & homeworks](https://www.youtube.com/playlist?list=PL3MmuxUbc_hJjEePXIdE-LVUx_1ZZjYGW) 8 | 9 | [**Week 1: Introduction & Prerequisites**](week_1_docker_sql/) 10 | 11 | * [Homework SQL](week_1_docker_sql/homework.md) and [solution](https://www.youtube.com/watch?v=KIh_9tZiroA) 12 | * [Homework Terraform](week_1_terraform/homework.md) 13 | * [Office hours](https://www.youtube.com/watch?v=RVTryVvSyw4&list=PL3MmuxUbc_hJjEePXIdE-LVUx_1ZZjYGW) 14 | 15 | [**Week 2: Workflow Orchestration**](week_2_workflow_orchestration) 16 | 17 | * [Homework](week_2_workflow_orchestration/homework.md) 18 | * [Office hours part 1](https://www.youtube.com/watch?v=a_nmLHb8hzw&list=PL3MmuxUbc_hJjEePXIdE-LVUx_1ZZjYGW) and [part 2](https://www.youtube.com/watch?v=PK8yyMY54Vk&list=PL3MmuxUbc_hJjEePXIdE-LVUx_1ZZjYGW&index=7) 19 | 20 | [**Week 3: Data Warehouse**](week_3_data_warehouse) 21 | 22 | * [Homework](week_3_data_warehouse/homework.md) 23 | * [Office hours](https://www.youtube.com/watch?v=QXfmtJp3bXE&list=PL3MmuxUbc_hJjEePXIdE-LVUx_1ZZjYGW) 24 | 25 | [**Week 4: Analytics 
Engineering**](week_4_analytics_engineering/) 26 | 27 | * [Homework](week_4_analytics_engineering/homework.md) 28 | * [PipeRider + dbt Workshop](workshops/piperider.md) 29 | * [Office hours](https://www.youtube.com/watch?v=ODYg_r72qaE&list=PL3MmuxUbc_hJjEePXIdE-LVUx_1ZZjYGW) 30 | 31 | [**Week 5: Batch processing**](week_5_batch_processing/) 32 | 33 | * [Homework](week_5_batch_processing/homework.md) 34 | * [Office hours](https://www.youtube.com/watch?v=5_69yL2PPYI&list=PL3MmuxUbc_hJjEePXIdE-LVUx_1ZZjYGW) 35 | 36 | [**Week 6: Stream Processing**](week_6_stream_processing) 37 | 38 | * [Homework](week_6_stream_processing/homework.md) 39 | 40 | 41 | [**Week 7, 8 & 9: Project**](project.md) 42 | 43 | More information [here](project.md) 44 | -------------------------------------------------------------------------------- /cohorts/2023/project.md: -------------------------------------------------------------------------------- 1 | ## Course Project 2 | 3 | The goal of this project is to apply everything we learned 4 | in this course and build an end-to-end data pipeline. 5 | 6 | You will have two attempts to submit your project. If you don't have 7 | time to submit your project by the end of attempt #1 (you started the 8 | course late, you have vacation plans, life/work got in the way, etc.) 9 | or you fail your first attempt, 10 | then you will have a second chance to submit your project as attempt 11 | #2. 12 | 13 | There are only two attempts. 14 | 15 | Remember that to pass the project, you must evaluate 3 peers. If you don't do that, 16 | your project can't be considered complete. 17 | 18 | To find the projects assigned to you, use the peer review assignments link 19 | and find your hash in the first column. You will see three rows: you need to evaluate 20 | each of these projects. For each project, you need to submit the form once, 21 | so in total, you will make three submissions. 
22 | 23 | 24 | ### Submitting 25 | 26 | #### Project Attempt #1 27 | 28 | Project: 29 | 30 | * Form: https://forms.gle/zTJiVYSmCgsENj6y8 31 | * Deadline: 10 April, 22:00 CET 32 | 33 | Peer reviewing: 34 | 35 | * Peer review assignments: [link](https://docs.google.com/spreadsheets/d/e/2PACX-1vRYQ0A9C7AkRK-YPSFhqaRMmuPR97QPfl2PjI8n11l5jntc6YMHIJXVVS0GQNqAYIGwzyevyManDB08/pubhtml?gid=0&single=true) ("project-01" sheet) 36 | * Form: https://forms.gle/1bxmgR8yPwV359zb7 37 | * Deadline: 17 April, 22:00 CET 38 | 39 | Project feedback: [link](https://docs.google.com/spreadsheets/d/e/2PACX-1vQuMt9m1XlPrCACqnsFTXTV_KGiSnsl9UjL7kdTMsLJ8DLu3jNJlPzoUKG6baxc8APeEQ8RaSP1U2VX/pubhtml?gid=27207346&single=true) ("project-01" sheet) 40 | 41 | #### Project Attempt #2 42 | 43 | Project: 44 | 45 | * Form: https://forms.gle/gCXUSYBm1KgMKXVm8 46 | * Deadline: 4 May, 22:00 CET 47 | 48 | Peer reviewing: 49 | 50 | * Peer review assignments: [link](https://docs.google.com/spreadsheets/d/e/2PACX-1vRYQ0A9C7AkRK-YPSFhqaRMmuPR97QPfl2PjI8n11l5jntc6YMHIJXVVS0GQNqAYIGwzyevyManDB08/pubhtml?gid=303437788&single=true) ("project-02" sheet) 51 | * Form: https://forms.gle/2x5MT4xxczR8isy37 52 | * Deadline: 11 May, 22:00 CET 53 | 54 | Project feedback: [link](https://docs.google.com/spreadsheets/d/e/2PACX-1vQuMt9m1XlPrCACqnsFTXTV_KGiSnsl9UjL7kdTMsLJ8DLu3jNJlPzoUKG6baxc8APeEQ8RaSP1U2VX/pubhtml?gid=246029638&single=true) 55 | 56 | ### Evaluation criteria 57 | 58 | See [here](../../week_7_project/README.md) 59 | 60 | 61 | ### Misc 62 | 63 | To get the hash for your project, use this function to hash your email: 64 | 65 | ```python 66 | from hashlib import sha1 67 | 68 | def compute_hash(email): 69 | return sha1(email.lower().encode('utf-8')).hexdigest() 70 | ``` 71 | 72 | Or use [this website](http://www.sha1-online.com/). 73 | -------------------------------------------------------------------------------- /cohorts/2023/week_1_docker_sql/homework.md: -------------------------------------------------------------------------------- 1 | ## Week 1 Homework 2 | 3 | In this homework we'll prepare the environment 4 | and practice with Docker and SQL 5 | 6 | 7 | ## Question 1. Knowing docker tags 8 | 9 | Run the command to get information on Docker 10 | 11 | ```docker --help``` 12 | 13 | Now run the command to get help on the "docker build" command 14 | 15 | Which tag has the following text? - *Write the image ID to the file* 16 | 17 | - `--imageid string` 18 | - `--iidfile string` 19 | - `--idimage string` 20 | - `--idfile string` 21 | 22 | 23 | ## Question 2. Understanding docker first run 24 | 25 | Run docker with the python:3.9 image in an interactive mode and the entrypoint of bash. 26 | Now check the python modules that are installed ( use pip list). 27 | How many python packages/modules are installed? 28 | 29 | - 1 30 | - 6 31 | - 3 32 | - 7 33 | 34 | # Prepare Postgres 35 | 36 | Run Postgres and load data as shown in the videos 37 | We'll use the green taxi trips from January 2019: 38 | 39 | ```wget https://github.com/DataTalksClub/nyc-tlc-data/releases/download/green/green_tripdata_2019-01.csv.gz``` 40 | 41 | You will also need the dataset with zones: 42 | 43 | ```wget https://s3.amazonaws.com/nyc-tlc/misc/taxi+_zone_lookup.csv``` 44 | 45 | Download this data and put it into Postgres (with jupyter notebooks or with a pipeline) 46 | 47 | 48 | ## Question 3. Count records 49 | 50 | How many taxi trips were totally made on January 15? 51 | 52 | Tip: started and finished on 2019-01-15. 
53 | 54 | Remember that the `lpep_pickup_datetime` and `lpep_dropoff_datetime` columns are timestamps (date plus hour, minute and second), not plain dates. 55 | 56 | - 20689 57 | - 20530 58 | - 17630 59 | - 21090 60 | 61 | ## Question 4. Largest trip for each day 62 | 63 | Which day had the largest trip distance? 64 | Use the pickup time for your calculations. 65 | 66 | - 2019-01-18 67 | - 2019-01-28 68 | - 2019-01-15 69 | - 2019-01-10 70 | 71 | ## Question 5. The number of passengers 72 | 73 | On 2019-01-01, how many trips had 2 passengers, and how many had 3? 74 | 75 | - 2: 1282 ; 3: 266 76 | - 2: 1532 ; 3: 126 77 | - 2: 1282 ; 3: 254 78 | - 2: 1282 ; 3: 274 79 | 80 | 81 | ## Question 6. Largest tip 82 | 83 | For passengers picked up in the Astoria Zone, which drop-off zone had the largest tip? 84 | We want the name of the zone, not the ID. 85 | 86 | Note: it's not a typo, it's `tip`, not `trip`. 87 | 88 | - Central Park 89 | - Jamaica 90 | - South Ozone Park 91 | - Long Island City/Queens Plaza 92 | 93 | 94 | ## Submitting the solutions 95 | 96 | * Form for submitting: [form](https://forms.gle/EjphSkR1b3nsdojv7) 97 | * You can submit your homework multiple times. In this case, only the last submission will be used. 98 | 99 | Deadline: 30 January (Monday), 22:00 CET 100 | 101 | 102 | ## Solution 103 | 104 | See here: https://www.youtube.com/watch?v=KIh_9tZiroA 105 | -------------------------------------------------------------------------------- /cohorts/2023/week_1_terraform/homework.md: -------------------------------------------------------------------------------- 1 | ## Week 1 Homework 2 | 3 | In this homework we'll prepare the environment by creating resources in GCP with Terraform. 4 | 5 | In your VM on GCP, install Terraform. Copy the files from the course repo 6 | [here](https://github.com/DataTalksClub/data-engineering-zoomcamp/tree/main/week_1_basics_n_setup/1_terraform_gcp/terraform) to your VM. 7 | 8 | Modify the files as necessary to create a GCP Bucket and a BigQuery Dataset. 9 | 10 | 11 | ## Question 1. Creating Resources 12 | 13 | After updating the main.tf and variable.tf files, run: 14 | 15 | ``` 16 | terraform apply 17 | ``` 18 | 19 | Paste the output of this command into the homework submission form. 20 | 21 | 22 | ## Submitting the solutions 23 | 24 | * Form for submitting: [form](https://forms.gle/S57Xs3HL9nB3YTzj9) 25 | * You can submit your homework multiple times. In this case, only the last submission will be used.
26 | 27 | Deadline: 30 January (Monday), 22:00 CET 28 | 29 | -------------------------------------------------------------------------------- /cohorts/2023/week_6_stream_processing/client.properties: -------------------------------------------------------------------------------- 1 | # Required connection configs for Kafka producer, consumer, and admin 2 | bootstrap.servers=:9092 3 | security.protocol=SASL_SSL 4 | sasl.mechanisms=PLAIN 5 | sasl.username= 6 | sasl.password= 7 | 8 | # Best practice for higher availability in librdkafka clients prior to 1.7 9 | session.timeout.ms=45000 -------------------------------------------------------------------------------- /cohorts/2023/week_6_stream_processing/producer_confluent.py: -------------------------------------------------------------------------------- 1 | from confluent_kafka import Producer 2 | 3 | import argparse 4 | import csv 5 | from typing import Dict 6 | from time import sleep 7 | 8 | from settings import CONFLUENT_CLOUD_CONFIG, \ 9 | GREEN_TAXI_TOPIC, FHV_TAXI_TOPIC, \ 10 | GREEN_TRIP_DATA_PATH, FHV_TRIP_DATA_PATH 11 | 12 | 13 | class RideCSVProducer: 14 | def __init__(self, probs: Dict, ride_type: str): 15 | 16 | self.producer = Producer(**probs) 17 | self.ride_type = ride_type 18 | 19 | def parse_row(self, row): 20 | if self.ride_type == 'green': 21 | record = f'{row[5]}, {row[6]}' # PULocationID, DOLocationID 22 | key = str(row[0]) # vendor_id 23 | elif self.ride_type == 'fhv': 24 | record = f'{row[3]}, {row[4]}' # PULocationID, DOLocationID, 25 | key = str(row[0]) # dispatching_base_num 26 | return key, record 27 | 28 | def read_records(self, resource_path: str): 29 | records, ride_keys = [], [] 30 | with open(resource_path, 'r') as f: 31 | reader = csv.reader(f) 32 | header = next(reader) # skip the header 33 | for row in reader: 34 | key, record = self.parse_row(row) 35 | ride_keys.append(key) 36 | records.append(record) 37 | return zip(ride_keys, records) 38 | 39 | def publish(self, records: [str, str], topic: str): 40 | for key_value in records: 41 | key, value = key_value 42 | try: 43 | self.producer.poll(0) 44 | self.producer.produce(topic=topic, key=key, value=value) 45 | print(f"Producing record for ") 46 | except KeyboardInterrupt: 47 | break 48 | except BufferError as bfer: 49 | self.producer.poll(0.1) 50 | except Exception as e: 51 | print(f"Exception while producing record - {value}: {e}") 52 | 53 | self.producer.flush() 54 | sleep(10) 55 | 56 | 57 | if __name__ == "__main__": 58 | parser = argparse.ArgumentParser(description='Kafka Consumer') 59 | parser.add_argument('--type', type=str, default='green') 60 | args = parser.parse_args() 61 | 62 | if args.type == 'green': 63 | kafka_topic = GREEN_TAXI_TOPIC 64 | data_path = GREEN_TRIP_DATA_PATH 65 | elif args.type == 'fhv': 66 | kafka_topic = FHV_TAXI_TOPIC 67 | data_path = FHV_TRIP_DATA_PATH 68 | 69 | producer = RideCSVProducer(ride_type=args.type, probs=CONFLUENT_CLOUD_CONFIG) 70 | ride_records = producer.read_records(resource_path=data_path) 71 | producer.publish(records=ride_records, topic=kafka_topic) 72 | -------------------------------------------------------------------------------- /cohorts/2023/week_6_stream_processing/settings.py: -------------------------------------------------------------------------------- 1 | import pyspark.sql.types as T 2 | 3 | GREEN_TRIP_DATA_PATH = './resources/green_tripdata/green_tripdata_2019-01.csv' 4 | FHV_TRIP_DATA_PATH = './resources/fhv_tripdata/fhv_tripdata_2019-01.csv' 5 | BOOTSTRAP_SERVERS = 'localhost:9092' 6 | 7 | 
RIDES_TOPIC = 'all_rides' 8 | FHV_TAXI_TOPIC = 'fhv_taxi_rides' 9 | GREEN_TAXI_TOPIC = 'green_taxi_rides' 10 | 11 | ALL_RIDE_SCHEMA = T.StructType( 12 | [T.StructField("PUlocationID", T.StringType()), 13 | T.StructField("DOlocationID", T.StringType()), 14 | ]) 15 | 16 | 17 | def read_ccloud_config(config_file): 18 | conf = {} 19 | with open(config_file) as fh: 20 | for line in fh: 21 | line = line.strip() 22 | if len(line) != 0 and line[0] != "#": 23 | parameter, value = line.strip().split('=', 1) 24 | conf[parameter] = value.strip() 25 | return conf 26 | 27 | 28 | CONFLUENT_CLOUD_CONFIG = read_ccloud_config('client_original.properties') 29 | -------------------------------------------------------------------------------- /cohorts/2023/week_6_stream_processing/spark-submit.sh: -------------------------------------------------------------------------------- 1 | # Submit Python code to SparkMaster 2 | 3 | if [ $# -lt 1 ] 4 | then 5 | echo "Usage: $0 [ executor-memory ]" 6 | echo "(specify memory in string format such as \"512M\" or \"2G\")" 7 | exit 1 8 | fi 9 | PYTHON_JOB=$1 10 | 11 | if [ -z $2 ] 12 | then 13 | EXEC_MEM="1G" 14 | else 15 | EXEC_MEM=$2 16 | fi 17 | spark-submit --master spark://localhost:7077 --num-executors 2 \ 18 | --executor-memory $EXEC_MEM --executor-cores 1 \ 19 | --packages org.apache.spark:spark-sql-kafka-0-10_2.12:3.3.1,org.apache.spark:spark-avro_2.12:3.3.1,org.apache.spark:spark-streaming-kafka-0-10_2.12:3.3.1 \ 20 | $PYTHON_JOB -------------------------------------------------------------------------------- /cohorts/2023/workshops/piperider.md: -------------------------------------------------------------------------------- 1 | 2 | ## Workshop: Maximizing Confidence in Your Data Model Changes with dbt and PipeRider 3 | 4 | To learn how to use PipeRider together with dbt for detecting changes in model and data, sign up for a workshop 5 | 6 | - Video: https://www.youtube.com/watch?v=O-tyUOQccSs 7 | - Repository: https://github.com/InfuseAI/taxi_rides_ny_duckdb 8 | 9 | 10 | ## Homework 11 | 12 | The following questions follow on from the original Week 4 homework, and so use the same data as required by those questions: 13 | 14 | https://github.com/DataTalksClub/data-engineering-zoomcamp/blob/main/cohorts/2023/week_4_analytics_engineering/homework.md 15 | 16 | Yellow taxi data - Years 2019 and 2020 17 | Green taxi data - Years 2019 and 2020 18 | fhv data - Year 2019. 19 | 20 | ### Question 1: 21 | 22 | What is the distribution between vendor id filtering by years 2019 and 2020 data? 23 | 24 | You will need to run PipeRider and check the report 25 | 26 | * 70.1/29.6/0.5 27 | * 60.1/39.5/0.4 28 | * 90.2/9.5/0.3 29 | * 80.1/19.7/0.2 30 | 31 | ### Question 2: 32 | 33 | What is the composition of total amount (positive/zero/negative) filtering by years 2019 and 2020 data? 34 | 35 | You will need to run PipeRider and check the report 36 | 37 | 38 | * 51.4M/15K/48.6K 39 | * 21.4M/5K/248.6K 40 | * 61.4M/25K/148.6K 41 | * 81.4M/35K/14.6K 42 | 43 | ### Question 3: 44 | 45 | What is the numeric statistics (average/standard deviation/min/max/sum) of trip distances filtering by years 2019 and 2020 data? 
46 | 47 | You will need to run PipeRider and check the report 48 | 49 | 50 | * 1.95/35.43/0/16.3K/151.5M 51 | * 3.95/25.43/23.88/267.3K/281.5M 52 | * 5.95/75.43/-63.88/67.3K/81.5M 53 | * 2.95/35.43/-23.88/167.3K/181.5M 54 | 55 | 56 | 57 | ## Submitting the solutions 58 | 59 | * Form for submitting: https://forms.gle/WyLQHBu1DNwNTfqe8 60 | * You can submit your homework multiple times. In this case, only the last submission will be used. 61 | 62 | Deadline: 20 March, 22:00 CET 63 | 64 | 65 | ## Solution 66 | 67 | Video: https://www.youtube.com/watch?v=inNrUys7W8U&list=PL3MmuxUbc_hJjEePXIdE-LVUx_1ZZjYGW 68 | -------------------------------------------------------------------------------- /cohorts/2024/05-batch/homework.md: -------------------------------------------------------------------------------- 1 | ## Module 5 Homework 2 | 3 | Solution: https://www.youtube.com/watch?v=YtddC7vJOgQ 4 | 5 | In this homework we'll put what we learned about Spark in practice. 6 | 7 | For this homework we will be using the FHV 2019-10 data found here. [FHV Data](https://github.com/DataTalksClub/nyc-tlc-data/releases/download/fhv/fhv_tripdata_2019-10.csv.gz) 8 | 9 | ### Question 1: 10 | 11 | **Install Spark and PySpark** 12 | 13 | - Install Spark 14 | - Run PySpark 15 | - Create a local spark session 16 | - Execute spark.version. 17 | 18 | What's the output? 19 | 20 | > [!NOTE] 21 | > To install PySpark follow this [guide](https://github.com/DataTalksClub/data-engineering-zoomcamp/blob/main/05-batch/setup/pyspark.md) 22 | 23 | ### Question 2: 24 | 25 | **FHV October 2019** 26 | 27 | Read the October 2019 FHV into a Spark Dataframe with a schema as we did in the lessons. 28 | 29 | Repartition the Dataframe to 6 partitions and save it to parquet. 30 | 31 | What is the average size of the Parquet (ending with .parquet extension) Files that were created (in MB)? Select the answer which most closely matches. 32 | 33 | - 1MB 34 | - 6MB 35 | - 25MB 36 | - 87MB 37 | 38 | 39 | 40 | ### Question 3: 41 | 42 | **Count records** 43 | 44 | How many taxi trips were there on the 15th of October? 45 | 46 | Consider only trips that started on the 15th of October. 47 | 48 | - 108,164 49 | - 12,856 50 | - 452,470 51 | - 62,610 52 | 53 | > [!IMPORTANT] 54 | > Be aware of columns order when defining schema 55 | 56 | ### Question 4: 57 | 58 | **Longest trip for each day** 59 | 60 | What is the length of the longest trip in the dataset in hours? 61 | 62 | - 631,152.50 Hours 63 | - 243.44 Hours 64 | - 7.68 Hours 65 | - 3.32 Hours 66 | 67 | 68 | 69 | ### Question 5: 70 | 71 | **User Interface** 72 | 73 | Spark’s User Interface which shows the application's dashboard runs on which local port? 74 | 75 | - 80 76 | - 443 77 | - 4040 78 | - 8080 79 | 80 | 81 | 82 | ### Question 6: 83 | 84 | **Least frequent pickup location zone** 85 | 86 | Load the zone lookup data into a temp view in Spark
87 | [Zone Data](https://github.com/DataTalksClub/nyc-tlc-data/releases/download/misc/taxi_zone_lookup.csv) 88 | 89 | Using the zone lookup data and the FHV October 2019 data, what is the name of the LEAST frequent pickup location Zone?
90 | 91 | - East Chelsea 92 | - Jamaica Bay 93 | - Union Sq 94 | - Crown Heights North 95 | 96 | 97 | ## Submitting the solutions 98 | 99 | - Form for submitting: https://courses.datatalks.club/de-zoomcamp-2024/homework/hw5 100 | - Deadline: See the website 101 | -------------------------------------------------------------------------------- /cohorts/2024/06-streaming/docker-compose.yml: -------------------------------------------------------------------------------- 1 | version: '3.7' 2 | services: 3 | # Redpanda cluster 4 | redpanda-1: 5 | image: docker.redpanda.com/vectorized/redpanda:v22.3.5 6 | container_name: redpanda-1 7 | command: 8 | - redpanda 9 | - start 10 | - --smp 11 | - '1' 12 | - --reserve-memory 13 | - 0M 14 | - --overprovisioned 15 | - --node-id 16 | - '1' 17 | - --kafka-addr 18 | - PLAINTEXT://0.0.0.0:29092,OUTSIDE://0.0.0.0:9092 19 | - --advertise-kafka-addr 20 | - PLAINTEXT://redpanda-1:29092,OUTSIDE://localhost:9092 21 | - --pandaproxy-addr 22 | - PLAINTEXT://0.0.0.0:28082,OUTSIDE://0.0.0.0:8082 23 | - --advertise-pandaproxy-addr 24 | - PLAINTEXT://redpanda-1:28082,OUTSIDE://localhost:8082 25 | - --rpc-addr 26 | - 0.0.0.0:33145 27 | - --advertise-rpc-addr 28 | - redpanda-1:33145 29 | ports: 30 | # - 8081:8081 31 | - 8082:8082 32 | - 9092:9092 33 | - 28082:28082 34 | - 29092:29092 -------------------------------------------------------------------------------- /cohorts/2024/README.md: -------------------------------------------------------------------------------- 1 | ## Data Engineering Zoomcamp 2024 Cohort 2 | 3 | * [Pre-launch Q&A stream](https://www.youtube.com/watch?v=91b8u9GmqB4) 4 | * [Launch stream with course overview](https://www.youtube.com/live/AtRhA-NfS24?si=5JzA_E8BmJjiLi8l) 5 | * [Deadline calendar](https://docs.google.com/spreadsheets/d/e/2PACX-1vQACMLuutV5rvXg5qICuJGL-yZqIV0FBD84CxPdC5eZHf8TfzB-CJT_3Mo7U7oGVTXmSihPgQxuuoku/pubhtml) 6 | * [Course Google calendar](https://calendar.google.com/calendar/?cid=ZXIxcjA1M3ZlYjJpcXU0dTFmaG02MzVxMG9AZ3JvdXAuY2FsZW5kYXIuZ29vZ2xlLmNvbQ) 7 | * [FAQ](https://docs.google.com/document/d/19bnYs80DwuUimHM65UV3sylsCn2j1vziPOwzBwQrebw/edit?usp=sharing) 8 | * Course Playlist: Only 2024 Live videos & homeworks (TODO) 9 | * [Public Leaderboard of Top-100 Participants](leaderboard.md) 10 | 11 | 12 | [**Module 1: Introduction & Prerequisites**](01-docker-terraform/) 13 | 14 | * [Homework](01-docker-terraform/homework.md) 15 | 16 | 17 | [**Module 2: Workflow Orchestration**](02-workflow-orchestration) 18 | 19 | * [Homework](02-workflow-orchestration/homework.md) 20 | * Office hours 21 | 22 | [**Workshop 1: Data Ingestion**](workshops/dlt.md) 23 | 24 | * Workshop with dlt 25 | * [Homework](workshops/dlt.md) 26 | 27 | 28 | [**Module 3: Data Warehouse**](03-data-warehouse) 29 | 30 | * [Homework](03-data-warehouse/homework.md) 31 | 32 | 33 | [**Module 4: Analytics Engineering**](04-analytics-engineering/) 34 | 35 | * [Homework](04-analytics-engineering/homework.md) 36 | 37 | 38 | [**Module 5: Batch processing**](05-batch/) 39 | 40 | * [Homework](05-batch/homework.md) 41 | 42 | 43 | [**Module 6: Stream Processing**](06-streaming) 44 | 45 | * [Homework](06-streaming/homework.md) 46 | 47 | 48 | [**Project**](project.md) 49 | 50 | More information [here](project.md) 51 | -------------------------------------------------------------------------------- /cohorts/2024/project.md: -------------------------------------------------------------------------------- 1 | ## Course Project 2 | 3 | The goal of this project is to apply everything we 
learned 4 | in this course and build an end-to-end data pipeline. 5 | 6 | You will have two attempts to submit your project. If you don't have 7 | time to submit your project by the end of attempt #1 (you started the 8 | course late, you have vacation plans, life/work got in the way, etc.) 9 | or you fail your first attempt, 10 | then you will have a second chance to submit your project as attempt 11 | #2. 12 | 13 | There are only two attempts. 14 | 15 | Remember that to pass the project, you must evaluate 3 peers. If you don't do that, 16 | your project can't be considered complete. 17 | 18 | To find the projects assigned to you, use the peer review assignments link 19 | and find your hash in the first column. You will see three rows: you need to evaluate 20 | each of these projects. For each project, you need to submit the form once, 21 | so in total, you will make three submissions. 22 | 23 | 24 | ### Submitting 25 | 26 | #### Project Attempt #1 27 | 28 | * Project: https://courses.datatalks.club/de-zoomcamp-2024/project/project1 29 | * Review: https://courses.datatalks.club/de-zoomcamp-2024/project/project1/eval 30 | 31 | #### Project Attempt #2 32 | 33 | * Project: https://courses.datatalks.club/de-zoomcamp-2024/project/project2 34 | * Review: https://courses.datatalks.club/de-zoomcamp-2024/project/project2/eval 35 | 36 | > **Important**: update your "Certificate name" here: https://courses.datatalks.club/de-zoomcamp-2024/enrollment - 37 | this is what we will use when generating certificates for you. 38 | 39 | ### Evaluation criteria 40 | 41 | See [here](../../week_7_project/README.md) 42 | 43 | 44 | -------------------------------------------------------------------------------- /cohorts/2024/workshops/dlt_resources/incremental_loading.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/DataTalksClub/data-engineering-zoomcamp/76fed2419e917176369a0d3a2eeff6c936f87286/cohorts/2024/workshops/dlt_resources/incremental_loading.png -------------------------------------------------------------------------------- /cohorts/2025/02-workflow-orchestration/solution.md: -------------------------------------------------------------------------------- 1 | ## Question 1 2 | 3 | ``` 4 | Within the execution for Yellow Taxi data for the year 2020 and month 12: what is the uncompressed file size (i.e. the output file yellow_tripdata_2020-12.csv of the extract task)? 5 | ``` 6 | 7 | To get this answer, you need to go to the Outputs tab in Kestra and select the file. The size will be next to the preview and download button. 8 | Answer: `128.3 MB` 9 | 10 | ## Question 2 11 | 12 | ``` 13 | What is the rendered value of the variable file when the inputs taxi is set to green, year is set to 2020, and month is set to 04 during execution? 14 | ``` 15 | 16 | To get this answer, you can run the expression in [Debug Outputs](https://youtu.be/SPGmXSJN3VE) to see it rendered. 17 | 18 | Answer: `green_tripdata_2020-04.csv` 19 | 20 | ## Question 3 21 | 22 | ``` 23 | How many rows are there for the Yellow Taxi data for all CSV files in the year 2020? 24 | ``` 25 | 26 | Answer: `24,648,499` 27 | 28 | ## Question 4 29 | 30 | ``` 31 | How many rows are there for the Green Taxi data for all CSV files in the year 2020? 32 | ``` 33 | 34 | Answer: `1,734,051` 35 | 36 | ## Question 5 37 | 38 | ``` 39 | How many rows are there for the Yellow Taxi data for the March 2021 CSV file? 
40 | ``` 41 | 42 | Answer: `1,925,152` 43 | 44 | ## Question 6 45 | 46 | ``` 47 | How would you configure the timezone to New York in a Schedule trigger? 48 | ``` 49 | 50 | Answer: `Add a timezone property set to America/New_York in the Schedule trigger configuration` 51 | -------------------------------------------------------------------------------- /cohorts/2025/04-analytics-engineering/homework_q2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/DataTalksClub/data-engineering-zoomcamp/76fed2419e917176369a0d3a2eeff6c936f87286/cohorts/2025/04-analytics-engineering/homework_q2.png -------------------------------------------------------------------------------- /cohorts/2025/05-batch/homework.md: -------------------------------------------------------------------------------- 1 | # Module 5 Homework 2 | 3 | In this homework we'll put what we learned about Spark in practice. 4 | 5 | For this homework we will be using the Yellow 2024-10 data from the official website: 6 | 7 | ```bash 8 | wget https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2024-10.parquet 9 | ``` 10 | 11 | 12 | ## Question 1: Install Spark and PySpark 13 | 14 | - Install Spark 15 | - Run PySpark 16 | - Create a local spark session 17 | - Execute spark.version. 18 | 19 | What's the output? 20 | 21 | > [!NOTE] 22 | > To install PySpark follow this [guide](https://github.com/DataTalksClub/data-engineering-zoomcamp/blob/main/05-batch/setup/pyspark.md) 23 | 24 | 25 | ## Question 2: Yellow October 2024 26 | 27 | Read the October 2024 Yellow into a Spark Dataframe. 28 | 29 | Repartition the Dataframe to 4 partitions and save it to parquet. 30 | 31 | What is the average size of the Parquet (ending with .parquet extension) Files that were created (in MB)? Select the answer which most closely matches. 32 | 33 | - 6MB 34 | - 25MB 35 | - 75MB 36 | - 100MB 37 | 38 | 39 | ## Question 3: Count records 40 | 41 | How many taxi trips were there on the 15th of October? 42 | 43 | Consider only trips that started on the 15th of October. 44 | 45 | - 85,567 46 | - 105,567 47 | - 125,567 48 | - 145,567 49 | 50 | 51 | ## Question 4: Longest trip 52 | 53 | What is the length of the longest trip in the dataset in hours? 54 | 55 | - 122 56 | - 142 57 | - 162 58 | - 182 59 | 60 | 61 | ## Question 5: User Interface 62 | 63 | Spark’s User Interface which shows the application's dashboard runs on which local port? 64 | 65 | - 80 66 | - 443 67 | - 4040 68 | - 8080 69 | 70 | 71 | 72 | ## Question 6: Least frequent pickup location zone 73 | 74 | Load the zone lookup data into a temp view in Spark: 75 | 76 | ```bash 77 | wget https://d37ci6vzurychx.cloudfront.net/misc/taxi_zone_lookup.csv 78 | ``` 79 | 80 | Using the zone lookup data and the Yellow October 2024 data, what is the name of the LEAST frequent pickup location Zone? 
81 | 82 | - Governor's Island/Ellis Island/Liberty Island 83 | - Arden Heights 84 | - Rikers Island 85 | - Jamaica Bay 86 | 87 | 88 | ## Submitting the solutions 89 | 90 | - Form for submitting: https://courses.datatalks.club/de-zoomcamp-2025/homework/hw5 91 | - Deadline: See the website 92 | -------------------------------------------------------------------------------- /cohorts/2025/README.md: -------------------------------------------------------------------------------- 1 | ## Data Engineering Zoomcamp 2025 Cohort 2 | 3 | * [Pre-launch Q&A stream](https://www.youtube.com/watch?v=DPnAOu2csYA) 4 | * [Launch stream with course overview](https://www.youtube.com/watch?v=X8cEEwi8DTM) 5 | * [Course Google calendar](https://calendar.google.com/calendar/?cid=ZXIxcjA1M3ZlYjJpcXU0dTFmaG02MzVxMG9AZ3JvdXAuY2FsZW5kYXIuZ29vZ2xlLmNvbQ) 6 | * [FAQ](https://docs.google.com/document/d/19bnYs80DwuUimHM65UV3sylsCn2j1vziPOwzBwQrebw/edit?usp=sharing) 7 | * [Course Playlist](https://www.youtube.com/playlist?list=PL3MmuxUbc_hJed7dXYoJw8DoCuVHhGEQb) 8 | * [Cohort-specific playlist: only 2025 Live videos](https://www.youtube.com/playlist?list=PL3MmuxUbc_hJZdpLpRHp7dg6EOx828q6y) 9 | 10 | 11 | [**Module 1: Introduction & Prerequisites**](01-docker-terraform/) 12 | 13 | * [Homework](01-docker-terraform/homework.md) 14 | 15 | 16 | [**Module 2: Workflow Orchestration**](02-workflow-orchestration) 17 | 18 | * [Homework](02-workflow-orchestration/homework.md) 19 | * Office hours 20 | 21 | [**Workshop 1: Data Ingestion**](workshops/dlt/README.md) 22 | 23 | * Workshop with dlt 24 | * [Homework](workshops/dlt/README.md) 25 | 26 | 27 | [**Module 3: Data Warehouse**](03-data-warehouse) 28 | 29 | * [Homework](03-data-warehouse/homework.md) 30 | 31 | 32 | [**Module 4: Analytics Engineering**](04-analytics-engineering/) 33 | 34 | * [Homework](04-analytics-engineering/homework.md) 35 | 36 | 37 | [**Module 5: Batch processing**](05-batch/) 38 | 39 | * [Homework](05-batch/homework.md) 40 | 41 | 42 | [**Module 6: Stream Processing**](06-streaming) 43 | 44 | * [Homework](06-streaming/homework.md) 45 | 46 | 47 | [**Project**](project.md) 48 | 49 | More information [here](project.md) 50 | -------------------------------------------------------------------------------- /cohorts/2025/project.md: -------------------------------------------------------------------------------- 1 | ## Course Project 2 | 3 | The goal of this project is to apply everything we learned 4 | in this course and build an end-to-end data pipeline. 5 | 6 | You will have two attempts to submit your project. If you don't have 7 | time to submit your project by the end of attempt #1 (you started the 8 | course late, you have vacation plans, life/work got in the way, etc.) 9 | or you fail your first attempt, 10 | then you will have a second chance to submit your project as attempt 11 | #2. 12 | 13 | There are only two attempts. 14 | 15 | Remember that to pass the project, you must evaluate 3 peers. If you don't do that, 16 | your project can't be considered complete. 17 | 18 | To find the projects assigned to you, use the peer review assignments link 19 | and find your hash in the first column. You will see three rows: you need to evaluate 20 | each of these projects. For each project, you need to submit the form once, 21 | so in total, you will make three submissions. 
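The exact hashing scheme is not restated in this edition, but the 2023 version of this file (shown earlier in this repo) derives the hash as the SHA-1 of your lowercased email; assuming the same scheme still applies, a minimal sketch:

```python
from hashlib import sha1

def compute_hash(email: str) -> str:
    # SHA-1 hex digest of the lowercased email, as used in the 2023 edition of this file
    return sha1(email.lower().encode('utf-8')).hexdigest()

# 'you@example.com' is a placeholder -- use the email you registered with
print(compute_hash('you@example.com'))
```

An online SHA-1 tool works too, as the 2023 edition suggests.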
22 | 23 | 24 | ### Submitting 25 | 26 | #### Project Attempt #1 27 | 28 | * Project: https://courses.datatalks.club/de-zoomcamp-2025/project/project1 29 | * Review: https://courses.datatalks.club/de-zoomcamp-2025/project/project1/eval 30 | 31 | #### Project Attempt #2 32 | 33 | * Project: https://courses.datatalks.club/de-zoomcamp-2025/project/project2 34 | * Review: https://courses.datatalks.club/de-zoomcamp-2025/project/project2/eval 35 | 36 | > **Important**: update your "Certificate name" here: https://courses.datatalks.club/de-zoomcamp-2025/enrollment - 37 | this is what we will use when generating certificates for you. 38 | 39 | ### Evaluation criteria 40 | 41 | See [here](../../projects/README.md) 42 | -------------------------------------------------------------------------------- /cohorts/2025/workshops/dlt/img/Rest_API.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/DataTalksClub/data-engineering-zoomcamp/76fed2419e917176369a0d3a2eeff6c936f87286/cohorts/2025/workshops/dlt/img/Rest_API.png -------------------------------------------------------------------------------- /cohorts/2025/workshops/dlt/img/dlt.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/DataTalksClub/data-engineering-zoomcamp/76fed2419e917176369a0d3a2eeff6c936f87286/cohorts/2025/workshops/dlt/img/dlt.png -------------------------------------------------------------------------------- /cohorts/2025/workshops/dlt/img/pipes.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/DataTalksClub/data-engineering-zoomcamp/76fed2419e917176369a0d3a2eeff6c936f87286/cohorts/2025/workshops/dlt/img/pipes.jpg -------------------------------------------------------------------------------- /dataset.md: -------------------------------------------------------------------------------- 1 | [Medium article](https://medium.com/@NYCTLC/what-makes-a-city-street-smart-23496d92f60d) 2 | 3 | [Trip record user guide](https://www1.nyc.gov/assets/tlc/downloads/pdf/trip_record_user_guide.pdf) 4 | 5 | The data set is divided into 4 parts: 6 | 7 | - Yellow cabs 8 | - Green cabs 9 | - For Hire Vehicles 10 | - High volume for hire vehicles 11 | 12 | 13 | 14 | Below I am only concentrating on Yellow and green cabs 15 | 16 | ### Yellow and green cabs 17 | 18 | , 19 | 20 | | Columns | Definition | Example | 21 | | --------------------- | ---------- | ------------------- | 22 | | VendorID | | 2 | 23 | | lpep_pickup_datetime | | 2021-01-01 00:15:56 | 24 | | lpep_dropoff_datetime | | 2021-01-01 00:19:52 | 25 | | store_and_fwd_flag | | N, | 26 | | RatecodeID | | 1 | 27 | | PULocationID | | 43 | 28 | | DOLocationID | | 151 | 29 | | passenger_count | | 1 | 30 | | trip_distance | | 1.01 | 31 | | fare_amount | | 5.5 | 32 | | extra | | 0.5 | 33 | | mta_tax | | 0.5 | 34 | | tip_amount | | 0 | 35 | | tolls_amount | | 0 | 36 | | ehail_fee | | | 37 | | improvement_surcharge | | 0.3 | 38 | | total_amount | | 6.8 | 39 | | payment_type | | 2 | 40 | | trip_type | | 1 | 41 | | congestion_surcharge | | 0 | 42 | 43 | 44 | 45 | ### Taxi zone Loopup 46 | 47 | | Columns | Definition | Example | 48 | | ------------ | ---------- | -------------- | 49 | | LocationID | | 1 | 50 | | Borough | | EWR | 51 | | Zone | | Newark Airport | 52 | | service_zone | | EWR | 53 | 54 | [Shapefile from S3](https://s3.amazonaws.com/nyctlc/misc/taxi_zones.zip) 55 | 56 | [Taxi 
zones](https://data.cityofnewyork.us/Transportation/NYC-Taxi-Zones/d3c5-ddgc) 57 | 58 | -------------------------------------------------------------------------------- /images/architecture/arch_v3_workshops.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/DataTalksClub/data-engineering-zoomcamp/76fed2419e917176369a0d3a2eeff6c936f87286/images/architecture/arch_v3_workshops.jpg -------------------------------------------------------------------------------- /images/architecture/arch_v4_workshops.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/DataTalksClub/data-engineering-zoomcamp/76fed2419e917176369a0d3a2eeff6c936f87286/images/architecture/arch_v4_workshops.jpg -------------------------------------------------------------------------------- /images/architecture/photo1700757552.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/DataTalksClub/data-engineering-zoomcamp/76fed2419e917176369a0d3a2eeff6c936f87286/images/architecture/photo1700757552.jpeg -------------------------------------------------------------------------------- /images/aws/iam.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/DataTalksClub/data-engineering-zoomcamp/76fed2419e917176369a0d3a2eeff6c936f87286/images/aws/iam.png -------------------------------------------------------------------------------- /images/dlthub.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/DataTalksClub/data-engineering-zoomcamp/76fed2419e917176369a0d3a2eeff6c936f87286/images/dlthub.png -------------------------------------------------------------------------------- /images/mage.svg: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | -------------------------------------------------------------------------------- /images/piperider.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/DataTalksClub/data-engineering-zoomcamp/76fed2419e917176369a0d3a2eeff6c936f87286/images/piperider.png -------------------------------------------------------------------------------- /images/rising-wave.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/DataTalksClub/data-engineering-zoomcamp/76fed2419e917176369a0d3a2eeff6c936f87286/images/rising-wave.png -------------------------------------------------------------------------------- /learning-in-public.md: -------------------------------------------------------------------------------- 1 | # Learning in public 2 | 3 | Most people learn in private: they consume content but don't tell 4 | anyone about it. There's nothing wrong with it. 5 | 6 | But we want to encourage you to document your progress and 7 | share it publicly on social media. 
8 | 9 | It helps you get noticed and will lead to: 10 | 11 | * Expanding your network: meeting new people and making new friends 12 | * Being invited to meetups, conferences and podcasts 13 | * Landing a job or getting clients 14 | * Many other good things 15 | 16 | Here's a more compresensive reading on why you want to do it: https://github.com/readme/guides/publishing-your-work 17 | 18 | 19 | ## Learning in Public for Zoomcamps 20 | 21 | When you submit your homework or project, you can also submit 22 | learning in public posts: 23 | 24 | 25 | 26 | You can watch this video to see how your learning in public posts may look like: 27 | 28 | 29 | 30 | 31 | 32 | ## Daily Documentation 33 | 34 | - **Post Daily Diaries**: Document what you learn each day, including the challenges faced and the methods used to overcome them. 35 | - **Create Quick Videos**: Make short videos showcasing your work and upload them to GitHub. 36 | 37 | Send a PR if you want to suggest improvements for this document 38 | -------------------------------------------------------------------------------- /projects/datasets.md: -------------------------------------------------------------------------------- 1 | ## Datasets 2 | 3 | Here are some datasets that you could use for the project: 4 | 5 | 6 | * [Kaggle](https://www.kaggle.com/datasets) 7 | * [AWS datasets](https://registry.opendata.aws/) 8 | * [UK government open data](https://data.gov.uk/) 9 | * [Github archive](https://www.gharchive.org) 10 | * [Awesome public datasets](https://github.com/awesomedata/awesome-public-datasets) 11 | * [Million songs dataset](http://millionsongdataset.com) 12 | * [Some random datasets](https://components.one/datasets/) 13 | * [COVID Datasets](https://www.reddit.com/r/datasets/comments/n3ph2d/coronavirus_datsets/) 14 | * [Datasets from Azure](https://docs.microsoft.com/en-us/azure/azure-sql/public-data-sets) 15 | * [Datasets from BigQuery](https://cloud.google.com/bigquery/public-data/) 16 | * [Dataset search engine from Google](https://datasetsearch.research.google.com/) 17 | * [Public datasets offered by different GCP services](https://cloud.google.com/solutions/datasets) 18 | * [European statistics datasets](https://ec.europa.eu/eurostat/data/database) 19 | * [Datasets for streaming](https://github.com/ColinEberhardt/awesome-public-streaming-datasets) 20 | * [Dataset for Santander bicycle rentals in London](https://cycling.data.tfl.gov.uk/) 21 | * [Common crawl data](https://commoncrawl.org/) (copy of the internet) 22 | * [NASA's EarthData](https://search.earthdata.nasa.gov/search) (May require introductory geospatial analysis) 23 | * Collection Of Data Repositories 24 | * [part 1](https://www.kdnuggets.com/2022/04/complete-collection-data-repositories-part-1.html) (from agriculture and finance to government) 25 | * [part 2](https://www.kdnuggets.com/2022/04/complete-collection-data-repositories-part-2.html) (from healthcare to transportation) 26 | * [Data For Good by Meta](https://dataforgood.facebook.com/dfg/tools) 27 | 28 | PRs with more datasets are welcome! 29 | 30 | It's not mandatory that you use a dataset from this list. You can use any dataset you want. 31 | --------------------------------------------------------------------------------