├── 05-batch ├── .gitignore ├── setup │ ├── config │ │ ├── spark.dockerfile │ │ ├── spark-defaults.conf │ │ └── core-site.xml │ ├── pyspark.md │ └── linux.md └── code │ ├── download_data.sh │ └── 06_spark_sql.py ├── 06-streaming ├── .gitignore ├── python │ ├── requirements.txt │ ├── json_example │ │ ├── settings.py │ │ ├── consumer.py │ │ ├── producer.py │ │ └── ride.py │ ├── redpanda_example │ │ ├── settings.py │ │ ├── producer.py │ │ ├── ride.py │ │ ├── consumer.py │ │ └── docker-compose.yaml │ ├── docker │ │ ├── spark │ │ │ ├── spark-master.Dockerfile │ │ │ ├── spark-worker.Dockerfile │ │ │ ├── jupyterlab.Dockerfile │ │ │ ├── cluster-base.Dockerfile │ │ │ ├── build.sh │ │ │ ├── spark-base.Dockerfile │ │ │ └── docker-compose.yml │ │ ├── README.md │ │ └── kafka │ │ │ └── docker-compose.yml │ ├── resources │ │ └── schemas │ │ │ ├── taxi_ride_key.avsc │ │ │ └── taxi_ride_value.avsc │ ├── streams-example │ │ ├── faust │ │ │ ├── taxi_rides.py │ │ │ ├── stream.py │ │ │ ├── stream_count_vendor_trips.py │ │ │ ├── windowing.py │ │ │ ├── branch_price.py │ │ │ └── producer_taxi_json.py │ │ ├── pyspark │ │ │ ├── spark-submit.sh │ │ │ ├── settings.py │ │ │ ├── README.md │ │ │ ├── consumer.py │ │ │ └── producer.py │ │ └── redpanda │ │ │ ├── spark-submit.sh │ │ │ ├── settings.py │ │ │ ├── README.md │ │ │ ├── consumer.py │ │ │ └── producer.py │ ├── avro_example │ │ ├── settings.py │ │ ├── ride_record_key.py │ │ └── ride_record.py │ └── README.md ├── java │ └── kafka_examples │ │ ├── gradle │ │ └── wrapper │ │ │ ├── gradle-wrapper.jar │ │ │ └── gradle-wrapper.properties │ │ ├── settings.gradle │ │ ├── .gitignore │ │ ├── src │ │ ├── main │ │ │ ├── java │ │ │ │ └── org │ │ │ │ │ └── example │ │ │ │ │ ├── Topics.java │ │ │ │ │ ├── Secrets.java │ │ │ │ │ ├── data │ │ │ │ │ ├── PickupLocation.java │ │ │ │ │ ├── VendorInfo.java │ │ │ │ │ └── Ride.java │ │ │ │ │ ├── customserdes │ │ │ │ │ └── CustomSerdes.java │ │ │ │ │ ├── JsonProducerPickupLocation.java │ │ │ │ │ ├── JsonKStream.java │ │ │ │ │ ├── JsonConsumer.java │ │ │ │ │ ├── JsonKStreamWindow.java │ │ │ │ │ └── JsonProducer.java │ │ │ └── avro │ │ │ │ ├── rides.avsc │ │ │ │ ├── rides_non_compatible.avsc │ │ │ │ └── rides_compatible.avsc │ │ └── test │ │ │ └── java │ │ │ └── org │ │ │ └── example │ │ │ ├── helper │ │ │ └── DataGeneratorHelper.java │ │ │ └── JsonKStreamJoinsTest.java │ │ └── build.gradle └── ksqldb │ └── commands.md ├── 04-analytics-engineering ├── taxi_rides_ny │ ├── .gitkeep │ ├── seeds │ │ ├── .gitkeep │ │ └── seeds_properties.yml │ ├── analyses │ │ ├── .gitkeep │ │ └── hack-load-data.sql │ ├── macros │ │ ├── .gitkeep │ │ ├── macros_properties.yml │ │ └── get_payment_type_description.sql │ ├── snapshots │ │ └── .gitkeep │ ├── packages.yml │ ├── .gitignore │ ├── package-lock.yml │ ├── models │ │ ├── core │ │ │ ├── dim_zones.sql │ │ │ ├── dm_monthly_zone_revenue.sql │ │ │ └── fact_trips.sql │ │ └── staging │ │ │ ├── stg_yellow_tripdata.sql │ │ │ └── stg_green_tripdata.sql │ ├── dbt_project.yml │ └── README.md └── docker_setup │ └── docker-compose.yaml ├── 01-docker-terraform ├── 2_docker_sql │ ├── .gitignore │ ├── pipeline.py │ ├── Dockerfile │ ├── docker-compose.yaml │ ├── ingest_data.py │ └── data-loading-parquet.py └── 1_terraform_gcp │ ├── README.md │ ├── terraform │ ├── README.md │ ├── terraform_with_variables │ │ ├── main.tf │ │ └── variables.tf │ └── terraform_basic │ │ └── main.tf │ ├── 2_gcp_overview.md │ └── 1_terraform_overview.md ├── images ├── dlthub.png ├── aws │ └── iam.png ├── piperider.png ├── rising-wave.png ├── 
architecture │ ├── arch_v3_workshops.jpg │ ├── arch_v4_workshops.jpg │ └── photo1700757552.jpeg └── mage.svg ├── cohorts ├── 2022 │ ├── week_2_data_ingestion │ │ ├── airflow │ │ │ ├── requirements.txt │ │ │ ├── docs │ │ │ │ ├── arch-diag-airflow.png │ │ │ │ ├── gcs_ingestion_dag.png │ │ │ │ └── 1_concepts.md │ │ │ ├── extras │ │ │ │ └── web_to_gcs.sh │ │ │ ├── scripts │ │ │ │ └── entrypoint.sh │ │ │ ├── .env_example │ │ │ ├── Dockerfile │ │ │ ├── dags_local │ │ │ │ ├── ingest_script.py │ │ │ │ └── data_ingestion_local.py │ │ │ └── docker-compose-nofrills.yml │ │ └── transfer_service │ │ │ └── README.md │ ├── week_3_data_warehouse │ │ └── airflow │ │ │ ├── docs │ │ │ ├── gcs_2_bq_dag_tree_view.png │ │ │ └── gcs_2_bq_dag_graph_view.png │ │ │ ├── scripts │ │ │ └── entrypoint.sh │ │ │ ├── .env_example │ │ │ └── docker-compose-nofrills.yml │ ├── week_6_stream_processing │ │ └── homework.md │ ├── README.md │ ├── project.md │ ├── week_5_batch_processing │ │ └── homework.md │ └── week_1_basics_n_setup │ │ └── homework.md ├── 2023 │ ├── week_6_stream_processing │ │ ├── client.properties │ │ ├── spark-submit.sh │ │ ├── settings.py │ │ └── producer_confluent.py │ ├── week_1_terraform │ │ └── homework.md │ ├── workshops │ │ └── piperider.md │ ├── README.md │ ├── project.md │ ├── week_1_docker_sql │ │ └── homework.md │ └── week_5_batch_processing │ │ └── homework.md ├── 2024 │ ├── workshops │ │ └── dlt_resources │ │ │ └── incremental_loading.png │ ├── 06-streaming │ │ └── docker-compose.yml │ ├── project.md │ ├── README.md │ └── 05-batch │ │ └── homework.md └── 2025 │ ├── workshops │ └── dlt.md │ ├── README.md │ ├── project.md │ └── 05-batch │ └── homework.md ├── 02-workflow-orchestration ├── images │ └── homework.png ├── postgres │ └── docker-compose.yml ├── flows │ ├── 05_gcp_setup.yaml │ ├── 04_gcp_kv.yaml │ ├── 01_getting_started_data_pipeline.yaml │ ├── 03_postgres_dbt.yaml │ └── 07_gcp_dbt.yaml ├── docker-compose.yml └── homework.md ├── 03-data-warehouse ├── extras │ ├── README.md │ └── web_to_gcs.py ├── extract_model.md ├── big_query_hw.sql ├── big_query.sql └── big_query_ml.sql ├── .gitignore ├── after-sign-up.md ├── learning-in-public.md ├── certificates.md ├── projects └── datasets.md └── dataset.md /05-batch/.gitignore: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /06-streaming/.gitignore: -------------------------------------------------------------------------------- 1 | week6_venv -------------------------------------------------------------------------------- /04-analytics-engineering/taxi_rides_ny/.gitkeep: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /04-analytics-engineering/taxi_rides_ny/seeds/.gitkeep: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /04-analytics-engineering/taxi_rides_ny/analyses/.gitkeep: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /04-analytics-engineering/taxi_rides_ny/macros/.gitkeep: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- 
/04-analytics-engineering/taxi_rides_ny/snapshots/.gitkeep: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /05-batch/setup/config/spark.dockerfile: -------------------------------------------------------------------------------- 1 | FROM library/openjdk:11 -------------------------------------------------------------------------------- /01-docker-terraform/2_docker_sql/.gitignore: -------------------------------------------------------------------------------- 1 | .ipynb_checkpoints/ 2 | ny_taxi_postgres_data/ 3 | *.csv -------------------------------------------------------------------------------- /images/dlthub.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Supraav/data-engineering-zoomcamp/main/images/dlthub.png -------------------------------------------------------------------------------- /cohorts/2022/week_2_data_ingestion/airflow/requirements.txt: -------------------------------------------------------------------------------- 1 | apache-airflow-providers-google 2 | pyarrow 3 | -------------------------------------------------------------------------------- /images/aws/iam.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Supraav/data-engineering-zoomcamp/main/images/aws/iam.png -------------------------------------------------------------------------------- /images/piperider.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Supraav/data-engineering-zoomcamp/main/images/piperider.png -------------------------------------------------------------------------------- /images/rising-wave.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Supraav/data-engineering-zoomcamp/main/images/rising-wave.png -------------------------------------------------------------------------------- /06-streaming/python/requirements.txt: -------------------------------------------------------------------------------- 1 | kafka-python==1.4.6 2 | confluent_kafka 3 | requests 4 | avro 5 | faust 6 | fastavro 7 | -------------------------------------------------------------------------------- /cohorts/2025/workshops/dlt.md: -------------------------------------------------------------------------------- 1 | # Data ingestion with dlt 2 | 3 | Sign up here: https://lu.ma/quyfn4q8 (optional) 4 | 5 | Details TBA 6 | -------------------------------------------------------------------------------- /images/architecture/arch_v3_workshops.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Supraav/data-engineering-zoomcamp/main/images/architecture/arch_v3_workshops.jpg -------------------------------------------------------------------------------- /images/architecture/arch_v4_workshops.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Supraav/data-engineering-zoomcamp/main/images/architecture/arch_v4_workshops.jpg -------------------------------------------------------------------------------- /images/architecture/photo1700757552.jpeg: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/Supraav/data-engineering-zoomcamp/main/images/architecture/photo1700757552.jpeg -------------------------------------------------------------------------------- /02-workflow-orchestration/images/homework.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Supraav/data-engineering-zoomcamp/main/02-workflow-orchestration/images/homework.png -------------------------------------------------------------------------------- /06-streaming/python/json_example/settings.py: -------------------------------------------------------------------------------- 1 | INPUT_DATA_PATH = '../resources/rides.csv' 2 | 3 | BOOTSTRAP_SERVERS = ['localhost:9092'] 4 | KAFKA_TOPIC = 'rides_json' 5 | -------------------------------------------------------------------------------- /04-analytics-engineering/taxi_rides_ny/packages.yml: -------------------------------------------------------------------------------- 1 | packages: 2 | - package: dbt-labs/dbt_utils 3 | version: 1.1.1 4 | - package: dbt-labs/codegen 5 | version: 0.12.1 -------------------------------------------------------------------------------- /06-streaming/python/redpanda_example/settings.py: -------------------------------------------------------------------------------- 1 | INPUT_DATA_PATH = '../resources/rides.csv' 2 | 3 | BOOTSTRAP_SERVERS = ['localhost:9092'] 4 | KAFKA_TOPIC = 'rides_json' 5 | -------------------------------------------------------------------------------- /cohorts/2024/workshops/dlt_resources/incremental_loading.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Supraav/data-engineering-zoomcamp/main/cohorts/2024/workshops/dlt_resources/incremental_loading.png -------------------------------------------------------------------------------- /06-streaming/java/kafka_examples/gradle/wrapper/gradle-wrapper.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Supraav/data-engineering-zoomcamp/main/06-streaming/java/kafka_examples/gradle/wrapper/gradle-wrapper.jar -------------------------------------------------------------------------------- /06-streaming/java/kafka_examples/settings.gradle: -------------------------------------------------------------------------------- 1 | pluginManagement { 2 | repositories { 3 | gradlePluginPortal() 4 | mavenCentral() 5 | } 6 | } 7 | rootProject.name = 'kafka_examples' -------------------------------------------------------------------------------- /cohorts/2022/week_2_data_ingestion/airflow/docs/arch-diag-airflow.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Supraav/data-engineering-zoomcamp/main/cohorts/2022/week_2_data_ingestion/airflow/docs/arch-diag-airflow.png -------------------------------------------------------------------------------- /cohorts/2022/week_2_data_ingestion/airflow/docs/gcs_ingestion_dag.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Supraav/data-engineering-zoomcamp/main/cohorts/2022/week_2_data_ingestion/airflow/docs/gcs_ingestion_dag.png -------------------------------------------------------------------------------- /04-analytics-engineering/taxi_rides_ny/.gitignore: -------------------------------------------------------------------------------- 1 | # you shouldn't commit these into source 
control 2 | # these are the default directory names, adjust/add to fit your needs 3 | target/ 4 | dbt_packages/ 5 | logs/ 6 | -------------------------------------------------------------------------------- /05-batch/setup/config/spark-defaults.conf: -------------------------------------------------------------------------------- 1 | spark-master yarn 2 | spark.hadoop.google.cloud.auth.service.account.enable true 3 | spark.hadoop.google.cloud.auth.service.account.json.keyfile /home/alexey 4 | -------------------------------------------------------------------------------- /cohorts/2022/week_3_data_warehouse/airflow/docs/gcs_2_bq_dag_tree_view.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Supraav/data-engineering-zoomcamp/main/cohorts/2022/week_3_data_warehouse/airflow/docs/gcs_2_bq_dag_tree_view.png -------------------------------------------------------------------------------- /cohorts/2022/week_3_data_warehouse/airflow/docs/gcs_2_bq_dag_graph_view.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Supraav/data-engineering-zoomcamp/main/cohorts/2022/week_3_data_warehouse/airflow/docs/gcs_2_bq_dag_graph_view.png -------------------------------------------------------------------------------- /04-analytics-engineering/taxi_rides_ny/package-lock.yml: -------------------------------------------------------------------------------- 1 | packages: 2 | - package: dbt-labs/dbt_utils 3 | version: 1.1.1 4 | - package: dbt-labs/codegen 5 | version: 0.12.1 6 | sha1_hash: d974113b0f072cce35300077208f38581075ab40 7 | -------------------------------------------------------------------------------- /06-streaming/java/kafka_examples/.gitignore: -------------------------------------------------------------------------------- 1 | .gradle 2 | bin 3 | !src/main/resources/rides.csv 4 | 5 | build/classes 6 | build/generated 7 | build/libs 8 | build/reports 9 | build/resources 10 | build/test-results 11 | build/tmp 12 | -------------------------------------------------------------------------------- /01-docker-terraform/2_docker_sql/pipeline.py: -------------------------------------------------------------------------------- 1 | import sys 2 | 3 | import pandas as pd 4 | 5 | print(sys.argv) 6 | 7 | day = sys.argv[1] 8 | 9 | # some fancy stuff with pandas 10 | 11 | print(f'job finished successfully for day = {day}') -------------------------------------------------------------------------------- /01-docker-terraform/2_docker_sql/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM python:3.9.1 2 | 3 | RUN apt-get install wget 4 | RUN pip install pandas sqlalchemy psycopg2 5 | 6 | WORKDIR /app 7 | COPY ingest_data.py ingest_data.py 8 | 9 | ENTRYPOINT [ "python", "ingest_data.py" ] -------------------------------------------------------------------------------- /04-analytics-engineering/taxi_rides_ny/models/core/dim_zones.sql: -------------------------------------------------------------------------------- 1 | {{ config(materialized='table') }} 2 | 3 | select 4 | locationid, 5 | borough, 6 | zone, 7 | replace(service_zone,'Boro','Green') as service_zone 8 | from {{ ref('taxi_zone_lookup') }} -------------------------------------------------------------------------------- /06-streaming/python/docker/spark/spark-master.Dockerfile: -------------------------------------------------------------------------------- 1 | FROM 
spark-base 2 | 3 | # -- Runtime 4 | 5 | ARG spark_master_web_ui=8080 6 | 7 | EXPOSE ${spark_master_web_ui} ${SPARK_MASTER_PORT} 8 | CMD bin/spark-class org.apache.spark.deploy.master.Master >> logs/spark-master.out -------------------------------------------------------------------------------- /06-streaming/python/resources/schemas/taxi_ride_key.avsc: -------------------------------------------------------------------------------- 1 | { 2 | "namespace": "com.datatalksclub.taxi", 3 | "type": "record", 4 | "name": "RideRecordKey", 5 | "fields": [ 6 | { 7 | "name": "vendor_id", 8 | "type": "int" 9 | } 10 | ] 11 | } -------------------------------------------------------------------------------- /06-streaming/python/streams-example/faust/taxi_rides.py: -------------------------------------------------------------------------------- 1 | import faust 2 | 3 | 4 | class TaxiRide(faust.Record, validation=True): 5 | vendorId: str 6 | passenger_count: int 7 | trip_distance: float 8 | payment_type: int 9 | total_amount: float 10 | -------------------------------------------------------------------------------- /06-streaming/java/kafka_examples/gradle/wrapper/gradle-wrapper.properties: -------------------------------------------------------------------------------- 1 | distributionBase=GRADLE_USER_HOME 2 | distributionPath=wrapper/dists 3 | distributionUrl=https\://services.gradle.org/distributions/gradle-7.5.1-bin.zip 4 | zipStoreBase=GRADLE_USER_HOME 5 | zipStorePath=wrapper/dists 6 | -------------------------------------------------------------------------------- /06-streaming/python/docker/spark/spark-worker.Dockerfile: -------------------------------------------------------------------------------- 1 | FROM spark-base 2 | 3 | # -- Runtime 4 | 5 | ARG spark_worker_web_ui=8081 6 | 7 | EXPOSE ${spark_worker_web_ui} 8 | CMD bin/spark-class org.apache.spark.deploy.worker.Worker spark://${SPARK_MASTER_HOST}:${SPARK_MASTER_PORT} >> logs/spark-worker.out 9 | -------------------------------------------------------------------------------- /cohorts/2022/week_6_stream_processing/homework.md: -------------------------------------------------------------------------------- 1 | ## Week 6 Homework 2 | [Form](https://forms.gle/mSzfpPCXskWCabeu5) 3 | 4 | The homework is mostly theoretical. In the last question you have to provide working code link, please keep in mind that this 5 | question is not scored. 6 | 7 | Deadline: 14 March, 22:00 CET -------------------------------------------------------------------------------- /03-data-warehouse/extras/README.md: -------------------------------------------------------------------------------- 1 | Quick hack to load files directly to GCS, without Airflow. Downloads csv files from https://nyc-tlc.s3.amazonaws.com/trip+data/ and uploads them to your Cloud Storage Account as parquet files. 2 | 3 | 1. Install pre-reqs (more info in `web_to_gcs.py` script) 4 | 2. 
Run: `python web_to_gcs.py` 5 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | 2 | .DS_Store 3 | .idea 4 | *.tfstate 5 | *.tfstate.* 6 | **.terraform 7 | **.terraform.lock.* 8 | **google_credentials.json 9 | **logs/ 10 | **.env 11 | **__pycache__/ 12 | .history 13 | **/ny_taxi_postgres_data/* 14 | serving_dir 15 | .ipynb_checkpoints/ 16 | !week_6_stream_processing/avro_example/data/rides.csv 17 | -------------------------------------------------------------------------------- /06-streaming/java/kafka_examples/src/main/java/org/example/Topics.java: -------------------------------------------------------------------------------- 1 | package org.example; 2 | 3 | public class Topics { 4 | public static final String INPUT_RIDE_TOPIC = "rides"; 5 | public static final String INPUT_RIDE_LOCATION_TOPIC = "rides_location"; 6 | public static final String OUTPUT_TOPIC = "vendor_info"; 7 | } 8 | -------------------------------------------------------------------------------- /06-streaming/python/avro_example/settings.py: -------------------------------------------------------------------------------- 1 | INPUT_DATA_PATH = '../resources/rides.csv' 2 | 3 | RIDE_KEY_SCHEMA_PATH = '../resources/schemas/taxi_ride_key.avsc' 4 | RIDE_VALUE_SCHEMA_PATH = '../resources/schemas/taxi_ride_value.avsc' 5 | 6 | SCHEMA_REGISTRY_URL = 'http://localhost:8081' 7 | BOOTSTRAP_SERVERS = 'localhost:9092' 8 | KAFKA_TOPIC = 'rides_avro' 9 | -------------------------------------------------------------------------------- /06-streaming/java/kafka_examples/src/main/avro/rides.avsc: -------------------------------------------------------------------------------- 1 | { 2 | "type": "record", 3 | "name":"RideRecord", 4 | "namespace": "schemaregistry", 5 | "fields":[ 6 | {"name":"vendor_id","type":"string"}, 7 | {"name":"passenger_count","type":"int"}, 8 | {"name":"trip_distance","type":"double"} 9 | ] 10 | } -------------------------------------------------------------------------------- /06-streaming/java/kafka_examples/src/main/avro/rides_non_compatible.avsc: -------------------------------------------------------------------------------- 1 | { 2 | "type": "record", 3 | "name":"RideRecordNoneCompatible", 4 | "namespace": "schemaregistry", 5 | "fields":[ 6 | {"name":"vendorId","type":"int"}, 7 | {"name":"passenger_count","type":"int"}, 8 | {"name":"trip_distance","type":"double"} 9 | ] 10 | } -------------------------------------------------------------------------------- /cohorts/2022/week_2_data_ingestion/airflow/extras/web_to_gcs.sh: -------------------------------------------------------------------------------- 1 | dataset_url=${dataset_url} 2 | dataset_file=${dataset_file} 3 | path_to_local_file=${path_to_local_file} 4 | path_to_creds=${path_to_creds} 5 | 6 | curl -sS "$dataset_url" > $path_to_local_file/$dataset_file 7 | gcloud auth activate-service-account --key-file=$path_to_creds 8 | gsutil -m cp $path_to_local_file/$dataset_file gs://$BUCKET 9 | -------------------------------------------------------------------------------- /04-analytics-engineering/docker_setup/docker-compose.yaml: -------------------------------------------------------------------------------- 1 | version: '3' 2 | services: 3 | dbt-bq-dtc: 4 | build: 5 | context: . 
6 | target: dbt-bigquery 7 | image: dbt/bigquery 8 | volumes: 9 | - .:/usr/app 10 | - ~/.dbt/:/root/.dbt/ 11 | - ~/.google/credentials/google_credentials.json:/.google/credentials/google_credentials.json 12 | network_mode: host -------------------------------------------------------------------------------- /02-workflow-orchestration/postgres/docker-compose.yml: -------------------------------------------------------------------------------- 1 | version: "3.8" 2 | services: 3 | postgres: 4 | image: postgres 5 | container_name: postgres-db 6 | environment: 7 | POSTGRES_USER: kestra 8 | POSTGRES_PASSWORD: k3str4 9 | POSTGRES_DB: postgres-zoomcamp 10 | ports: 11 | - "5432:5432" 12 | volumes: 13 | - postgres-data:/var/lib/postgresql/data 14 | volumes: 15 | postgres-data: -------------------------------------------------------------------------------- /cohorts/2023/week_6_stream_processing/client.properties: -------------------------------------------------------------------------------- 1 | # Required connection configs for Kafka producer, consumer, and admin 2 | bootstrap.servers=:9092 3 | security.protocol=SASL_SSL 4 | sasl.mechanisms=PLAIN 5 | sasl.username= 6 | sasl.password= 7 | 8 | # Best practice for higher availability in librdkafka clients prior to 1.7 9 | session.timeout.ms=45000 -------------------------------------------------------------------------------- /06-streaming/java/kafka_examples/src/main/avro/rides_compatible.avsc: -------------------------------------------------------------------------------- 1 | { 2 | "type": "record", 3 | "name":"RideRecordCompatible", 4 | "namespace": "schemaregistry", 5 | "fields":[ 6 | {"name":"vendorId","type":"string"}, 7 | {"name":"passenger_count","type":"int"}, 8 | {"name":"trip_distance","type":"double"}, 9 | {"name":"pu_location_id", "type": [ "null", "long" ], "default": null} 10 | ] 11 | } -------------------------------------------------------------------------------- /04-analytics-engineering/taxi_rides_ny/macros/macros_properties.yml: -------------------------------------------------------------------------------- 1 | version: 2 2 | 3 | macros: 4 | - name: get_payment_type_description 5 | description: > 6 | This macro receives a payment_type and returns the corresponding description. 7 | arguments: 8 | - name: payment_type 9 | type: int 10 | description: > 11 | payment_type value. 
12 | Must be one of the accepted values, otherwise the macro will return null -------------------------------------------------------------------------------- /cohorts/2022/week_2_data_ingestion/airflow/scripts/entrypoint.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | export GOOGLE_APPLICATION_CREDENTIALS=${GOOGLE_APPLICATION_CREDENTIALS} 3 | export AIRFLOW_CONN_GOOGLE_CLOUD_DEFAULT=${AIRFLOW_CONN_GOOGLE_CLOUD_DEFAULT} 4 | 5 | airflow db upgrade 6 | 7 | airflow users create -r Admin -u admin -p admin -e admin@example.com -f admin -l airflow 8 | # "$_AIRFLOW_WWW_USER_USERNAME" -p "$_AIRFLOW_WWW_USER_PASSWORD" 9 | 10 | airflow webserver 11 | -------------------------------------------------------------------------------- /cohorts/2022/week_3_data_warehouse/airflow/scripts/entrypoint.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | export GOOGLE_APPLICATION_CREDENTIALS=${GOOGLE_APPLICATION_CREDENTIALS} 3 | export AIRFLOW_CONN_GOOGLE_CLOUD_DEFAULT=${AIRFLOW_CONN_GOOGLE_CLOUD_DEFAULT} 4 | 5 | airflow db upgrade 6 | 7 | airflow users create -r Admin -u admin -p admin -e admin@example.com -f admin -l airflow 8 | # "$_AIRFLOW_WWW_USER_USERNAME" -p "$_AIRFLOW_WWW_USER_PASSWORD" 9 | 10 | airflow webserver 11 | -------------------------------------------------------------------------------- /06-streaming/python/streams-example/faust/stream.py: -------------------------------------------------------------------------------- 1 | import faust 2 | from taxi_rides import TaxiRide 3 | 4 | 5 | app = faust.App('datatalksclub.stream.v2', broker='kafka://localhost:9092') 6 | topic = app.topic('datatalkclub.yellow_taxi_ride.json', value_type=TaxiRide) 7 | 8 | 9 | @app.agent(topic) 10 | async def start_reading(records): 11 | async for record in records: 12 | print(record) 13 | 14 | 15 | if __name__ == '__main__': 16 | app.main() 17 | -------------------------------------------------------------------------------- /cohorts/2022/README.md: -------------------------------------------------------------------------------- 1 | 2 | ### 2022 Cohort 3 | 4 | * **Start**: 17 January 2022 5 | * **Registration link**: https://airtable.com/shr6oVXeQvSI5HuWD 6 | * [Leaderboard](https://docs.google.com/spreadsheets/d/e/2PACX-1vR9oQiYnAVvzL4dagnhvp0sngqagF0AceD0FGjhS-dnzMTBzNQIal3-hOgkTibVQvfuqbQ69b0fvRnf/pubhtml) 7 | * Subscribe to our [public Google Calendar](https://calendar.google.com/calendar/?cid=ZXIxcjA1M3ZlYjJpcXU0dTFmaG02MzVxMG9AZ3JvdXAuY2FsZW5kYXIuZ29vZ2xlLmNvbQ) (it works from Desktop only) 8 | -------------------------------------------------------------------------------- /cohorts/2022/week_2_data_ingestion/transfer_service/README.md: -------------------------------------------------------------------------------- 1 | ## Generate AWS Access key 2 | - Login in to AWS account 3 | - Search for IAM 4 | ![aws iam](../../images/aws/iam.png) 5 | - Click on `Manage access key` 6 | - Click on `Create New Access Key` 7 | - Download the csv, your access key and secret would be in that csv (Please note that once lost secret cannot be recovered) 8 | 9 | ## Transfer service 10 | https://console.cloud.google.com/transfer/cloud/jobs 11 | 12 | 13 | -------------------------------------------------------------------------------- /04-analytics-engineering/taxi_rides_ny/seeds/seeds_properties.yml: -------------------------------------------------------------------------------- 1 | version: 2 2 | 3 | seeds: 4 
| - name: taxi_zone_lookup 5 | description: > 6 | Taxi Zones roughly based on NYC Department of City Planning's Neighborhood 7 | Tabulation Areas (NTAs) and are meant to approximate neighborhoods, so you can see which 8 | neighborhood a passenger was picked up in, and which neighborhood they were dropped off in. 9 | Includes associated service_zone (EWR, Boro Zone, Yellow Zone) -------------------------------------------------------------------------------- /06-streaming/python/docker/spark/jupyterlab.Dockerfile: -------------------------------------------------------------------------------- 1 | FROM cluster-base 2 | 3 | # -- Layer: JupyterLab 4 | 5 | ARG spark_version=3.3.1 6 | ARG jupyterlab_version=3.6.1 7 | 8 | RUN apt-get update -y && \ 9 | apt-get install -y python3-pip && \ 10 | pip3 install wget pyspark==${spark_version} jupyterlab==${jupyterlab_version} 11 | 12 | # -- Runtime 13 | 14 | EXPOSE 8888 15 | WORKDIR ${SHARED_WORKSPACE} 16 | CMD jupyter lab --ip=0.0.0.0 --port=8888 --no-browser --allow-root --NotebookApp.token= 17 | -------------------------------------------------------------------------------- /06-streaming/java/kafka_examples/src/main/java/org/example/Secrets.java: -------------------------------------------------------------------------------- 1 | package org.example; 2 | 3 | public class Secrets { 4 | public static final String KAFKA_CLUSTER_KEY = "REPLACE_WITH_YOUR_KAFKA_CLUSTER_KEY"; 5 | public static final String KAFKA_CLUSTER_SECRET = "REPLACE_WITH_YOUR_KAFKA_CLUSTER_SECRET"; 6 | 7 | public static final String SCHEMA_REGISTRY_KEY = "REPLACE_WITH_SCHEMA_REGISTRY_KEY"; 8 | public static final String SCHEMA_REGISTRY_SECRET = "REPLACE_WITH_SCHEMA_REGISTRY_SECRET"; 9 | 10 | } 11 | -------------------------------------------------------------------------------- /06-streaming/java/kafka_examples/src/main/java/org/example/data/PickupLocation.java: -------------------------------------------------------------------------------- 1 | package org.example.data; 2 | 3 | import java.time.LocalDateTime; 4 | 5 | public class PickupLocation { 6 | public PickupLocation(long PULocationID, LocalDateTime tpep_pickup_datetime) { 7 | this.PULocationID = PULocationID; 8 | this.tpep_pickup_datetime = tpep_pickup_datetime; 9 | } 10 | 11 | public PickupLocation() { 12 | } 13 | 14 | public long PULocationID; 15 | public LocalDateTime tpep_pickup_datetime; 16 | } 17 | -------------------------------------------------------------------------------- /01-docker-terraform/2_docker_sql/docker-compose.yaml: -------------------------------------------------------------------------------- 1 | services: 2 | pgdatabase: 3 | image: postgres:13 4 | environment: 5 | - POSTGRES_USER=root 6 | - POSTGRES_PASSWORD=root 7 | - POSTGRES_DB=ny_taxi 8 | volumes: 9 | - "./ny_taxi_postgres_data:/var/lib/postgresql/data:rw" 10 | ports: 11 | - "5432:5432" 12 | pgadmin: 13 | image: dpage/pgadmin4 14 | environment: 15 | - PGADMIN_DEFAULT_EMAIL=admin@admin.com 16 | - PGADMIN_DEFAULT_PASSWORD=root 17 | ports: 18 | - "8080:80" 19 | -------------------------------------------------------------------------------- /04-analytics-engineering/taxi_rides_ny/macros/get_payment_type_description.sql: -------------------------------------------------------------------------------- 1 | {# 2 | This macro returns the description of the payment_type 3 | #} 4 | 5 | {% macro get_payment_type_description(payment_type) -%} 6 | 7 | case {{ dbt.safe_cast("payment_type", api.Column.translate_type("integer")) }} 8 | when 1 then 'Credit card' 9 
| when 2 then 'Cash' 10 | when 3 then 'No charge' 11 | when 4 then 'Dispute' 12 | when 5 then 'Unknown' 13 | when 6 then 'Voided trip' 14 | else 'EMPTY' 15 | end 16 | 17 | {%- endmacro %} -------------------------------------------------------------------------------- /06-streaming/python/streams-example/faust/stream_count_vendor_trips.py: -------------------------------------------------------------------------------- 1 | import faust 2 | from taxi_rides import TaxiRide 3 | 4 | 5 | app = faust.App('datatalksclub.stream.v2', broker='kafka://localhost:9092') 6 | topic = app.topic('datatalkclub.yellow_taxi_ride.json', value_type=TaxiRide) 7 | 8 | vendor_rides = app.Table('vendor_rides', default=int) 9 | 10 | 11 | @app.agent(topic) 12 | async def process(stream): 13 | async for event in stream.group_by(TaxiRide.vendorId): 14 | vendor_rides[event.vendorId] += 1 15 | 16 | if __name__ == '__main__': 17 | app.main() 18 | -------------------------------------------------------------------------------- /06-streaming/python/resources/schemas/taxi_ride_value.avsc: -------------------------------------------------------------------------------- 1 | { 2 | "namespace": "com.datatalksclub.taxi", 3 | "type": "record", 4 | "name": "RideRecord", 5 | "fields": [ 6 | { 7 | "name": "vendor_id", 8 | "type": "int" 9 | }, 10 | { 11 | "name": "passenger_count", 12 | "type": "int" 13 | }, 14 | { 15 | "name": "trip_distance", 16 | "type": "float" 17 | }, 18 | { 19 | "name": "payment_type", 20 | "type": "int" 21 | }, 22 | { 23 | "name": "total_amount", 24 | "type": "float" 25 | } 26 | ] 27 | } -------------------------------------------------------------------------------- /05-batch/code/download_data.sh: -------------------------------------------------------------------------------- 1 | 2 | set -e 3 | 4 | TAXI_TYPE=$1 # "yellow" 5 | YEAR=$2 # 2020 6 | 7 | URL_PREFIX="https://github.com/DataTalksClub/nyc-tlc-data/releases/download" 8 | 9 | for MONTH in {1..12}; do 10 | FMONTH=`printf "%02d" ${MONTH}` 11 | 12 | URL="${URL_PREFIX}/${TAXI_TYPE}/${TAXI_TYPE}_tripdata_${YEAR}-${FMONTH}.csv.gz" 13 | 14 | LOCAL_PREFIX="data/raw/${TAXI_TYPE}/${YEAR}/${FMONTH}" 15 | LOCAL_FILE="${TAXI_TYPE}_tripdata_${YEAR}_${FMONTH}.csv.gz" 16 | LOCAL_PATH="${LOCAL_PREFIX}/${LOCAL_FILE}" 17 | 18 | echo "downloading ${URL} to ${LOCAL_PATH}" 19 | mkdir -p ${LOCAL_PREFIX} 20 | wget ${URL} -O ${LOCAL_PATH} 21 | 22 | done 23 | -------------------------------------------------------------------------------- /06-streaming/python/avro_example/ride_record_key.py: -------------------------------------------------------------------------------- 1 | from typing import Dict 2 | 3 | 4 | class RideRecordKey: 5 | def __init__(self, vendor_id): 6 | self.vendor_id = vendor_id 7 | 8 | @classmethod 9 | def from_dict(cls, d: Dict): 10 | return cls(vendor_id=d['vendor_id']) 11 | 12 | def __repr__(self): 13 | return f'{self.__class__.__name__}: {self.__dict__}' 14 | 15 | 16 | def dict_to_ride_record_key(obj, ctx): 17 | if obj is None: 18 | return None 19 | 20 | return RideRecordKey.from_dict(obj) 21 | 22 | 23 | def ride_record_key_to_dict(ride_record_key: RideRecordKey, ctx): 24 | return ride_record_key.__dict__ 25 | -------------------------------------------------------------------------------- /06-streaming/python/streams-example/faust/windowing.py: -------------------------------------------------------------------------------- 1 | from datetime import timedelta 2 | import faust 3 | from taxi_rides import TaxiRide 4 | 5 | 6 | app = 
faust.App('datatalksclub.stream.v2', broker='kafka://localhost:9092') 7 | topic = app.topic('datatalkclub.yellow_taxi_ride.json', value_type=TaxiRide) 8 | 9 | vendor_rides = app.Table('vendor_rides_windowed', default=int).tumbling( 10 | timedelta(minutes=1), 11 | expires=timedelta(hours=1), 12 | ) 13 | 14 | 15 | @app.agent(topic) 16 | async def process(stream): 17 | async for event in stream.group_by(TaxiRide.vendorId): 18 | vendor_rides[event.vendorId] += 1 19 | 20 | 21 | if __name__ == '__main__': 22 | app.main() 23 | -------------------------------------------------------------------------------- /06-streaming/java/kafka_examples/src/main/java/org/example/data/VendorInfo.java: -------------------------------------------------------------------------------- 1 | package org.example.data; 2 | 3 | import java.time.LocalDateTime; 4 | 5 | public class VendorInfo { 6 | 7 | public VendorInfo(String vendorID, long PULocationID, LocalDateTime pickupTime, LocalDateTime lastDropoffTime) { 8 | VendorID = vendorID; 9 | this.PULocationID = PULocationID; 10 | this.pickupTime = pickupTime; 11 | this.lastDropoffTime = lastDropoffTime; 12 | } 13 | 14 | public VendorInfo() { 15 | } 16 | 17 | public String VendorID; 18 | public long PULocationID; 19 | public LocalDateTime pickupTime; 20 | public LocalDateTime lastDropoffTime; 21 | } 22 | -------------------------------------------------------------------------------- /cohorts/2023/week_6_stream_processing/spark-submit.sh: -------------------------------------------------------------------------------- 1 | # Submit Python code to SparkMaster 2 | 3 | if [ $# -lt 1 ] 4 | then 5 | echo "Usage: $0 [ executor-memory ]" 6 | echo "(specify memory in string format such as \"512M\" or \"2G\")" 7 | exit 1 8 | fi 9 | PYTHON_JOB=$1 10 | 11 | if [ -z $2 ] 12 | then 13 | EXEC_MEM="1G" 14 | else 15 | EXEC_MEM=$2 16 | fi 17 | spark-submit --master spark://localhost:7077 --num-executors 2 \ 18 | --executor-memory $EXEC_MEM --executor-cores 1 \ 19 | --packages org.apache.spark:spark-sql-kafka-0-10_2.12:3.3.1,org.apache.spark:spark-avro_2.12:3.3.1,org.apache.spark:spark-streaming-kafka-0-10_2.12:3.3.1 \ 20 | $PYTHON_JOB -------------------------------------------------------------------------------- /06-streaming/python/streams-example/pyspark/spark-submit.sh: -------------------------------------------------------------------------------- 1 | # Submit Python code to SparkMaster 2 | 3 | if [ $# -lt 1 ] 4 | then 5 | echo "Usage: $0 [ executor-memory ]" 6 | echo "(specify memory in string format such as \"512M\" or \"2G\")" 7 | exit 1 8 | fi 9 | PYTHON_JOB=$1 10 | 11 | if [ -z $2 ] 12 | then 13 | EXEC_MEM="1G" 14 | else 15 | EXEC_MEM=$2 16 | fi 17 | spark-submit --master spark://localhost:7077 --num-executors 2 \ 18 | --executor-memory $EXEC_MEM --executor-cores 1 \ 19 | --packages org.apache.spark:spark-sql-kafka-0-10_2.12:3.3.1,org.apache.spark:spark-avro_2.12:3.3.1,org.apache.spark:spark-streaming-kafka-0-10_2.12:3.3.1 \ 20 | $PYTHON_JOB -------------------------------------------------------------------------------- /06-streaming/python/streams-example/redpanda/spark-submit.sh: -------------------------------------------------------------------------------- 1 | # Submit Python code to SparkMaster 2 | 3 | if [ $# -lt 1 ] 4 | then 5 | echo "Usage: $0 [ executor-memory ]" 6 | echo "(specify memory in string format such as \"512M\" or \"2G\")" 7 | exit 1 8 | fi 9 | PYTHON_JOB=$1 10 | 11 | if [ -z $2 ] 12 | then 13 | EXEC_MEM="1G" 14 | else 15 | EXEC_MEM=$2 16 | fi 17 | 
spark-submit --master spark://localhost:7077 --num-executors 2 \ 18 | --executor-memory $EXEC_MEM --executor-cores 1 \ 19 | --packages org.apache.spark:spark-sql-kafka-0-10_2.12:3.5.1,org.apache.spark:spark-avro_2.12:3.5.1,org.apache.spark:spark-streaming-kafka-0-10_2.12:3.5.1 \ 20 | $PYTHON_JOB 21 | -------------------------------------------------------------------------------- /06-streaming/python/docker/spark/cluster-base.Dockerfile: -------------------------------------------------------------------------------- 1 | # Reference from offical Apache Spark repository Dockerfile for Kubernetes 2 | # https://github.com/apache/spark/blob/master/resource-managers/kubernetes/docker/src/main/dockerfiles/spark/Dockerfile 3 | ARG java_image_tag=17-jre 4 | FROM eclipse-temurin:${java_image_tag} 5 | 6 | # -- Layer: OS + Python 7 | 8 | ARG shared_workspace=/opt/workspace 9 | 10 | RUN mkdir -p ${shared_workspace} && \ 11 | apt-get update -y && \ 12 | apt-get install -y python3 && \ 13 | ln -s /usr/bin/python3 /usr/bin/python && \ 14 | rm -rf /var/lib/apt/lists/* 15 | 16 | ENV SHARED_WORKSPACE=${shared_workspace} 17 | 18 | # -- Runtime 19 | 20 | VOLUME ${shared_workspace} 21 | CMD ["bash"] -------------------------------------------------------------------------------- /02-workflow-orchestration/flows/05_gcp_setup.yaml: -------------------------------------------------------------------------------- 1 | id: 05_gcp_setup 2 | namespace: zoomcamp 3 | 4 | tasks: 5 | - id: create_gcs_bucket 6 | type: io.kestra.plugin.gcp.gcs.CreateBucket 7 | ifExists: SKIP 8 | storageClass: REGIONAL 9 | name: "{{kv('GCP_BUCKET_NAME')}}" # make sure it's globally unique! 10 | 11 | - id: create_bq_dataset 12 | type: io.kestra.plugin.gcp.bigquery.CreateDataset 13 | name: "{{kv('GCP_DATASET')}}" 14 | ifExists: SKIP 15 | 16 | pluginDefaults: 17 | - type: io.kestra.plugin.gcp 18 | values: 19 | serviceAccount: "{{kv('GCP_CREDS')}}" 20 | projectId: "{{kv('GCP_PROJECT_ID')}}" 21 | location: "{{kv('GCP_LOCATION')}}" 22 | bucket: "{{kv('GCP_BUCKET_NAME')}}" 23 | -------------------------------------------------------------------------------- /01-docker-terraform/1_terraform_gcp/README.md: -------------------------------------------------------------------------------- 1 | ## Local Setup for Terraform and GCP 2 | 3 | ### Pre-Requisites 4 | 1. Terraform client installation: https://www.terraform.io/downloads 5 | 2. Cloud Provider account: https://console.cloud.google.com/ 6 | 7 | ### Terraform Concepts 8 | [Terraform Overview](1_terraform_overview.md) 9 | 10 | ### GCP setup 11 | 12 | 1. [Setup for First-time](2_gcp_overview.md#initial-setup) 13 | * [Only for Windows](windows.md) - Steps 4 & 5 14 | 2. [IAM / Access specific to this course](2_gcp_overview.md#setup-for-access) 15 | 16 | ### Terraform Workshop for GCP Infra 17 | Your setup is ready! 18 | Now head to the [terraform](terraform) directory, and perform the execution steps to create your infrastructure. 
19 | -------------------------------------------------------------------------------- /01-docker-terraform/1_terraform_gcp/terraform/README.md: -------------------------------------------------------------------------------- 1 | ### Concepts 2 | * [Terraform_overview](../1_terraform_overview.md) 3 | * [Audio](https://drive.google.com/file/d/1IqMRDwJV-m0v9_le_i2HA_UbM_sIWgWx/view?usp=sharing) 4 | 5 | ### Execution 6 | 7 | ```shell 8 | # Refresh service-account's auth-token for this session 9 | gcloud auth application-default login 10 | 11 | # Initialize state file (.tfstate) 12 | terraform init 13 | 14 | # Check changes to new infra plan 15 | terraform plan -var="project=" 16 | ``` 17 | 18 | ```shell 19 | # Create new infra 20 | terraform apply -var="project=" 21 | ``` 22 | 23 | ```shell 24 | # Delete infra after your work, to avoid costs on any running services 25 | terraform destroy 26 | ``` 27 | -------------------------------------------------------------------------------- /05-batch/setup/config/core-site.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | fs.AbstractFileSystem.gs.impl 7 | com.google.cloud.hadoop.fs.gcs.GoogleHadoopFS 8 | 9 | 10 | fs.gs.impl 11 | com.google.cloud.hadoop.fs.gcs.GoogleHadoopFileSystem 12 | 13 | 14 | fs.gs.auth.service.account.json.keyfile 15 | /home/alexey/.google/credentials/google_credentials.json 16 | 17 | 18 | fs.gs.auth.service.account.enable 19 | true 20 | 21 | -------------------------------------------------------------------------------- /06-streaming/python/streams-example/pyspark/settings.py: -------------------------------------------------------------------------------- 1 | import pyspark.sql.types as T 2 | 3 | INPUT_DATA_PATH = '../../resources/rides.csv' 4 | BOOTSTRAP_SERVERS = 'localhost:9092' 5 | 6 | TOPIC_WINDOWED_VENDOR_ID_COUNT = 'vendor_counts_windowed' 7 | 8 | PRODUCE_TOPIC_RIDES_CSV = CONSUME_TOPIC_RIDES_CSV = 'rides_csv' 9 | 10 | RIDE_SCHEMA = T.StructType( 11 | [T.StructField("vendor_id", T.IntegerType()), 12 | T.StructField('tpep_pickup_datetime', T.TimestampType()), 13 | T.StructField('tpep_dropoff_datetime', T.TimestampType()), 14 | T.StructField("passenger_count", T.IntegerType()), 15 | T.StructField("trip_distance", T.FloatType()), 16 | T.StructField("payment_type", T.IntegerType()), 17 | T.StructField("total_amount", T.FloatType()), 18 | ]) 19 | -------------------------------------------------------------------------------- /06-streaming/python/streams-example/redpanda/settings.py: -------------------------------------------------------------------------------- 1 | import pyspark.sql.types as T 2 | 3 | INPUT_DATA_PATH = '../../resources/rides.csv' 4 | BOOTSTRAP_SERVERS = 'localhost:9092' 5 | 6 | TOPIC_WINDOWED_VENDOR_ID_COUNT = 'vendor_counts_windowed' 7 | 8 | PRODUCE_TOPIC_RIDES_CSV = CONSUME_TOPIC_RIDES_CSV = 'rides_csv' 9 | 10 | RIDE_SCHEMA = T.StructType( 11 | [T.StructField("vendor_id", T.IntegerType()), 12 | T.StructField('tpep_pickup_datetime', T.TimestampType()), 13 | T.StructField('tpep_dropoff_datetime', T.TimestampType()), 14 | T.StructField("passenger_count", T.IntegerType()), 15 | T.StructField("trip_distance", T.FloatType()), 16 | T.StructField("payment_type", T.IntegerType()), 17 | T.StructField("total_amount", T.FloatType()), 18 | ]) 19 | -------------------------------------------------------------------------------- /06-streaming/python/docker/spark/build.sh: -------------------------------------------------------------------------------- 1 | # -- 
Software Stack Version 2 | 3 | SPARK_VERSION="3.3.1" 4 | HADOOP_VERSION="3" 5 | JUPYTERLAB_VERSION="3.6.1" 6 | 7 | # -- Building the Images 8 | 9 | docker build \ 10 | -f cluster-base.Dockerfile \ 11 | -t cluster-base . 12 | 13 | docker build \ 14 | --build-arg spark_version="${SPARK_VERSION}" \ 15 | --build-arg hadoop_version="${HADOOP_VERSION}" \ 16 | -f spark-base.Dockerfile \ 17 | -t spark-base . 18 | 19 | docker build \ 20 | -f spark-master.Dockerfile \ 21 | -t spark-master . 22 | 23 | docker build \ 24 | -f spark-worker.Dockerfile \ 25 | -t spark-worker . 26 | 27 | docker build \ 28 | --build-arg spark_version="${SPARK_VERSION}" \ 29 | --build-arg jupyterlab_version="${JUPYTERLAB_VERSION}" \ 30 | -f jupyterlab.Dockerfile \ 31 | -t jupyterlab . 32 | -------------------------------------------------------------------------------- /06-streaming/python/docker/spark/spark-base.Dockerfile: -------------------------------------------------------------------------------- 1 | FROM cluster-base 2 | 3 | # -- Layer: Apache Spark 4 | 5 | ARG spark_version=3.3.1 6 | ARG hadoop_version=3 7 | 8 | RUN apt-get update -y && \ 9 | apt-get install -y curl && \ 10 | curl https://archive.apache.org/dist/spark/spark-${spark_version}/spark-${spark_version}-bin-hadoop${hadoop_version}.tgz -o spark.tgz && \ 11 | tar -xf spark.tgz && \ 12 | mv spark-${spark_version}-bin-hadoop${hadoop_version} /usr/bin/ && \ 13 | mkdir /usr/bin/spark-${spark_version}-bin-hadoop${hadoop_version}/logs && \ 14 | rm spark.tgz 15 | 16 | ENV SPARK_HOME /usr/bin/spark-${spark_version}-bin-hadoop${hadoop_version} 17 | ENV SPARK_MASTER_HOST spark-master 18 | ENV SPARK_MASTER_PORT 7077 19 | ENV PYSPARK_PYTHON python3 20 | 21 | # -- Runtime 22 | 23 | WORKDIR ${SPARK_HOME} -------------------------------------------------------------------------------- /06-streaming/python/streams-example/faust/branch_price.py: -------------------------------------------------------------------------------- 1 | import faust 2 | from taxi_rides import TaxiRide 3 | from faust import current_event 4 | 5 | app = faust.App('datatalksclub.stream.v3', broker='kafka://localhost:9092', consumer_auto_offset_reset="earliest") 6 | topic = app.topic('datatalkclub.yellow_taxi_ride.json', value_type=TaxiRide) 7 | 8 | high_amount_rides = app.topic('datatalks.yellow_taxi_rides.high_amount') 9 | low_amount_rides = app.topic('datatalks.yellow_taxi_rides.low_amount') 10 | 11 | 12 | @app.agent(topic) 13 | async def process(stream): 14 | async for event in stream: 15 | if event.total_amount >= 40.0: 16 | await current_event().forward(high_amount_rides) 17 | else: 18 | await current_event().forward(low_amount_rides) 19 | 20 | if __name__ == '__main__': 21 | app.main() 22 | -------------------------------------------------------------------------------- /01-docker-terraform/1_terraform_gcp/terraform/terraform_with_variables/main.tf: -------------------------------------------------------------------------------- 1 | terraform { 2 | required_providers { 3 | google = { 4 | source = "hashicorp/google" 5 | version = "5.6.0" 6 | } 7 | } 8 | } 9 | 10 | provider "google" { 11 | credentials = file(var.credentials) 12 | project = var.project 13 | region = var.region 14 | } 15 | 16 | 17 | resource "google_storage_bucket" "demo-bucket" { 18 | name = var.gcs_bucket_name 19 | location = var.location 20 | force_destroy = true 21 | 22 | 23 | lifecycle_rule { 24 | condition { 25 | age = 1 26 | } 27 | action { 28 | type = "AbortIncompleteMultipartUpload" 29 | } 30 | } 31 | } 32 | 33 | 34 | 35 | 
resource "google_bigquery_dataset" "demo_dataset" { 36 | dataset_id = var.bq_dataset_name 37 | location = var.location 38 | } -------------------------------------------------------------------------------- /06-streaming/python/streams-example/faust/producer_taxi_json.py: -------------------------------------------------------------------------------- 1 | import csv 2 | from json import dumps 3 | from kafka import KafkaProducer 4 | from time import sleep 5 | 6 | 7 | producer = KafkaProducer(bootstrap_servers=['localhost:9092'], 8 | key_serializer=lambda x: dumps(x).encode('utf-8'), 9 | value_serializer=lambda x: dumps(x).encode('utf-8')) 10 | 11 | file = open('../../resources/rides.csv') 12 | 13 | csvreader = csv.reader(file) 14 | header = next(csvreader) 15 | for row in csvreader: 16 | key = {"vendorId": int(row[0])} 17 | value = {"vendorId": int(row[0]), "passenger_count": int(row[3]), "trip_distance": float(row[4]), "payment_type": int(row[9]), "total_amount": float(row[16])} 18 | producer.send('datatalkclub.yellow_taxi_ride.json', value=value, key=key) 19 | print("producing") 20 | sleep(1) -------------------------------------------------------------------------------- /03-data-warehouse/extract_model.md: -------------------------------------------------------------------------------- 1 | ## Model deployment 2 | [Tutorial](https://cloud.google.com/bigquery-ml/docs/export-model-tutorial) 3 | ### Steps 4 | - gcloud auth login 5 | - bq --project_id taxi-rides-ny extract -m nytaxi.tip_model gs://taxi_ml_model/tip_model 6 | - mkdir /tmp/model 7 | - gsutil cp -r gs://taxi_ml_model/tip_model /tmp/model 8 | - mkdir -p serving_dir/tip_model/1 9 | - cp -r /tmp/model/tip_model/* serving_dir/tip_model/1 10 | - docker pull tensorflow/serving 11 | - docker run -p 8501:8501 --mount type=bind,source=`pwd`/serving_dir/tip_model,target= 12 | /models/tip_model -e MODEL_NAME=tip_model -t tensorflow/serving & 13 | - curl -d '{"instances": [{"passenger_count":1, "trip_distance":12.2, "PULocationID":"193", "DOLocationID":"264", "payment_type":"2","fare_amount":20.4,"tolls_amount":0.0}]}' -X POST http://localhost:8501/v1/models/tip_model:predict 14 | - http://localhost:8501/v1/models/tip_model -------------------------------------------------------------------------------- /after-sign-up.md: -------------------------------------------------------------------------------- 1 | ## Thank you! 2 | 3 | Thanks for signing up for the course. 4 | 5 | The process of adding you to the mailing list is not automated yet, 6 | but you will hear from us closer to the course start. 7 | 8 | To make sure you don't miss any announcements 9 | 10 | - Register in [DataTalks.Club's Slack](https://datatalks.club/slack.html) and 11 | join the [`#course-data-engineering`](https://app.slack.com/client/T01ATQK62F8/C01FABYF2RG) channel 12 | - Join the [course Telegram channel with announcements](https://t.me/dezoomcamp) 13 | - Subscribe to [DataTalks.Club's YouTube channel](https://www.youtube.com/c/DataTalksClub) and check 14 | [the course playlist](https://www.youtube.com/playlist?list=PL3MmuxUbc_hJed7dXYoJw8DoCuVHhGEQb) 15 | - Subscribe to our [public Google Calendar](https://calendar.google.com/calendar/?cid=ZXIxcjA1M3ZlYjJpcXU0dTFmaG02MzVxMG9AZ3JvdXAuY2FsZW5kYXIuZ29vZ2xlLmNvbQ) (it works from Desktop only) 16 | 17 | See you in January! 
18 | -------------------------------------------------------------------------------- /cohorts/2023/week_1_terraform/homework.md: -------------------------------------------------------------------------------- 1 | ## Week 1 Homework 2 | 3 | In this homework we'll prepare the environment by creating resources in GCP with Terraform. 4 | 5 | In your VM on GCP install Terraform. Copy the files from the course repo 6 | [here](https://github.com/DataTalksClub/data-engineering-zoomcamp/tree/main/week_1_basics_n_setup/1_terraform_gcp/terraform) to your VM. 7 | 8 | Modify the files as necessary to create a GCP Bucket and Big Query Dataset. 9 | 10 | 11 | ## Question 1. Creating Resources 12 | 13 | After updating the main.tf and variable.tf files run: 14 | 15 | ``` 16 | terraform apply 17 | ``` 18 | 19 | Paste the output of this command into the homework submission form. 20 | 21 | 22 | ## Submitting the solutions 23 | 24 | * Form for submitting: [form](https://forms.gle/S57Xs3HL9nB3YTzj9) 25 | * You can submit your homework multiple times. In this case, only the last submission will be used. 26 | 27 | Deadline: 30 January (Monday), 22:00 CET 28 | 29 | -------------------------------------------------------------------------------- /02-workflow-orchestration/flows/04_gcp_kv.yaml: -------------------------------------------------------------------------------- 1 | id: 04_gcp_kv 2 | namespace: zoomcamp 3 | 4 | tasks: 5 | - id: gcp_creds 6 | type: io.kestra.plugin.core.kv.Set 7 | key: GCP_CREDS 8 | kvType: JSON 9 | value: | 10 | { 11 | "type": "service_account", 12 | "project_id": "...", 13 | } 14 | 15 | - id: gcp_project_id 16 | type: io.kestra.plugin.core.kv.Set 17 | key: GCP_PROJECT_ID 18 | kvType: STRING 19 | value: kestra-sandbox # TODO replace with your project id 20 | 21 | - id: gcp_location 22 | type: io.kestra.plugin.core.kv.Set 23 | key: GCP_LOCATION 24 | kvType: STRING 25 | value: europe-west2 26 | 27 | - id: gcp_bucket_name 28 | type: io.kestra.plugin.core.kv.Set 29 | key: GCP_BUCKET_NAME 30 | kvType: STRING 31 | value: your-name-kestra # TODO make sure it's globally unique! 
32 | 33 | - id: gcp_dataset 34 | type: io.kestra.plugin.core.kv.Set 35 | key: GCP_DATASET 36 | kvType: STRING 37 | value: zoomcamp -------------------------------------------------------------------------------- /06-streaming/java/kafka_examples/src/test/java/org/example/helper/DataGeneratorHelper.java: -------------------------------------------------------------------------------- 1 | package org.example.helper; 2 | 3 | import org.example.data.PickupLocation; 4 | import org.example.data.Ride; 5 | import org.example.data.VendorInfo; 6 | 7 | import java.time.LocalDateTime; 8 | import java.time.format.DateTimeFormatter; 9 | import java.util.List; 10 | 11 | public class DataGeneratorHelper { 12 | public static Ride generateRide() { 13 | var arrivalTime = LocalDateTime.now().format(DateTimeFormatter.ofPattern("yyyy-MM-dd HH:mm:ss")); 14 | var departureTime = LocalDateTime.now().minusMinutes(30).format(DateTimeFormatter.ofPattern("yyyy-MM-dd HH:mm:ss")); 15 | return new Ride(new String[]{"1", departureTime, arrivalTime,"1","1.50","1","N","238","75","2","8","0.5","0.5","0","0","0.3","9.3","0"}); 16 | } 17 | 18 | public static PickupLocation generatePickUpLocation(long pickupLocationId) { 19 | return new PickupLocation(pickupLocationId, LocalDateTime.now()); 20 | } 21 | } 22 | -------------------------------------------------------------------------------- /cohorts/2023/week_6_stream_processing/settings.py: -------------------------------------------------------------------------------- 1 | import pyspark.sql.types as T 2 | 3 | GREEN_TRIP_DATA_PATH = './resources/green_tripdata/green_tripdata_2019-01.csv' 4 | FHV_TRIP_DATA_PATH = './resources/fhv_tripdata/fhv_tripdata_2019-01.csv' 5 | BOOTSTRAP_SERVERS = 'localhost:9092' 6 | 7 | RIDES_TOPIC = 'all_rides' 8 | FHV_TAXI_TOPIC = 'fhv_taxi_rides' 9 | GREEN_TAXI_TOPIC = 'green_taxi_rides' 10 | 11 | ALL_RIDE_SCHEMA = T.StructType( 12 | [T.StructField("PUlocationID", T.StringType()), 13 | T.StructField("DOlocationID", T.StringType()), 14 | ]) 15 | 16 | 17 | def read_ccloud_config(config_file): 18 | conf = {} 19 | with open(config_file) as fh: 20 | for line in fh: 21 | line = line.strip() 22 | if len(line) != 0 and line[0] != "#": 23 | parameter, value = line.strip().split('=', 1) 24 | conf[parameter] = value.strip() 25 | return conf 26 | 27 | 28 | CONFLUENT_CLOUD_CONFIG = read_ccloud_config('client_original.properties') 29 | -------------------------------------------------------------------------------- /06-streaming/python/avro_example/ride_record.py: -------------------------------------------------------------------------------- 1 | from typing import List, Dict 2 | 3 | 4 | class RideRecord: 5 | 6 | def __init__(self, arr: List[str]): 7 | self.vendor_id = int(arr[0]) 8 | self.passenger_count = int(arr[1]) 9 | self.trip_distance = float(arr[2]) 10 | self.payment_type = int(arr[3]) 11 | self.total_amount = float(arr[4]) 12 | 13 | @classmethod 14 | def from_dict(cls, d: Dict): 15 | return cls(arr=[ 16 | d['vendor_id'], 17 | d['passenger_count'], 18 | d['trip_distance'], 19 | d['payment_type'], 20 | d['total_amount'] 21 | ] 22 | ) 23 | 24 | def __repr__(self): 25 | return f'{self.__class__.__name__}: {self.__dict__}' 26 | 27 | 28 | def dict_to_ride_record(obj, ctx): 29 | if obj is None: 30 | return None 31 | 32 | return RideRecord.from_dict(obj) 33 | 34 | 35 | def ride_record_to_dict(ride_record: RideRecord, ctx): 36 | return ride_record.__dict__ 37 | -------------------------------------------------------------------------------- 
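Note: the `avro_example` module's own `producer.py` and `consumer.py` are not reproduced in this excerpt. The following is a rough, non-authoritative sketch of how the `RideRecord` / `RideRecordKey` helpers above are typically wired into `confluent-kafka`'s Schema Registry serializers, reusing the constants from `avro_example/settings.py` shown earlier; the hard-coded sample record is illustrative only.

```python
# Minimal sketch (not the repo's actual producer.py): produce one Avro-encoded
# ride record, reusing the helper functions and settings shown above.
from confluent_kafka import SerializingProducer
from confluent_kafka.schema_registry import SchemaRegistryClient
from confluent_kafka.schema_registry.avro import AvroSerializer

from ride_record import RideRecord, ride_record_to_dict
from ride_record_key import RideRecordKey, ride_record_key_to_dict
from settings import (BOOTSTRAP_SERVERS, KAFKA_TOPIC, SCHEMA_REGISTRY_URL,
                      RIDE_KEY_SCHEMA_PATH, RIDE_VALUE_SCHEMA_PATH)

schema_registry_client = SchemaRegistryClient({'url': SCHEMA_REGISTRY_URL})

with open(RIDE_KEY_SCHEMA_PATH) as f:
    key_schema_str = f.read()
with open(RIDE_VALUE_SCHEMA_PATH) as f:
    value_schema_str = f.read()

producer = SerializingProducer({
    'bootstrap.servers': BOOTSTRAP_SERVERS,
    # the to_dict callbacks match the (obj, ctx) signature used in ride_record.py
    'key.serializer': AvroSerializer(schema_registry_client, key_schema_str, ride_record_key_to_dict),
    'value.serializer': AvroSerializer(schema_registry_client, value_schema_str, ride_record_to_dict),
})

# Illustrative record: vendor_id, passenger_count, trip_distance, payment_type, total_amount
ride = RideRecord(arr=['1', '2', '3.5', '1', '15.0'])
producer.produce(topic=KAFKA_TOPIC,
                 key=RideRecordKey(vendor_id=ride.vendor_id),
                 value=ride)
producer.flush()
```

A consumer would typically mirror this with a `DeserializingConsumer` and the `dict_to_ride_record` / `dict_to_ride_record_key` callbacks defined above.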
/cohorts/2024/06-streaming/docker-compose.yml: -------------------------------------------------------------------------------- 1 | version: '3.7' 2 | services: 3 | # Redpanda cluster 4 | redpanda-1: 5 | image: docker.redpanda.com/vectorized/redpanda:v22.3.5 6 | container_name: redpanda-1 7 | command: 8 | - redpanda 9 | - start 10 | - --smp 11 | - '1' 12 | - --reserve-memory 13 | - 0M 14 | - --overprovisioned 15 | - --node-id 16 | - '1' 17 | - --kafka-addr 18 | - PLAINTEXT://0.0.0.0:29092,OUTSIDE://0.0.0.0:9092 19 | - --advertise-kafka-addr 20 | - PLAINTEXT://redpanda-1:29092,OUTSIDE://localhost:9092 21 | - --pandaproxy-addr 22 | - PLAINTEXT://0.0.0.0:28082,OUTSIDE://0.0.0.0:8082 23 | - --advertise-pandaproxy-addr 24 | - PLAINTEXT://redpanda-1:28082,OUTSIDE://localhost:8082 25 | - --rpc-addr 26 | - 0.0.0.0:33145 27 | - --advertise-rpc-addr 28 | - redpanda-1:33145 29 | ports: 30 | # - 8081:8081 31 | - 8082:8082 32 | - 9092:9092 33 | - 28082:28082 34 | - 29092:29092 -------------------------------------------------------------------------------- /04-analytics-engineering/taxi_rides_ny/models/core/dm_monthly_zone_revenue.sql: -------------------------------------------------------------------------------- 1 | {{ config(materialized='table') }} 2 | 3 | with trips_data as ( 4 | select * from {{ ref('fact_trips') }} 5 | ) 6 | select 7 | -- Reveneue grouping 8 | pickup_zone as revenue_zone, 9 | {{ dbt.date_trunc("month", "pickup_datetime") }} as revenue_month, 10 | 11 | service_type, 12 | 13 | -- Revenue calculation 14 | sum(fare_amount) as revenue_monthly_fare, 15 | sum(extra) as revenue_monthly_extra, 16 | sum(mta_tax) as revenue_monthly_mta_tax, 17 | sum(tip_amount) as revenue_monthly_tip_amount, 18 | sum(tolls_amount) as revenue_monthly_tolls_amount, 19 | sum(ehail_fee) as revenue_monthly_ehail_fee, 20 | sum(improvement_surcharge) as revenue_monthly_improvement_surcharge, 21 | sum(total_amount) as revenue_monthly_total_amount, 22 | 23 | -- Additional calculations 24 | count(tripid) as total_monthly_trips, 25 | avg(passenger_count) as avg_monthly_passenger_count, 26 | avg(trip_distance) as avg_monthly_trip_distance 27 | 28 | from trips_data 29 | group by 1,2,3 -------------------------------------------------------------------------------- /cohorts/2022/week_2_data_ingestion/airflow/.env_example: -------------------------------------------------------------------------------- 1 | # Custom 2 | COMPOSE_PROJECT_NAME=dtc-de 3 | GOOGLE_APPLICATION_CREDENTIALS=/.google/credentials/google_credentials.json 4 | AIRFLOW_CONN_GOOGLE_CLOUD_DEFAULT=google-cloud-platform://?extra__google_cloud_platform__key_path=/.google/credentials/google_credentials.json 5 | # AIRFLOW_UID= 6 | GCP_PROJECT_ID= 7 | GCP_GCS_BUCKET= 8 | 9 | # Postgres 10 | POSTGRES_USER=airflow 11 | POSTGRES_PASSWORD=airflow 12 | POSTGRES_DB=airflow 13 | 14 | # Airflow 15 | AIRFLOW__CORE__EXECUTOR=LocalExecutor 16 | AIRFLOW__SCHEDULER__SCHEDULER_HEARTBEAT_SEC=10 17 | 18 | AIRFLOW__CORE__SQL_ALCHEMY_CONN=postgresql+psycopg2://${POSTGRES_USER}:${POSTGRES_PASSWORD}@postgres:5432/${POSTGRES_DB} 19 | AIRFLOW_CONN_METADATA_DB=postgres+psycopg2://airflow:airflow@postgres:5432/airflow 20 | AIRFLOW_VAR__METADATA_DB_SCHEMA=airflow 21 | 22 | _AIRFLOW_WWW_USER_CREATE=True 23 | _AIRFLOW_WWW_USER_USERNAME=${_AIRFLOW_WWW_USER_USERNAME:airflow} 24 | _AIRFLOW_WWW_USER_PASSWORD=${_AIRFLOW_WWW_USER_PASSWORD:airflow} 25 | 26 | AIRFLOW__CORE__DAGS_ARE_PAUSED_AT_CREATION=True 27 | AIRFLOW__CORE__LOAD_EXAMPLES=False 28 | 
-------------------------------------------------------------------------------- /cohorts/2022/week_3_data_warehouse/airflow/.env_example: -------------------------------------------------------------------------------- 1 | # Custom 2 | COMPOSE_PROJECT_NAME=dtc-de 3 | GOOGLE_APPLICATION_CREDENTIALS=/.google/credentials/google_credentials.json 4 | AIRFLOW_CONN_GOOGLE_CLOUD_DEFAULT=google-cloud-platform://?extra__google_cloud_platform__key_path=/.google/credentials/google_credentials.json 5 | # AIRFLOW_UID= 6 | GCP_PROJECT_ID= 7 | GCP_GCS_BUCKET= 8 | 9 | # Postgres 10 | POSTGRES_USER=airflow 11 | POSTGRES_PASSWORD=airflow 12 | POSTGRES_DB=airflow 13 | 14 | # Airflow 15 | AIRFLOW__CORE__EXECUTOR=LocalExecutor 16 | AIRFLOW__SCHEDULER__SCHEDULER_HEARTBEAT_SEC=10 17 | 18 | AIRFLOW__CORE__SQL_ALCHEMY_CONN=postgresql+psycopg2://${POSTGRES_USER}:${POSTGRES_PASSWORD}@postgres:5432/${POSTGRES_DB} 19 | AIRFLOW_CONN_METADATA_DB=postgres+psycopg2://airflow:airflow@postgres:5432/airflow 20 | AIRFLOW_VAR__METADATA_DB_SCHEMA=airflow 21 | 22 | _AIRFLOW_WWW_USER_CREATE=True 23 | _AIRFLOW_WWW_USER_USERNAME=${_AIRFLOW_WWW_USER_USERNAME:airflow} 24 | _AIRFLOW_WWW_USER_PASSWORD=${_AIRFLOW_WWW_USER_PASSWORD:airflow} 25 | 26 | AIRFLOW__CORE__DAGS_ARE_PAUSED_AT_CREATION=True 27 | AIRFLOW__CORE__LOAD_EXAMPLES=False 28 | -------------------------------------------------------------------------------- /01-docker-terraform/1_terraform_gcp/terraform/terraform_basic/main.tf: -------------------------------------------------------------------------------- 1 | terraform { 2 | required_providers { 3 | google = { 4 | source = "hashicorp/google" 5 | version = "4.51.0" 6 | } 7 | } 8 | } 9 | 10 | provider "google" { 11 | # Credentials only needs to be set if you do not have the GOOGLE_APPLICATION_CREDENTIALS set 12 | # credentials = 13 | project = "" 14 | region = "us-central1" 15 | } 16 | 17 | 18 | 19 | resource "google_storage_bucket" "data-lake-bucket" { 20 | name = "" 21 | location = "US" 22 | 23 | # Optional, but recommended settings: 24 | storage_class = "STANDARD" 25 | uniform_bucket_level_access = true 26 | 27 | versioning { 28 | enabled = true 29 | } 30 | 31 | lifecycle_rule { 32 | action { 33 | type = "Delete" 34 | } 35 | condition { 36 | age = 30 // days 37 | } 38 | } 39 | 40 | force_destroy = true 41 | } 42 | 43 | 44 | resource "google_bigquery_dataset" "dataset" { 45 | dataset_id = "" 46 | project = "" 47 | location = "US" 48 | } -------------------------------------------------------------------------------- /06-streaming/python/README.md: -------------------------------------------------------------------------------- 1 | ### Stream-Processing with Python 2 | 3 | In this document, you will be finding information about stream processing 4 | using different Python libraries (`kafka-python`,`confluent-kafka`,`pyspark`, `faust`). 5 | 6 | This Python module can be separated in following modules. 7 | 8 | #### 1. Docker 9 | Docker module includes, Dockerfiles and docker-compose definitions 10 | to run Kafka and Spark in a docker container. Setting up required services is 11 | the prerequsite step for running following modules. 12 | 13 | #### 2. 
Kafka Producer - Consumer Examples 14 | - [Json Producer-Consumer Example](json_example) using `kafka-python` library 15 | - [Avro Producer-Consumer Example](avro_example) using `confluent-kafka` library 16 | 17 | Both of these examples require, up-and running Kafka services, therefore please ensure 18 | following steps under [docker-README](docker/README.md) 19 | 20 | To run the producer-consumer examples in the respective example folder, run following commands 21 | ```bash 22 | # Start producer script 23 | python3 producer.py 24 | # Start consumer script 25 | python3 consumer.py 26 | ``` 27 | 28 | 29 | 30 | 31 | 32 | -------------------------------------------------------------------------------- /03-data-warehouse/big_query_hw.sql: -------------------------------------------------------------------------------- 1 | CREATE OR REPLACE EXTERNAL TABLE `taxi-rides-ny.nytaxi.fhv_tripdata` 2 | OPTIONS ( 3 | format = 'CSV', 4 | uris = ['gs://nyc-tl-data/trip data/fhv_tripdata_2019-*.csv'] 5 | ); 6 | 7 | 8 | SELECT count(*) FROM `taxi-rides-ny.nytaxi.fhv_tripdata`; 9 | 10 | 11 | SELECT COUNT(DISTINCT(dispatching_base_num)) FROM `taxi-rides-ny.nytaxi.fhv_tripdata`; 12 | 13 | 14 | CREATE OR REPLACE TABLE `taxi-rides-ny.nytaxi.fhv_nonpartitioned_tripdata` 15 | AS SELECT * FROM `taxi-rides-ny.nytaxi.fhv_tripdata`; 16 | 17 | CREATE OR REPLACE TABLE `taxi-rides-ny.nytaxi.fhv_partitioned_tripdata` 18 | PARTITION BY DATE(dropoff_datetime) 19 | CLUSTER BY dispatching_base_num AS ( 20 | SELECT * FROM `taxi-rides-ny.nytaxi.fhv_tripdata` 21 | ); 22 | 23 | SELECT count(*) FROM `taxi-rides-ny.nytaxi.fhv_nonpartitioned_tripdata` 24 | WHERE DATE(dropoff_datetime) BETWEEN '2019-01-01' AND '2019-03-31' 25 | AND dispatching_base_num IN ('B00987', 'B02279', 'B02060'); 26 | 27 | 28 | SELECT count(*) FROM `taxi-rides-ny.nytaxi.fhv_partitioned_tripdata` 29 | WHERE DATE(dropoff_datetime) BETWEEN '2019-01-01' AND '2019-03-31' 30 | AND dispatching_base_num IN ('B00987', 'B02279', 'B02060'); 31 | -------------------------------------------------------------------------------- /06-streaming/java/kafka_examples/build.gradle: -------------------------------------------------------------------------------- 1 | plugins { 2 | id 'java' 3 | id "com.github.davidmc24.gradle.plugin.avro" version "1.5.0" 4 | } 5 | 6 | 7 | group 'org.example' 8 | version '1.0-SNAPSHOT' 9 | 10 | repositories { 11 | mavenCentral() 12 | maven { 13 | url "https://packages.confluent.io/maven" 14 | } 15 | } 16 | 17 | dependencies { 18 | implementation 'org.apache.kafka:kafka-clients:3.3.1' 19 | implementation 'com.opencsv:opencsv:5.7.1' 20 | implementation 'io.confluent:kafka-json-serializer:7.3.1' 21 | implementation 'org.apache.kafka:kafka-streams:3.3.1' 22 | implementation 'io.confluent:kafka-avro-serializer:7.3.1' 23 | implementation 'io.confluent:kafka-schema-registry-client:7.3.1' 24 | implementation 'io.confluent:kafka-streams-avro-serde:7.3.1' 25 | implementation "org.apache.avro:avro:1.11.0" 26 | testImplementation 'org.junit.jupiter:junit-jupiter-api:5.8.1' 27 | testRuntimeOnly 'org.junit.jupiter:junit-jupiter-engine:5.8.1' 28 | testImplementation 'org.apache.kafka:kafka-streams-test-utils:3.3.1' 29 | } 30 | 31 | sourceSets.main.java.srcDirs = ['build/generated-main-avro-java','src/main/java'] 32 | 33 | test { 34 | useJUnitPlatform() 35 | } 36 | 37 | -------------------------------------------------------------------------------- /06-streaming/ksqldb/commands.md: -------------------------------------------------------------------------------- 1 | 
## KSQL DB Examples 2 | ### Create streams 3 | ```sql 4 | CREATE STREAM ride_streams ( 5 | VendorId varchar, 6 | trip_distance double, 7 | payment_type varchar 8 | ) WITH (KAFKA_TOPIC='rides', 9 | VALUE_FORMAT='JSON'); 10 | ``` 11 | 12 | ### Query stream 13 | ```sql 14 | select * from RIDE_STREAMS 15 | EMIT CHANGES; 16 | ``` 17 | 18 | ### Query stream count 19 | ```sql 20 | SELECT VENDORID, count(*) FROM RIDE_STREAMS 21 | GROUP BY VENDORID 22 | EMIT CHANGES; 23 | ``` 24 | 25 | ### Query stream with filters 26 | ```sql 27 | SELECT payment_type, count(*) FROM RIDE_STREAMS 28 | WHERE payment_type IN ('1', '2') 29 | GROUP BY payment_type 30 | EMIT CHANGES; 31 | ``` 32 | 33 | ### Query stream with window functions 34 | ```sql 35 | CREATE TABLE payment_type_sessions AS 36 | SELECT payment_type, 37 | count(*) 38 | FROM RIDE_STREAMS 39 | WINDOW SESSION (60 SECONDS) 40 | GROUP BY payment_type 41 | EMIT CHANGES; 42 | ``` 43 | 44 | ## KSQL documentation for details 45 | [KSQL DB Documentation](https://docs.ksqldb.io/en/latest/developer-guide/ksqldb-reference/quick-reference/) 46 | 47 | [KSQL DB Java client](https://docs.ksqldb.io/en/latest/developer-guide/ksqldb-clients/java-client/) -------------------------------------------------------------------------------- /01-docker-terraform/1_terraform_gcp/terraform/terraform_with_variables/variables.tf: -------------------------------------------------------------------------------- 1 | variable "credentials" { 2 | description = "My Credentials" 3 | default = "" 4 | #ex: if you have a directory where this file is called keys with your service account json file 5 | #saved there as my-creds.json you could use default = "./keys/my-creds.json" 6 | } 7 | 8 | 9 | variable "project" { 10 | description = "Project" 11 | default = "" 12 | } 13 | 14 | variable "region" { 15 | description = "Region" 16 | #Update the below to your desired region 17 | default = "us-central1" 18 | } 19 | 20 | variable "location" { 21 | description = "Project Location" 22 | #Update the below to your desired location 23 | default = "US" 24 | } 25 | 26 | variable "bq_dataset_name" { 27 | description = "My BigQuery Dataset Name" 28 | #Update the below to what you want your dataset to be called 29 | default = "demo_dataset" 30 | } 31 | 32 | variable "gcs_bucket_name" { 33 | description = "My Storage Bucket Name" 34 | #Update the below to a unique bucket name 35 | default = "terraform-demo-terra-bucket" 36 | } 37 | 38 | variable "gcs_storage_class" { 39 | description = "Bucket Storage Class" 40 | default = "STANDARD" 41 | } -------------------------------------------------------------------------------- /06-streaming/python/docker/spark/docker-compose.yml: -------------------------------------------------------------------------------- 1 | version: "3.6" 2 | volumes: 3 | shared-workspace: 4 | name: "hadoop-distributed-file-system" 5 | driver: local 6 | networks: 7 | default: 8 | name: kafka-spark-network 9 | external: true 10 | 11 | services: 12 | jupyterlab: 13 | image: jupyterlab 14 | container_name: jupyterlab 15 | ports: 16 | - 8888:8888 17 | volumes: 18 | - shared-workspace:/opt/workspace 19 | spark-master: 20 | image: spark-master 21 | container_name: spark-master 22 | environment: 23 | SPARK_LOCAL_IP: 'spark-master' 24 | ports: 25 | - 8080:8080 26 | - 7077:7077 27 | volumes: 28 | - shared-workspace:/opt/workspace 29 | spark-worker-1: 30 | image: spark-worker 31 | container_name: spark-worker-1 32 | environment: 33 | - SPARK_WORKER_CORES=1 34 | - SPARK_WORKER_MEMORY=4g 35 | ports: 36 | - 
8083:8081 37 | volumes: 38 | - shared-workspace:/opt/workspace 39 | depends_on: 40 | - spark-master 41 | spark-worker-2: 42 | image: spark-worker 43 | container_name: spark-worker-2 44 | environment: 45 | - SPARK_WORKER_CORES=1 46 | - SPARK_WORKER_MEMORY=4g 47 | ports: 48 | - 8084:8081 49 | volumes: 50 | - shared-workspace:/opt/workspace 51 | depends_on: 52 | - spark-master 53 | -------------------------------------------------------------------------------- /cohorts/2022/project.md: -------------------------------------------------------------------------------- 1 | ## Course Project 2 | 3 | The goal of this project is to apply everything we learned 4 | in this course and build an end-to-end data pipeline. 5 | 6 | Remember that to pass the project, you must evaluate 3 peers. If you don't do that, your project can't be considered compelete. 7 | 8 | 9 | ### Submitting 10 | 11 | #### Project Cohort #2 12 | 13 | Project: 14 | 15 | * Form: https://forms.gle/JECXB9jYQ1vBXbsw6 16 | * Deadline: 2 May, 22:00 CET 17 | 18 | Peer reviewing: 19 | 20 | * Peer review assignments: [link](https://docs.google.com/spreadsheets/d/e/2PACX-1vShnv8T4iY_5NA8h0nySIS8Wzr-DZGGigEikIW4ZMSi9HlvhaEB4RhwmepVIuIUGaQHS90r5iHR2YXV/pubhtml?gid=964123374&single=true) 21 | * Form: https://forms.gle/Pb2fBwYLQ3GGFsaK6 22 | * Deadline: 9 May, 22:00 CET 23 | 24 | 25 | #### Project Cohort #1 26 | 27 | Project: 28 | 29 | * Form: https://forms.gle/6aeVcEVJipqR2BqC8 30 | * Deadline: 4 April, 22:00 CET 31 | 32 | Peer reviewing: 33 | 34 | * Peer review assignments: [link](https://docs.google.com/spreadsheets/d/e/2PACX-1vShnv8T4iY_5NA8h0nySIS8Wzr-DZGGigEikIW4ZMSi9HlvhaEB4RhwmepVIuIUGaQHS90r5iHR2YXV/pubhtml) 35 | * Form: https://forms.gle/AZ62bXMp4SGcVUmK7 36 | * Deadline: 11 April, 22:00 CET 37 | 38 | Project feedback: [link](https://docs.google.com/spreadsheets/d/e/2PACX-1vRcVCkO-jes5mbPAcikn9X_s2laJ1KhsO8aibHYQxxKqdCUYMVTEJLJQdM8C5aAUWKFl_0SJW4rme7H/pubhtml) 39 | -------------------------------------------------------------------------------- /cohorts/2022/week_2_data_ingestion/airflow/Dockerfile: -------------------------------------------------------------------------------- 1 | # First-time build can take upto 10 mins. 2 | 3 | FROM apache/airflow:2.2.3 4 | 5 | ENV AIRFLOW_HOME=/opt/airflow 6 | 7 | USER root 8 | RUN apt-get update -qq && apt-get install vim -qqq 9 | # git gcc g++ -qqq 10 | 11 | COPY requirements.txt . 
12 | RUN pip install --no-cache-dir -r requirements.txt 13 | 14 | # Ref: https://airflow.apache.org/docs/docker-stack/recipes.html 15 | 16 | SHELL ["/bin/bash", "-o", "pipefail", "-e", "-u", "-x", "-c"] 17 | 18 | ARG CLOUD_SDK_VERSION=322.0.0 19 | ENV GCLOUD_HOME=/home/google-cloud-sdk 20 | 21 | ENV PATH="${GCLOUD_HOME}/bin/:${PATH}" 22 | 23 | RUN DOWNLOAD_URL="https://dl.google.com/dl/cloudsdk/channels/rapid/downloads/google-cloud-sdk-${CLOUD_SDK_VERSION}-linux-x86_64.tar.gz" \ 24 | && TMP_DIR="$(mktemp -d)" \ 25 | && curl -fL "${DOWNLOAD_URL}" --output "${TMP_DIR}/google-cloud-sdk.tar.gz" \ 26 | && mkdir -p "${GCLOUD_HOME}" \ 27 | && tar xzf "${TMP_DIR}/google-cloud-sdk.tar.gz" -C "${GCLOUD_HOME}" --strip-components=1 \ 28 | && "${GCLOUD_HOME}/install.sh" \ 29 | --bash-completion=false \ 30 | --path-update=false \ 31 | --usage-reporting=false \ 32 | --quiet \ 33 | && rm -rf "${TMP_DIR}" \ 34 | && gcloud --version 35 | 36 | WORKDIR $AIRFLOW_HOME 37 | 38 | COPY scripts scripts 39 | RUN chmod +x scripts 40 | 41 | USER $AIRFLOW_UID 42 | -------------------------------------------------------------------------------- /06-streaming/python/streams-example/pyspark/README.md: -------------------------------------------------------------------------------- 1 | 2 | # Running PySpark Streaming 3 | 4 | #### Prerequisite 5 | 6 | Ensure your Kafka and Spark services up and running by following the [docker setup readme](./../../docker/README.md). 7 | It is important to create network and volume as described in the document. Therefore please ensure, your volume and network are created correctly 8 | 9 | ```bash 10 | docker volume ls # should list hadoop-distributed-file-system 11 | docker network ls # should list kafka-spark-network 12 | ``` 13 | 14 | 15 | ### Running Producer and Consumer 16 | ```bash 17 | # Run producer 18 | python3 producer.py 19 | 20 | # Run consumer with default settings 21 | python3 consumer.py 22 | # Run consumer for specific topic 23 | python3 consumer.py --topic 24 | ``` 25 | 26 | ### Running Streaming Script 27 | 28 | spark-submit script ensures installation of necessary jars before running the streaming.py 29 | 30 | ```bash 31 | ./spark-submit.sh streaming.py 32 | ``` 33 | 34 | ### Additional Resources 35 | - [Structured Streaming Programming Guide](https://spark.apache.org/docs/latest/structured-streaming-programming-guide.html#structured-streaming-programming-guide) 36 | - [Structured Streaming + Kafka Integration](https://spark.apache.org/docs/latest/structured-streaming-kafka-integration.html#structured-streaming-kafka-integration-guide-kafka-broker-versio) 37 | -------------------------------------------------------------------------------- /06-streaming/python/docker/README.md: -------------------------------------------------------------------------------- 1 | 2 | # Running Spark and Kafka Clusters on Docker 3 | 4 | ### 1. Build Required Images for running Spark 5 | 6 | The details of how to spark-images are build in different layers can be created can be read through 7 | the blog post written by André Perez on [Medium blog -Towards Data Science](https://towardsdatascience.com/apache-spark-cluster-on-docker-ft-a-juyterlab-interface-418383c95445) 8 | 9 | ```bash 10 | # Build Spark Images 11 | ./build.sh 12 | ``` 13 | 14 | ### 2. Create Docker Network & Volume 15 | 16 | ```bash 17 | # Create Network 18 | docker network create kafka-spark-network 19 | 20 | # Create Volume 21 | docker volume create --name=hadoop-distributed-file-system 22 | ``` 23 | 24 | ### 3. 
Run Services on Docker 25 | ```bash 26 | # Start Docker-Compose (within for kafka and spark folders) 27 | docker compose up -d 28 | ``` 29 | In depth explanation of [Kafka Listeners](https://www.confluent.io/blog/kafka-listeners-explained/) 30 | 31 | Explanation of [Kafka Listeners](https://www.confluent.io/blog/kafka-listeners-explained/) 32 | 33 | ### 4. Stop Services on Docker 34 | ```bash 35 | # Stop Docker-Compose (within for kafka and spark folders) 36 | docker compose down 37 | ``` 38 | 39 | ### 5. Helpful Comands 40 | ```bash 41 | # Delete all Containers 42 | docker rm -f $(docker ps -a -q) 43 | 44 | # Delete all volumes 45 | docker volume rm $(docker volume ls -q) 46 | ``` 47 | 48 | -------------------------------------------------------------------------------- /cohorts/2025/README.md: -------------------------------------------------------------------------------- 1 | ## Data Engineering Zoomcamp 2025 Cohort 2 | 3 | * [Pre-launch Q&A stream](TBA) 4 | * [Launch stream with course overview](TBA) 5 | * [Course Google calendar](https://calendar.google.com/calendar/?cid=ZXIxcjA1M3ZlYjJpcXU0dTFmaG02MzVxMG9AZ3JvdXAuY2FsZW5kYXIuZ29vZ2xlLmNvbQ) 6 | * [FAQ](https://docs.google.com/document/d/19bnYs80DwuUimHM65UV3sylsCn2j1vziPOwzBwQrebw/edit?usp=sharing) 7 | * Course Playlist: Only 2024 Live videos & homeworks (TODO) 8 | 9 | 10 | [**Module 1: Introduction & Prerequisites**](01-docker-terraform/) 11 | 12 | * [Homework](01-docker-terraform/homework.md) 13 | 14 | 15 | [**Module 2: Workflow Orchestration**](02-workflow-orchestration) 16 | 17 | * [Homework](02-workflow-orchestration/homework.md) 18 | * Office hours 19 | 20 | [**Workshop 1: Data Ingestion**](workshops/dlt.md) 21 | 22 | * Workshop with dlt 23 | * [Homework](workshops/dlt.md) 24 | 25 | 26 | [**Module 3: Data Warehouse**](03-data-warehouse) 27 | 28 | * [Homework](03-data-warehouse/homework.md) 29 | 30 | 31 | [**Module 4: Analytics Engineering**](04-analytics-engineering/) 32 | 33 | * [Homework](04-analytics-engineering/homework.md) 34 | 35 | 36 | [**Module 5: Batch processing**](05-batch/) 37 | 38 | * [Homework](05-batch/homework.md) 39 | 40 | 41 | [**Module 6: Stream Processing**](06-streaming) 42 | 43 | * [Homework](06-streaming/homework.md) 44 | 45 | 46 | [**Project**](project.md) 47 | 48 | More information [here](project.md) 49 | -------------------------------------------------------------------------------- /learning-in-public.md: -------------------------------------------------------------------------------- 1 | # Learning in public 2 | 3 | Most people learn in private: they consume content but don't tell 4 | anyone about it. There's nothing wrong with it. 5 | 6 | But we want to encourage you to document your progress and 7 | share it publicly on social media. 
8 | 9 | It helps you get noticed and will lead to: 10 | 11 | * Expanding your network: meeting new people and making new friends 12 | * Being invited to meetups, conferences and podcasts 13 | * Landing a job or getting clients 14 | * Many other good things 15 | 16 | Here's a more compresensive reading on why you want to do it: https://github.com/readme/guides/publishing-your-work 17 | 18 | 19 | ## Learning in Public for Zoomcamps 20 | 21 | When you submit your homework or project, you can also submit 22 | learning in public posts: 23 | 24 | 25 | 26 | You can watch this video to see how your learning in public posts may look like: 27 | 28 | 29 | 30 | 31 | 32 | ## Daily Documentation 33 | 34 | - **Post Daily Diaries**: Document what you learn each day, including the challenges faced and the methods used to overcome them. 35 | - **Create Quick Videos**: Make short videos showcasing your work and upload them to GitHub. 36 | 37 | Send a PR if you want to suggest improvements for this document 38 | -------------------------------------------------------------------------------- /06-streaming/python/streams-example/redpanda/README.md: -------------------------------------------------------------------------------- 1 | 2 | # Running PySpark Streaming with Redpanda 3 | 4 | ### 1. Prerequisite 5 | 6 | It is important to create network and volume as described in the document. Therefore please ensure, your volume and network are created correctly. 7 | 8 | ```bash 9 | docker volume ls # should list hadoop-distributed-file-system 10 | docker network ls # should list kafka-spark-network 11 | ``` 12 | 13 | ### 2. Create Docker Network & Volume 14 | 15 | If you have not followed any other examples, and above `ls` steps shows no output, create them now. 16 | 17 | ```bash 18 | # Create Network 19 | docker network create kafka-spark-network 20 | 21 | # Create Volume 22 | docker volume create --name=hadoop-distributed-file-system 23 | ``` 24 | 25 | ### Running Producer and Consumer 26 | ```bash 27 | # Run producer 28 | python producer.py 29 | 30 | # Run consumer with default settings 31 | python consumer.py 32 | # Run consumer for specific topic 33 | python consumer.py --topic 34 | ``` 35 | 36 | ### Running Streaming Script 37 | 38 | spark-submit script ensures installation of necessary jars before running the streaming.py 39 | 40 | ```bash 41 | ./spark-submit.sh streaming.py 42 | ``` 43 | 44 | ### Additional Resources 45 | - [Structured Streaming Programming Guide](https://spark.apache.org/docs/latest/structured-streaming-programming-guide.html#structured-streaming-programming-guide) 46 | - [Structured Streaming + Kafka Integration](https://spark.apache.org/docs/latest/structured-streaming-kafka-integration.html#structured-streaming-kafka-integration-guide-kafka-broker-versio) 47 | -------------------------------------------------------------------------------- /cohorts/2022/week_2_data_ingestion/airflow/dags_local/ingest_script.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | from time import time 4 | 5 | import pandas as pd 6 | from sqlalchemy import create_engine 7 | 8 | 9 | def ingest_callable(user, password, host, port, db, table_name, csv_file, execution_date): 10 | print(table_name, csv_file, execution_date) 11 | 12 | engine = create_engine(f'postgresql://{user}:{password}@{host}:{port}/{db}') 13 | engine.connect() 14 | 15 | print('connection established successfully, inserting data...') 16 | 17 | t_start = time() 18 | df_iter = 
pd.read_csv(csv_file, iterator=True, chunksize=100000) 19 | 20 | df = next(df_iter) 21 | 22 | df.tpep_pickup_datetime = pd.to_datetime(df.tpep_pickup_datetime) 23 | df.tpep_dropoff_datetime = pd.to_datetime(df.tpep_dropoff_datetime) 24 | 25 | df.head(n=0).to_sql(name=table_name, con=engine, if_exists='replace') 26 | 27 | df.to_sql(name=table_name, con=engine, if_exists='append') 28 | 29 | t_end = time() 30 | print('inserted the first chunk, took %.3f second' % (t_end - t_start)) 31 | 32 | while True: 33 | t_start = time() 34 | 35 | try: 36 | df = next(df_iter) 37 | except StopIteration: 38 | print("completed") 39 | break 40 | 41 | df.tpep_pickup_datetime = pd.to_datetime(df.tpep_pickup_datetime) 42 | df.tpep_dropoff_datetime = pd.to_datetime(df.tpep_dropoff_datetime) 43 | 44 | df.to_sql(name=table_name, con=engine, if_exists='append') 45 | 46 | t_end = time() 47 | 48 | print('inserted another chunk, took %.3f second' % (t_end - t_start)) 49 | -------------------------------------------------------------------------------- /02-workflow-orchestration/flows/01_getting_started_data_pipeline.yaml: -------------------------------------------------------------------------------- 1 | id: 01_getting_started_data_pipeline 2 | namespace: zoomcamp 3 | 4 | inputs: 5 | - id: columns_to_keep 6 | type: ARRAY 7 | itemType: STRING 8 | defaults: 9 | - brand 10 | - price 11 | 12 | tasks: 13 | - id: extract 14 | type: io.kestra.plugin.core.http.Download 15 | uri: https://dummyjson.com/products 16 | 17 | - id: transform 18 | type: io.kestra.plugin.scripts.python.Script 19 | containerImage: python:3.11-alpine 20 | inputFiles: 21 | data.json: "{{outputs.extract.uri}}" 22 | outputFiles: 23 | - "*.json" 24 | env: 25 | COLUMNS_TO_KEEP: "{{inputs.columns_to_keep}}" 26 | script: | 27 | import json 28 | import os 29 | 30 | columns_to_keep_str = os.getenv("COLUMNS_TO_KEEP") 31 | columns_to_keep = json.loads(columns_to_keep_str) 32 | 33 | with open("data.json", "r") as file: 34 | data = json.load(file) 35 | 36 | filtered_data = [ 37 | {column: product.get(column, "N/A") for column in columns_to_keep} 38 | for product in data["products"] 39 | ] 40 | 41 | with open("products.json", "w") as file: 42 | json.dump(filtered_data, file, indent=4) 43 | 44 | - id: query 45 | type: io.kestra.plugin.jdbc.duckdb.Query 46 | inputFiles: 47 | products.json: "{{outputs.transform.outputFiles['products.json']}}" 48 | sql: | 49 | INSTALL json; 50 | LOAD json; 51 | SELECT brand, round(avg(price), 2) as avg_price 52 | FROM read_json_auto('{{workingDir}}/products.json') 53 | GROUP BY brand 54 | ORDER BY avg_price DESC; 55 | fetchType: STORE 56 | -------------------------------------------------------------------------------- /cohorts/2024/project.md: -------------------------------------------------------------------------------- 1 | ## Course Project 2 | 3 | The goal of this project is to apply everything we learned 4 | in this course and build an end-to-end data pipeline. 5 | 6 | You will have two attempts to submit your project. If you don't have 7 | time to submit your project by the end of attempt #1 (you started the 8 | course late, you have vacation plans, life/work got in the way, etc.) 9 | or you fail your first attempt, 10 | then you will have a second chance to submit your project as attempt 11 | #2. 12 | 13 | There are only two attempts. 14 | 15 | Remember that to pass the project, you must evaluate 3 peers. If you don't do that, 16 | your project can't be considered complete. 
17 | 18 | To find the projects assigned to you, use the peer review assignments link 19 | and find your hash in the first column. You will see three rows: you need to evaluate 20 | each of these projects. For each project, you need to submit the form once, 21 | so in total, you will make three submissions. 22 | 23 | 24 | ### Submitting 25 | 26 | #### Project Attempt #1 27 | 28 | * Project: https://courses.datatalks.club/de-zoomcamp-2024/project/project1 29 | * Review: https://courses.datatalks.club/de-zoomcamp-2024/project/project1/eval 30 | 31 | #### Project Attempt #2 32 | 33 | * Project: https://courses.datatalks.club/de-zoomcamp-2024/project/project2 34 | * Review: https://courses.datatalks.club/de-zoomcamp-2024/project/project2/eval 35 | 36 | > **Important**: update your "Certificate name" here: https://courses.datatalks.club/de-zoomcamp-2024/enrollment - 37 | this is what we will use when generating certificates for you. 38 | 39 | ### Evaluation criteria 40 | 41 | See [here](../../week_7_project/README.md) 42 | 43 | 44 | -------------------------------------------------------------------------------- /cohorts/2025/project.md: -------------------------------------------------------------------------------- 1 | ## Course Project 2 | 3 | The goal of this project is to apply everything we learned 4 | in this course and build an end-to-end data pipeline. 5 | 6 | You will have two attempts to submit your project. If you don't have 7 | time to submit your project by the end of attempt #1 (you started the 8 | course late, you have vacation plans, life/work got in the way, etc.) 9 | or you fail your first attempt, 10 | then you will have a second chance to submit your project as attempt 11 | #2. 12 | 13 | There are only two attempts. 14 | 15 | Remember that to pass the project, you must evaluate 3 peers. If you don't do that, 16 | your project can't be considered complete. 17 | 18 | To find the projects assigned to you, use the peer review assignments link 19 | and find your hash in the first column. You will see three rows: you need to evaluate 20 | each of these projects. For each project, you need to submit the form once, 21 | so in total, you will make three submissions. 22 | 23 | 24 | ### Submitting 25 | 26 | #### Project Attempt #1 27 | 28 | * Project: https://courses.datatalks.club/de-zoomcamp-2024/project/project1 29 | * Review: https://courses.datatalks.club/de-zoomcamp-2024/project/project1/eval 30 | 31 | #### Project Attempt #2 32 | 33 | * Project: https://courses.datatalks.club/de-zoomcamp-2024/project/project2 34 | * Review: https://courses.datatalks.club/de-zoomcamp-2024/project/project2/eval 35 | 36 | > **Important**: update your "Certificate name" here: https://courses.datatalks.club/de-zoomcamp-2024/enrollment - 37 | this is what we will use when generating certificates for you. 
38 | 39 | ### Evaluation criteria 40 | 41 | See [here](../../week_7_project/README.md) 42 | 43 | 44 | -------------------------------------------------------------------------------- /06-streaming/python/json_example/consumer.py: -------------------------------------------------------------------------------- 1 | from typing import Dict, List 2 | from json import loads 3 | from kafka import KafkaConsumer 4 | 5 | from ride import Ride 6 | from settings import BOOTSTRAP_SERVERS, KAFKA_TOPIC 7 | 8 | 9 | class JsonConsumer: 10 | def __init__(self, props: Dict): 11 | self.consumer = KafkaConsumer(**props) 12 | 13 | def consume_from_kafka(self, topics: List[str]): 14 | self.consumer.subscribe(topics) 15 | print('Consuming from Kafka started') 16 | print('Available topics to consume: ', self.consumer.subscription()) 17 | while True: 18 | try: 19 | # SIGINT can't be handled when polling, limit timeout to 1 second. 20 | message = self.consumer.poll(1.0) 21 | if message is None or message == {}: 22 | continue 23 | for message_key, message_value in message.items(): 24 | for msg_val in message_value: 25 | print(msg_val.key, msg_val.value) 26 | except KeyboardInterrupt: 27 | break 28 | 29 | self.consumer.close() 30 | 31 | 32 | if __name__ == '__main__': 33 | config = { 34 | 'bootstrap_servers': BOOTSTRAP_SERVERS, 35 | 'auto_offset_reset': 'earliest', 36 | 'enable_auto_commit': True, 37 | 'key_deserializer': lambda key: int(key.decode('utf-8')), 38 | 'value_deserializer': lambda x: loads(x.decode('utf-8'), object_hook=lambda d: Ride.from_dict(d)), 39 | 'group_id': 'consumer.group.id.json-example.1', 40 | } 41 | 42 | json_consumer = JsonConsumer(props=config) 43 | json_consumer.consume_from_kafka(topics=[KAFKA_TOPIC]) 44 | -------------------------------------------------------------------------------- /06-streaming/python/json_example/producer.py: -------------------------------------------------------------------------------- 1 | import csv 2 | import json 3 | from typing import List, Dict 4 | from kafka import KafkaProducer 5 | from kafka.errors import KafkaTimeoutError 6 | 7 | from ride import Ride 8 | from settings import BOOTSTRAP_SERVERS, INPUT_DATA_PATH, KAFKA_TOPIC 9 | 10 | 11 | class JsonProducer(KafkaProducer): 12 | def __init__(self, props: Dict): 13 | self.producer = KafkaProducer(**props) 14 | 15 | @staticmethod 16 | def read_records(resource_path: str): 17 | records = [] 18 | with open(resource_path, 'r') as f: 19 | reader = csv.reader(f) 20 | header = next(reader) # skip the header row 21 | for row in reader: 22 | records.append(Ride(arr=row)) 23 | return records 24 | 25 | def publish_rides(self, topic: str, messages: List[Ride]): 26 | for ride in messages: 27 | try: 28 | record = self.producer.send(topic=topic, key=ride.pu_location_id, value=ride) 29 | print('Record {} successfully produced at offset {}'.format(ride.pu_location_id, record.get().offset)) 30 | except KafkaTimeoutError as e: 31 | print(e.__str__()) 32 | 33 | 34 | if __name__ == '__main__': 35 | # Config Should match with the KafkaProducer expectation 36 | config = { 37 | 'bootstrap_servers': BOOTSTRAP_SERVERS, 38 | 'key_serializer': lambda key: str(key).encode(), 39 | 'value_serializer': lambda x: json.dumps(x.__dict__, default=str).encode('utf-8') 40 | } 41 | producer = JsonProducer(props=config) 42 | rides = producer.read_records(resource_path=INPUT_DATA_PATH) 43 | producer.publish_rides(topic=KAFKA_TOPIC, messages=rides) 44 | -------------------------------------------------------------------------------- 
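The producer and consumer above agree on a JSON contract: the producer serializes `ride.__dict__` with `default=str`, and the consumer rebuilds a `Ride` through the `object_hook`. The broker-free sketch below (run from the `json_example` folder, with made-up sample values) exercises that round trip locally, which is a quick way to sanity-check the serializers before Kafka is up.

```python
# Hedged sketch: local round trip of the JSON (de)serialization used by
# producer.py and consumer.py. Sample row values are invented.
import json

from ride import Ride

row = ['VTS', '2021-01-01 00:30:00', '2021-01-01 00:45:00', '1', '3.5', '1',
       'N', '238', '75', '2', '12.0', '0.5', '0.5', '0', '0', '0.3', '13.3', '2.5']
ride = Ride(arr=row)

# Producer-side serialization (same lambda as in producer.py)
payload = json.dumps(ride.__dict__, default=str).encode('utf-8')

# Consumer-side deserialization (same object_hook as in consumer.py)
restored = json.loads(payload.decode('utf-8'), object_hook=lambda d: Ride.from_dict(d))
print(restored)
```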
/cohorts/2022/week_2_data_ingestion/airflow/dags_local/data_ingestion_local.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | from datetime import datetime 4 | 5 | from airflow import DAG 6 | 7 | from airflow.operators.bash import BashOperator 8 | from airflow.operators.python import PythonOperator 9 | 10 | from ingest_script import ingest_callable 11 | 12 | 13 | AIRFLOW_HOME = os.environ.get("AIRFLOW_HOME", "/opt/airflow/") 14 | 15 | 16 | PG_HOST = os.getenv('PG_HOST') 17 | PG_USER = os.getenv('PG_USER') 18 | PG_PASSWORD = os.getenv('PG_PASSWORD') 19 | PG_PORT = os.getenv('PG_PORT') 20 | PG_DATABASE = os.getenv('PG_DATABASE') 21 | 22 | 23 | local_workflow = DAG( 24 | "LocalIngestionDag", 25 | schedule_interval="0 6 2 * *", 26 | start_date=datetime(2021, 1, 1) 27 | ) 28 | 29 | 30 | URL_PREFIX = 'https://s3.amazonaws.com/nyc-tlc/trip+data' 31 | URL_TEMPLATE = URL_PREFIX + '/yellow_tripdata_{{ execution_date.strftime(\'%Y-%m\') }}.csv' 32 | OUTPUT_FILE_TEMPLATE = AIRFLOW_HOME + '/output_{{ execution_date.strftime(\'%Y-%m\') }}.csv' 33 | TABLE_NAME_TEMPLATE = 'yellow_taxi_{{ execution_date.strftime(\'%Y_%m\') }}' 34 | 35 | with local_workflow: 36 | wget_task = BashOperator( 37 | task_id='wget', 38 | bash_command=f'curl -sSL {URL_TEMPLATE} > {OUTPUT_FILE_TEMPLATE}' 39 | ) 40 | 41 | ingest_task = PythonOperator( 42 | task_id="ingest", 43 | python_callable=ingest_callable, 44 | op_kwargs=dict( 45 | user=PG_USER, 46 | password=PG_PASSWORD, 47 | host=PG_HOST, 48 | port=PG_PORT, 49 | db=PG_DATABASE, 50 | table_name=TABLE_NAME_TEMPLATE, 51 | csv_file=OUTPUT_FILE_TEMPLATE 52 | ), 53 | ) 54 | 55 | wget_task >> ingest_task -------------------------------------------------------------------------------- /cohorts/2024/README.md: -------------------------------------------------------------------------------- 1 | ## Data Engineering Zoomcamp 2024 Cohort 2 | 3 | * [Pre-launch Q&A stream](https://www.youtube.com/watch?v=91b8u9GmqB4) 4 | * [Launch stream with course overview](https://www.youtube.com/live/AtRhA-NfS24?si=5JzA_E8BmJjiLi8l) 5 | * [Deadline calendar](https://docs.google.com/spreadsheets/d/e/2PACX-1vQACMLuutV5rvXg5qICuJGL-yZqIV0FBD84CxPdC5eZHf8TfzB-CJT_3Mo7U7oGVTXmSihPgQxuuoku/pubhtml) 6 | * [Course Google calendar](https://calendar.google.com/calendar/?cid=ZXIxcjA1M3ZlYjJpcXU0dTFmaG02MzVxMG9AZ3JvdXAuY2FsZW5kYXIuZ29vZ2xlLmNvbQ) 7 | * [FAQ](https://docs.google.com/document/d/19bnYs80DwuUimHM65UV3sylsCn2j1vziPOwzBwQrebw/edit?usp=sharing) 8 | * Course Playlist: Only 2024 Live videos & homeworks (TODO) 9 | * [Public Leaderboard of Top-100 Participants](leaderboard.md) 10 | 11 | 12 | [**Module 1: Introduction & Prerequisites**](01-docker-terraform/) 13 | 14 | * [Homework](01-docker-terraform/homework.md) 15 | 16 | 17 | [**Module 2: Workflow Orchestration**](02-workflow-orchestration) 18 | 19 | * [Homework](02-workflow-orchestration/homework.md) 20 | * Office hours 21 | 22 | [**Workshop 1: Data Ingestion**](workshops/dlt.md) 23 | 24 | * Workshop with dlt 25 | * [Homework](workshops/dlt.md) 26 | 27 | 28 | [**Module 3: Data Warehouse**](03-data-warehouse) 29 | 30 | * [Homework](03-data-warehouse/homework.md) 31 | 32 | 33 | [**Module 4: Analytics Engineering**](04-analytics-engineering/) 34 | 35 | * [Homework](04-analytics-engineering/homework.md) 36 | 37 | 38 | [**Module 5: Batch processing**](05-batch/) 39 | 40 | * [Homework](05-batch/homework.md) 41 | 42 | 43 | [**Module 6: Stream Processing**](06-streaming) 44 | 45 | * 
[Homework](06-streaming/homework.md) 46 | 47 | 48 | [**Project**](project.md) 49 | 50 | More information [here](project.md) 51 | -------------------------------------------------------------------------------- /cohorts/2022/week_2_data_ingestion/airflow/docker-compose-nofrills.yml: -------------------------------------------------------------------------------- 1 | version: '3' 2 | services: 3 | postgres: 4 | image: postgres:13 5 | env_file: 6 | - .env 7 | volumes: 8 | - postgres-db-volume:/var/lib/postgresql/data 9 | healthcheck: 10 | test: ["CMD", "pg_isready", "-U", "airflow"] 11 | interval: 5s 12 | retries: 5 13 | restart: always 14 | 15 | scheduler: 16 | build: . 17 | command: scheduler 18 | restart: on-failure 19 | depends_on: 20 | - postgres 21 | env_file: 22 | - .env 23 | volumes: 24 | - ./dags:/opt/airflow/dags 25 | - ./logs:/opt/airflow/logs 26 | - ./plugins:/opt/airflow/plugins 27 | - ./scripts:/opt/airflow/scripts 28 | - ~/.google/credentials/:/.google/credentials 29 | 30 | 31 | webserver: 32 | build: . 33 | entrypoint: ./scripts/entrypoint.sh 34 | restart: on-failure 35 | depends_on: 36 | - postgres 37 | - scheduler 38 | env_file: 39 | - .env 40 | volumes: 41 | - ./dags:/opt/airflow/dags 42 | - ./logs:/opt/airflow/logs 43 | - ./plugins:/opt/airflow/plugins 44 | - ~/.google/credentials/:/.google/credentials:ro 45 | - ./scripts:/opt/airflow/scripts 46 | 47 | user: "${AIRFLOW_UID:-50000}:0" 48 | ports: 49 | - "8080:8080" 50 | healthcheck: 51 | test: [ "CMD-SHELL", "[ -f /home/airflow/airflow-webserver.pid ]" ] 52 | interval: 30s 53 | timeout: 30s 54 | retries: 3 55 | 56 | volumes: 57 | postgres-db-volume: -------------------------------------------------------------------------------- /cohorts/2022/week_3_data_warehouse/airflow/docker-compose-nofrills.yml: -------------------------------------------------------------------------------- 1 | version: '3' 2 | services: 3 | postgres: 4 | image: postgres:13 5 | env_file: 6 | - .env 7 | volumes: 8 | - postgres-db-volume:/var/lib/postgresql/data 9 | healthcheck: 10 | test: ["CMD", "pg_isready", "-U", "airflow"] 11 | interval: 5s 12 | retries: 5 13 | restart: always 14 | 15 | scheduler: 16 | build: . 17 | command: scheduler 18 | restart: on-failure 19 | depends_on: 20 | - postgres 21 | env_file: 22 | - .env 23 | volumes: 24 | - ./dags:/opt/airflow/dags 25 | - ./logs:/opt/airflow/logs 26 | - ./plugins:/opt/airflow/plugins 27 | - ./scripts:/opt/airflow/scripts 28 | - ~/.google/credentials/:/.google/credentials:ro 29 | 30 | 31 | webserver: 32 | build: . 
33 | entrypoint: ./scripts/entrypoint.sh 34 | restart: on-failure 35 | depends_on: 36 | - postgres 37 | - scheduler 38 | env_file: 39 | - .env 40 | volumes: 41 | - ./dags:/opt/airflow/dags 42 | - ./logs:/opt/airflow/logs 43 | - ./plugins:/opt/airflow/plugins 44 | - ~/.google/credentials/:/.google/credentials:ro 45 | - ./scripts:/opt/airflow/scripts 46 | 47 | user: "${AIRFLOW_UID:-50000}:0" 48 | ports: 49 | - "8080:8080" 50 | healthcheck: 51 | test: [ "CMD-SHELL", "[ -f /home/airflow/airflow-webserver.pid ]" ] 52 | interval: 30s 53 | timeout: 30s 54 | retries: 3 55 | 56 | volumes: 57 | postgres-db-volume: -------------------------------------------------------------------------------- /06-streaming/python/redpanda_example/producer.py: -------------------------------------------------------------------------------- 1 | import csv 2 | import json 3 | from typing import List, Dict 4 | from kafka import KafkaProducer 5 | from kafka.errors import KafkaTimeoutError 6 | 7 | from ride import Ride 8 | from settings import BOOTSTRAP_SERVERS, INPUT_DATA_PATH, KAFKA_TOPIC 9 | 10 | 11 | class JsonProducer(KafkaProducer): 12 | def __init__(self, props: Dict): 13 | self.producer = KafkaProducer(**props) 14 | 15 | @staticmethod 16 | def read_records(resource_path: str): 17 | records = [] 18 | with open(resource_path, 'r') as f: 19 | reader = csv.reader(f) 20 | header = next(reader) # skip the header row 21 | for row in reader: 22 | records.append(Ride(arr=row)) 23 | return records 24 | 25 | def publish_rides(self, topic: str, messages: List[Ride]): 26 | for ride in messages: 27 | try: 28 | record = self.producer.send(topic=topic, key=ride.pu_location_id, value=ride) 29 | print('Record {} successfully produced at offset {}'.format(ride.pu_location_id, record.get().offset)) 30 | except KafkaTimeoutError as e: 31 | print(e.__str__()) 32 | 33 | 34 | if __name__ == '__main__': 35 | # Config Should match with the KafkaProducer expectation 36 | # kafka expects binary format for the key-value pair 37 | config = { 38 | 'bootstrap_servers': BOOTSTRAP_SERVERS, 39 | 'key_serializer': lambda key: str(key).encode(), 40 | 'value_serializer': lambda x: json.dumps(x.__dict__, default=str).encode('utf-8') 41 | } 42 | producer = JsonProducer(props=config) 43 | rides = producer.read_records(resource_path=INPUT_DATA_PATH) 44 | producer.publish_rides(topic=KAFKA_TOPIC, messages=rides) 45 | -------------------------------------------------------------------------------- /02-workflow-orchestration/docker-compose.yml: -------------------------------------------------------------------------------- 1 | volumes: 2 | postgres-data: 3 | driver: local 4 | kestra-data: 5 | driver: local 6 | 7 | services: 8 | postgres: 9 | image: postgres 10 | volumes: 11 | - postgres-data:/var/lib/postgresql/data 12 | environment: 13 | POSTGRES_DB: kestra 14 | POSTGRES_USER: kestra 15 | POSTGRES_PASSWORD: k3str4 16 | healthcheck: 17 | test: ["CMD-SHELL", "pg_isready -d $${POSTGRES_DB} -U $${POSTGRES_USER}"] 18 | interval: 30s 19 | timeout: 10s 20 | retries: 10 21 | 22 | kestra: 23 | image: kestra/kestra:latest 24 | pull_policy: always 25 | user: "root" 26 | command: server standalone 27 | volumes: 28 | - kestra-data:/app/storage 29 | - /var/run/docker.sock:/var/run/docker.sock 30 | - /tmp/kestra-wd:/tmp/kestra-wd 31 | environment: 32 | KESTRA_CONFIGURATION: | 33 | datasources: 34 | postgres: 35 | url: jdbc:postgresql://postgres:5432/kestra 36 | driverClassName: org.postgresql.Driver 37 | username: kestra 38 | password: k3str4 39 | kestra: 40 | server: 
41 | basicAuth: 42 | enabled: false 43 | username: "admin@kestra.io" # it must be a valid email address 44 | password: kestra 45 | repository: 46 | type: postgres 47 | storage: 48 | type: local 49 | local: 50 | basePath: "/app/storage" 51 | queue: 52 | type: postgres 53 | tasks: 54 | tmpDir: 55 | path: /tmp/kestra-wd/tmp 56 | url: http://localhost:8080/ 57 | ports: 58 | - "8080:8080" 59 | - "8081:8081" 60 | depends_on: 61 | postgres: 62 | condition: service_started 63 | -------------------------------------------------------------------------------- /certificates.md: -------------------------------------------------------------------------------- 1 | ## Getting your certificate 2 | 3 | Congratulations on finishing the course! 4 | 5 | Here's how you can get your certificate. 6 | 7 | First, get your certificate id using the `compute_certificate_id` function: 8 | 9 | ```python 10 | from hashlib import sha1 11 | 12 | def compute_hash(email): 13 | return sha1(email.encode('utf-8')).hexdigest() 14 | 15 | def compute_certificate_id(email): 16 | email_clean = email.lower().strip() 17 | return compute_hash(email_clean + '_') 18 | ``` 19 | 20 | > **Note** that this is not the same hash as you have on the leaderboard 21 | > There's an extra "_" added to your email, so the hash is different. 22 | 23 | 24 | Then use this hash to get the URL 25 | 26 | ```python 27 | cohort = 2024 28 | course = 'dezoomcamp' 29 | your_id = compute_certificate_id('never.give.up@gmail.com') 30 | url = f"https://certificate.datatalks.club/{course}/{cohort}/{your_id}.pdf" 31 | print(url) 32 | ``` 33 | 34 | Example: https://certificate.datatalks.club/dezoomcamp/2024/fe629854d45c559e9c10b3b8458ea392fdeb68a9.pdf 35 | 36 | 37 | ## Adding to LinkedIn 38 | 39 | You can add your certificate to LinkedIn: 40 | 41 | * Log in to your LinkedIn account, then go to your profile. 42 | * On the right, in the "Add profile" section dropdown, choose "Background" and then select the drop-down triangle next to "Licenses & Certifications". 43 | * In "Name", enter "Data Engineering Zoomcamp". 44 | * In "Issuing Organization", enter "DataTalksClub". 45 | * (Optional) In "Issue Date", enter the time when the certificate was created. 46 | * (Optional) Select the checkbox This certification does not expire. 47 | * Put your certificate ID. 48 | * In "Certification URL", enter the URL for your certificate. 49 | 50 | [Adapted from here](https://support.edx.org/hc/en-us/articles/206501938-How-can-I-add-my-certificate-to-my-LinkedIn-profile-) 51 | -------------------------------------------------------------------------------- /05-batch/setup/pyspark.md: -------------------------------------------------------------------------------- 1 | 2 | ## PySpark 3 | 4 | This document assumes you already have python. 5 | 6 | To run PySpark, we first need to add it to `PYTHONPATH`: 7 | 8 | ```bash 9 | export PYTHONPATH="${SPARK_HOME}/python/:$PYTHONPATH" 10 | export PYTHONPATH="${SPARK_HOME}/python/lib/py4j-0.10.9-src.zip:$PYTHONPATH" 11 | ``` 12 | 13 | Make sure that the version under `${SPARK_HOME}/python/lib/` matches the filename of py4j or you will 14 | encounter `ModuleNotFoundError: No module named 'py4j'` while executing `import pyspark`. 
15 | 16 | For example, if the file under `${SPARK_HOME}/python/lib/` is `py4j-0.10.9.3-src.zip`, then the 17 | `export PYTHONPATH` statement above should be changed to 18 | 19 | ```bash 20 | export PYTHONPATH="${SPARK_HOME}/python/lib/py4j-0.10.9.3-src.zip:$PYTHONPATH" 21 | ``` 22 | 23 | On Windows, you may have to do path conversion from unix-style to windowns-style: 24 | 25 | ```bash 26 | SPARK_WIN=`cygpath -w ${SPARK_HOME}` 27 | 28 | export PYTHONPATH="${SPARK_WIN}\\python\\" 29 | export PYTHONPATH="${SPARK_WIN}\\python\\lib\\py4j-0.10.9-src.zip;$PYTHONPATH" 30 | ``` 31 | 32 | Now you can run Jupyter or IPython to test if things work. Go to some other directory, e.g. `~/tmp`. 33 | 34 | Download a CSV file that we'll use for testing: 35 | 36 | ```bash 37 | wget https://d37ci6vzurychx.cloudfront.net/misc/taxi_zone_lookup.csv 38 | ``` 39 | 40 | Now let's run `ipython` (or `jupyter notebook`) and execute: 41 | 42 | ```python 43 | import pyspark 44 | from pyspark.sql import SparkSession 45 | 46 | spark = SparkSession.builder \ 47 | .master("local[*]") \ 48 | .appName('test') \ 49 | .getOrCreate() 50 | 51 | df = spark.read \ 52 | .option("header", "true") \ 53 | .csv('taxi_zone_lookup.csv') 54 | 55 | df.show() 56 | ``` 57 | 58 | Test that writing works as well: 59 | 60 | ```python 61 | df.write.parquet('zones') 62 | ``` 63 | -------------------------------------------------------------------------------- /04-analytics-engineering/taxi_rides_ny/models/core/fact_trips.sql: -------------------------------------------------------------------------------- 1 | {{ 2 | config( 3 | materialized='table' 4 | ) 5 | }} 6 | 7 | with green_tripdata as ( 8 | select *, 9 | 'Green' as service_type 10 | from {{ ref('stg_green_tripdata') }} 11 | ), 12 | yellow_tripdata as ( 13 | select *, 14 | 'Yellow' as service_type 15 | from {{ ref('stg_yellow_tripdata') }} 16 | ), 17 | trips_unioned as ( 18 | select * from green_tripdata 19 | union all 20 | select * from yellow_tripdata 21 | ), 22 | dim_zones as ( 23 | select * from {{ ref('dim_zones') }} 24 | where borough != 'Unknown' 25 | ) 26 | select trips_unioned.tripid, 27 | trips_unioned.vendorid, 28 | trips_unioned.service_type, 29 | trips_unioned.ratecodeid, 30 | trips_unioned.pickup_locationid, 31 | pickup_zone.borough as pickup_borough, 32 | pickup_zone.zone as pickup_zone, 33 | trips_unioned.dropoff_locationid, 34 | dropoff_zone.borough as dropoff_borough, 35 | dropoff_zone.zone as dropoff_zone, 36 | trips_unioned.pickup_datetime, 37 | trips_unioned.dropoff_datetime, 38 | trips_unioned.store_and_fwd_flag, 39 | trips_unioned.passenger_count, 40 | trips_unioned.trip_distance, 41 | trips_unioned.trip_type, 42 | trips_unioned.fare_amount, 43 | trips_unioned.extra, 44 | trips_unioned.mta_tax, 45 | trips_unioned.tip_amount, 46 | trips_unioned.tolls_amount, 47 | trips_unioned.ehail_fee, 48 | trips_unioned.improvement_surcharge, 49 | trips_unioned.total_amount, 50 | trips_unioned.payment_type, 51 | trips_unioned.payment_type_description 52 | from trips_unioned 53 | inner join dim_zones as pickup_zone 54 | on trips_unioned.pickup_locationid = pickup_zone.locationid 55 | inner join dim_zones as dropoff_zone 56 | on trips_unioned.dropoff_locationid = dropoff_zone.locationid -------------------------------------------------------------------------------- /06-streaming/java/kafka_examples/src/main/java/org/example/customserdes/CustomSerdes.java: -------------------------------------------------------------------------------- 1 | package org.example.customserdes; 2 | 3 | 
import io.confluent.kafka.serializers.AbstractKafkaAvroSerDeConfig; 4 | import io.confluent.kafka.serializers.KafkaJsonDeserializer; 5 | import io.confluent.kafka.serializers.KafkaJsonSerializer; 6 | import io.confluent.kafka.streams.serdes.avro.SpecificAvroSerde; 7 | import org.apache.avro.specific.SpecificRecordBase; 8 | import org.apache.kafka.common.serialization.Deserializer; 9 | import org.apache.kafka.common.serialization.Serde; 10 | import org.apache.kafka.common.serialization.Serdes; 11 | import org.apache.kafka.common.serialization.Serializer; 12 | import org.example.data.PickupLocation; 13 | import org.example.data.Ride; 14 | import org.example.data.VendorInfo; 15 | 16 | import java.util.HashMap; 17 | import java.util.Map; 18 | 19 | public class CustomSerdes { 20 | 21 | public static Serde getSerde(Class classOf) { 22 | Map serdeProps = new HashMap<>(); 23 | serdeProps.put("json.value.type", classOf); 24 | final Serializer mySerializer = new KafkaJsonSerializer<>(); 25 | mySerializer.configure(serdeProps, false); 26 | 27 | final Deserializer myDeserializer = new KafkaJsonDeserializer<>(); 28 | myDeserializer.configure(serdeProps, false); 29 | return Serdes.serdeFrom(mySerializer, myDeserializer); 30 | } 31 | 32 | public static SpecificAvroSerde getAvroSerde(boolean isKey, String schemaRegistryUrl) { 33 | var serde = new SpecificAvroSerde(); 34 | 35 | Map serdeProps = new HashMap<>(); 36 | serdeProps.put(AbstractKafkaAvroSerDeConfig.SCHEMA_REGISTRY_URL_CONFIG, schemaRegistryUrl); 37 | serde.configure(serdeProps, isKey); 38 | return serde; 39 | } 40 | 41 | 42 | } 43 | -------------------------------------------------------------------------------- /projects/datasets.md: -------------------------------------------------------------------------------- 1 | ## Datasets 2 | 3 | Here are some datasets that you could use for the project: 4 | 5 | 6 | * [Kaggle](https://www.kaggle.com/datasets) 7 | * [AWS datasets](https://registry.opendata.aws/) 8 | * [UK government open data](https://data.gov.uk/) 9 | * [Github archive](https://www.gharchive.org) 10 | * [Awesome public datasets](https://github.com/awesomedata/awesome-public-datasets) 11 | * [Million songs dataset](http://millionsongdataset.com) 12 | * [Some random datasets](https://components.one/datasets/) 13 | * [COVID Datasets](https://www.reddit.com/r/datasets/comments/n3ph2d/coronavirus_datsets/) 14 | * [Datasets from Azure](https://docs.microsoft.com/en-us/azure/azure-sql/public-data-sets) 15 | * [Datasets from BigQuery](https://cloud.google.com/bigquery/public-data/) 16 | * [Dataset search engine from Google](https://datasetsearch.research.google.com/) 17 | * [Public datasets offered by different GCP services](https://cloud.google.com/solutions/datasets) 18 | * [European statistics datasets](https://webgate.acceptance.ec.europa.eu/eurostat/data/database) 19 | * [Datasets for streaming](https://github.com/ColinEberhardt/awesome-public-streaming-datasets) 20 | * [Dataset for Santander bicycle rentals in London](https://cycling.data.tfl.gov.uk/) 21 | * [Common crawl data](https://commoncrawl.org/) (copy of the internet) 22 | * [NASA's EarthData](https://search.earthdata.nasa.gov/search) (May require introductory geospatial analysis) 23 | * Collection Of Data Repositories 24 | * [part 1](https://www.kdnuggets.com/2022/04/complete-collection-data-repositories-part-1.html) (from agriculture and finance to government) 25 | * [part 2](https://www.kdnuggets.com/2022/04/complete-collection-data-repositories-part-2.html) (from healthcare to 
transportation) 26 | 27 | PRs with more datasets are welcome! 28 | 29 | It's not mandatory that you use a dataset from this list. You can use any dataset you want. 30 | -------------------------------------------------------------------------------- /06-streaming/python/streams-example/pyspark/consumer.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | from typing import Dict, List 3 | from kafka import KafkaConsumer 4 | 5 | from settings import BOOTSTRAP_SERVERS, CONSUME_TOPIC_RIDES_CSV 6 | 7 | 8 | class RideCSVConsumer: 9 | def __init__(self, props: Dict): 10 | self.consumer = KafkaConsumer(**props) 11 | 12 | def consume_from_kafka(self, topics: List[str]): 13 | self.consumer.subscribe(topics=topics) 14 | print('Consuming from Kafka started') 15 | print('Available topics to consume: ', self.consumer.subscription()) 16 | while True: 17 | try: 18 | # SIGINT can't be handled when polling, limit timeout to 1 second. 19 | msg = self.consumer.poll(1.0) 20 | if msg is None or msg == {}: 21 | continue 22 | for msg_key, msg_values in msg.items(): 23 | for msg_val in msg_values: 24 | print(f'Key:{msg_val.key}-type({type(msg_val.key)}), ' 25 | f'Value:{msg_val.value}-type({type(msg_val.value)})') 26 | except KeyboardInterrupt: 27 | break 28 | 29 | self.consumer.close() 30 | 31 | 32 | if __name__ == '__main__': 33 | parser = argparse.ArgumentParser(description='Kafka Consumer') 34 | parser.add_argument('--topic', type=str, default=CONSUME_TOPIC_RIDES_CSV) 35 | args = parser.parse_args() 36 | 37 | topic = args.topic 38 | config = { 39 | 'bootstrap_servers': [BOOTSTRAP_SERVERS], 40 | 'auto_offset_reset': 'earliest', 41 | 'enable_auto_commit': True, 42 | 'key_deserializer': lambda key: int(key.decode('utf-8')), 43 | 'value_deserializer': lambda value: value.decode('utf-8'), 44 | 'group_id': 'consumer.group.id.csv-example.1', 45 | } 46 | csv_consumer = RideCSVConsumer(props=config) 47 | csv_consumer.consume_from_kafka(topics=[topic]) 48 | -------------------------------------------------------------------------------- /06-streaming/python/streams-example/redpanda/consumer.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | from typing import Dict, List 3 | from kafka import KafkaConsumer 4 | 5 | from settings import BOOTSTRAP_SERVERS, CONSUME_TOPIC_RIDES_CSV 6 | 7 | 8 | class RideCSVConsumer: 9 | def __init__(self, props: Dict): 10 | self.consumer = KafkaConsumer(**props) 11 | 12 | def consume_from_kafka(self, topics: List[str]): 13 | self.consumer.subscribe(topics=topics) 14 | print('Consuming from Kafka started') 15 | print('Available topics to consume: ', self.consumer.subscription()) 16 | while True: 17 | try: 18 | # SIGINT can't be handled when polling, limit timeout to 1 second. 
19 | msg = self.consumer.poll(1.0) 20 | if msg is None or msg == {}: 21 | continue 22 | for msg_key, msg_values in msg.items(): 23 | for msg_val in msg_values: 24 | print(f'Key:{msg_val.key}-type({type(msg_val.key)}), ' 25 | f'Value:{msg_val.value}-type({type(msg_val.value)})') 26 | except KeyboardInterrupt: 27 | break 28 | 29 | self.consumer.close() 30 | 31 | 32 | if __name__ == '__main__': 33 | parser = argparse.ArgumentParser(description='Kafka Consumer') 34 | parser.add_argument('--topic', type=str, default=CONSUME_TOPIC_RIDES_CSV) 35 | args = parser.parse_args() 36 | 37 | topic = args.topic 38 | config = { 39 | 'bootstrap_servers': [BOOTSTRAP_SERVERS], 40 | 'auto_offset_reset': 'earliest', 41 | 'enable_auto_commit': True, 42 | 'key_deserializer': lambda key: int(key.decode('utf-8')), 43 | 'value_deserializer': lambda value: value.decode('utf-8'), 44 | 'group_id': 'consumer.group.id.csv-example.1', 45 | } 46 | csv_consumer = RideCSVConsumer(props=config) 47 | csv_consumer.consume_from_kafka(topics=[topic]) 48 | -------------------------------------------------------------------------------- /06-streaming/python/json_example/ride.py: -------------------------------------------------------------------------------- 1 | from typing import List, Dict 2 | from decimal import Decimal 3 | from datetime import datetime 4 | 5 | 6 | class Ride: 7 | def __init__(self, arr: List[str]): 8 | self.vendor_id = arr[0] 9 | self.tpep_pickup_datetime = datetime.strptime(arr[1], "%Y-%m-%d %H:%M:%S"), 10 | self.tpep_dropoff_datetime = datetime.strptime(arr[2], "%Y-%m-%d %H:%M:%S"), 11 | self.passenger_count = int(arr[3]) 12 | self.trip_distance = Decimal(arr[4]) 13 | self.rate_code_id = int(arr[5]) 14 | self.store_and_fwd_flag = arr[6] 15 | self.pu_location_id = int(arr[7]) 16 | self.do_location_id = int(arr[8]) 17 | self.payment_type = arr[9] 18 | self.fare_amount = Decimal(arr[10]) 19 | self.extra = Decimal(arr[11]) 20 | self.mta_tax = Decimal(arr[12]) 21 | self.tip_amount = Decimal(arr[13]) 22 | self.tolls_amount = Decimal(arr[14]) 23 | self.improvement_surcharge = Decimal(arr[15]) 24 | self.total_amount = Decimal(arr[16]) 25 | self.congestion_surcharge = Decimal(arr[17]) 26 | 27 | @classmethod 28 | def from_dict(cls, d: Dict): 29 | return cls(arr=[ 30 | d['vendor_id'], 31 | d['tpep_pickup_datetime'][0], 32 | d['tpep_dropoff_datetime'][0], 33 | d['passenger_count'], 34 | d['trip_distance'], 35 | d['rate_code_id'], 36 | d['store_and_fwd_flag'], 37 | d['pu_location_id'], 38 | d['do_location_id'], 39 | d['payment_type'], 40 | d['fare_amount'], 41 | d['extra'], 42 | d['mta_tax'], 43 | d['tip_amount'], 44 | d['tolls_amount'], 45 | d['improvement_surcharge'], 46 | d['total_amount'], 47 | d['congestion_surcharge'], 48 | ] 49 | ) 50 | 51 | def __repr__(self): 52 | return f'{self.__class__.__name__}: {self.__dict__}' 53 | -------------------------------------------------------------------------------- /06-streaming/python/redpanda_example/ride.py: -------------------------------------------------------------------------------- 1 | from typing import List, Dict 2 | from decimal import Decimal 3 | from datetime import datetime 4 | 5 | 6 | class Ride: 7 | def __init__(self, arr: List[str]): 8 | self.vendor_id = arr[0] 9 | self.tpep_pickup_datetime = datetime.strptime(arr[1], "%Y-%m-%d %H:%M:%S"), 10 | self.tpep_dropoff_datetime = datetime.strptime(arr[2], "%Y-%m-%d %H:%M:%S"), 11 | self.passenger_count = int(arr[3]) 12 | self.trip_distance = Decimal(arr[4]) 13 | self.rate_code_id = int(arr[5]) 14 | 
self.store_and_fwd_flag = arr[6] 15 | self.pu_location_id = int(arr[7]) 16 | self.do_location_id = int(arr[8]) 17 | self.payment_type = arr[9] 18 | self.fare_amount = Decimal(arr[10]) 19 | self.extra = Decimal(arr[11]) 20 | self.mta_tax = Decimal(arr[12]) 21 | self.tip_amount = Decimal(arr[13]) 22 | self.tolls_amount = Decimal(arr[14]) 23 | self.improvement_surcharge = Decimal(arr[15]) 24 | self.total_amount = Decimal(arr[16]) 25 | self.congestion_surcharge = Decimal(arr[17]) 26 | 27 | @classmethod 28 | def from_dict(cls, d: Dict): 29 | return cls(arr=[ 30 | d['vendor_id'], 31 | d['tpep_pickup_datetime'][0], 32 | d['tpep_dropoff_datetime'][0], 33 | d['passenger_count'], 34 | d['trip_distance'], 35 | d['rate_code_id'], 36 | d['store_and_fwd_flag'], 37 | d['pu_location_id'], 38 | d['do_location_id'], 39 | d['payment_type'], 40 | d['fare_amount'], 41 | d['extra'], 42 | d['mta_tax'], 43 | d['tip_amount'], 44 | d['tolls_amount'], 45 | d['improvement_surcharge'], 46 | d['total_amount'], 47 | d['congestion_surcharge'], 48 | ] 49 | ) 50 | 51 | def __repr__(self): 52 | return f'{self.__class__.__name__}: {self.__dict__}' 53 | -------------------------------------------------------------------------------- /05-batch/setup/linux.md: -------------------------------------------------------------------------------- 1 | 2 | ## Linux 3 | 4 | Here we'll show you how to install Spark 3.3.2 for Linux. 5 | We tested it on Ubuntu 20.04 (also WSL), but it should work 6 | for other Linux distros as well 7 | 8 | 9 | ### Installing Java 10 | 11 | Download OpenJDK 11 or Oracle JDK 11 (It's important that the version is 11 - spark requires 8 or 11) 12 | 13 | We'll use [OpenJDK](https://jdk.java.net/archive/) 14 | 15 | Download it (e.g. to `~/spark`): 16 | 17 | ``` 18 | wget https://download.java.net/java/GA/jdk11/9/GPL/openjdk-11.0.2_linux-x64_bin.tar.gz 19 | ``` 20 | 21 | Unpack it: 22 | 23 | ```bash 24 | tar xzfv openjdk-11.0.2_linux-x64_bin.tar.gz 25 | ``` 26 | 27 | define `JAVA_HOME` and add it to `PATH`: 28 | 29 | ```bash 30 | export JAVA_HOME="${HOME}/spark/jdk-11.0.2" 31 | export PATH="${JAVA_HOME}/bin:${PATH}" 32 | ``` 33 | 34 | check that it works: 35 | 36 | ```bash 37 | java --version 38 | ``` 39 | 40 | Output: 41 | 42 | ``` 43 | openjdk 11.0.2 2019-01-15 44 | OpenJDK Runtime Environment 18.9 (build 11.0.2+9) 45 | OpenJDK 64-Bit Server VM 18.9 (build 11.0.2+9, mixed mode) 46 | ``` 47 | 48 | Remove the archive: 49 | 50 | ```bash 51 | rm openjdk-11.0.2_linux-x64_bin.tar.gz 52 | ``` 53 | 54 | ### Installing Spark 55 | 56 | 57 | Download Spark. Use 3.3.2 version: 58 | 59 | ```bash 60 | wget https://archive.apache.org/dist/spark/spark-3.3.2/spark-3.3.2-bin-hadoop3.tgz 61 | ``` 62 | 63 | Unpack: 64 | 65 | ```bash 66 | tar xzfv spark-3.3.2-bin-hadoop3.tgz 67 | ``` 68 | 69 | Remove the archive: 70 | 71 | ```bash 72 | rm spark-3.3.2-bin-hadoop3.tgz 73 | ``` 74 | 75 | Add it to `PATH`: 76 | 77 | ```bash 78 | export SPARK_HOME="${HOME}/spark/spark-3.3.2-bin-hadoop3" 79 | export PATH="${SPARK_HOME}/bin:${PATH}" 80 | ``` 81 | 82 | ### Testing Spark 83 | 84 | Execute `spark-shell` and run the following: 85 | 86 | ```scala 87 | val data = 1 to 10000 88 | val distData = sc.parallelize(data) 89 | distData.filter(_ < 10).collect() 90 | ``` 91 | 92 | ### PySpark 93 | 94 | It's the same for all platforms. Go to [pyspark.md](pyspark.md). 
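
As an optional sanity check from Python (a minimal sketch, assuming you have already wired up PySpark as described in [pyspark.md](pyspark.md)), you can confirm that Spark and Java are set up correctly:

```python
# Minimal install check; assumes PYTHONPATH points at the bundled PySpark (see pyspark.md).
import pyspark
from pyspark.sql import SparkSession

spark = SparkSession.builder \
    .master("local[*]") \
    .appName("install-test") \
    .getOrCreate()

print(pyspark.__version__)        # expected: 3.3.2
print(spark.range(1000).count())  # expected: 1000

spark.stop()
```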
95 | -------------------------------------------------------------------------------- /04-analytics-engineering/taxi_rides_ny/dbt_project.yml: -------------------------------------------------------------------------------- 1 | 2 | # Name your project! Project names should contain only lowercase characters 3 | # and underscores. A good package name should reflect your organization's 4 | # name or the intended use of these models 5 | name: 'taxi_rides_ny' 6 | version: '1.0.0' 7 | config-version: 2 8 | 9 | # This setting configures which "profile" dbt uses for this project. 10 | profile: 'default' 11 | 12 | # These configurations specify where dbt should look for different types of files. 13 | # The `model-paths` config, for example, states that models in this project can be 14 | # found in the "models/" directory. You probably won't need to change these! 15 | model-paths: ["models"] 16 | analysis-paths: ["analyses"] 17 | test-paths: ["tests"] 18 | seed-paths: ["seeds"] 19 | macro-paths: ["macros"] 20 | snapshot-paths: ["snapshots"] 21 | 22 | target-path: "target" # directory which will store compiled SQL files 23 | clean-targets: # directories to be removed by `dbt clean` 24 | - "target" 25 | - "dbt_packages" 26 | 27 | 28 | # Configuring models 29 | # Full documentation: https://docs.getdbt.com/docs/configuring-models 30 | 31 | # In dbt, the default materialization for a model is a view. This means, when you run 32 | # dbt run or dbt build, all of your models will be built as a view in your data platform. 33 | # The configuration below will override this setting for models in the example folder to 34 | # instead be materialized as tables. Any models you add to the root of the models folder will 35 | # continue to be built as views. These settings can be overridden in the individual model files 36 | # using the `{{ config(...) }}` macro. 
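# For example (illustrative): starting a model's .sql file with
# `{{ config(materialized='table') }}` takes precedence over the folder-level
# defaults declared below.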
37 | 38 | models: 39 | taxi_rides_ny: 40 | # Applies to all files under models/.../ 41 | staging: 42 | materialized: view 43 | core: 44 | materialized: table 45 | vars: 46 | payment_type_values: [1, 2, 3, 4, 5, 6] 47 | 48 | seeds: 49 | taxi_rides_ny: 50 | taxi_zone_lookup: 51 | +column_types: 52 | locationid: numeric -------------------------------------------------------------------------------- /06-streaming/java/kafka_examples/src/main/java/org/example/data/Ride.java: -------------------------------------------------------------------------------- 1 | package org.example.data; 2 | 3 | import java.nio.DoubleBuffer; 4 | import java.time.LocalDate; 5 | import java.time.LocalDateTime; 6 | import java.time.format.DateTimeFormatter; 7 | 8 | public class Ride { 9 | public Ride(String[] arr) { 10 | VendorID = arr[0]; 11 | tpep_pickup_datetime = LocalDateTime.parse(arr[1], DateTimeFormatter.ofPattern("yyyy-MM-dd HH:mm:ss")); 12 | tpep_dropoff_datetime = LocalDateTime.parse(arr[2], DateTimeFormatter.ofPattern("yyyy-MM-dd HH:mm:ss")); 13 | passenger_count = Integer.parseInt(arr[3]); 14 | trip_distance = Double.parseDouble(arr[4]); 15 | RatecodeID = Long.parseLong(arr[5]); 16 | store_and_fwd_flag = arr[6]; 17 | PULocationID = Long.parseLong(arr[7]); 18 | DOLocationID = Long.parseLong(arr[8]); 19 | payment_type = arr[9]; 20 | fare_amount = Double.parseDouble(arr[10]); 21 | extra = Double.parseDouble(arr[11]); 22 | mta_tax = Double.parseDouble(arr[12]); 23 | tip_amount = Double.parseDouble(arr[13]); 24 | tolls_amount = Double.parseDouble(arr[14]); 25 | improvement_surcharge = Double.parseDouble(arr[15]); 26 | total_amount = Double.parseDouble(arr[16]); 27 | congestion_surcharge = Double.parseDouble(arr[17]); 28 | } 29 | public Ride(){} 30 | public String VendorID; 31 | public LocalDateTime tpep_pickup_datetime; 32 | public LocalDateTime tpep_dropoff_datetime; 33 | public int passenger_count; 34 | public double trip_distance; 35 | public long RatecodeID; 36 | public String store_and_fwd_flag; 37 | public long PULocationID; 38 | public long DOLocationID; 39 | public String payment_type; 40 | public double fare_amount; 41 | public double extra; 42 | public double mta_tax; 43 | public double tip_amount; 44 | public double tolls_amount; 45 | public double improvement_surcharge; 46 | public double total_amount; 47 | public double congestion_surcharge; 48 | 49 | } 50 | -------------------------------------------------------------------------------- /images/mage.svg: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | -------------------------------------------------------------------------------- /cohorts/2023/workshops/piperider.md: -------------------------------------------------------------------------------- 1 | 2 | ## Workshop: Maximizing Confidence in Your Data Model Changes with dbt and PipeRider 3 | 4 | To learn how to use PipeRider together with dbt for detecting changes in model and data, sign up for a workshop 5 | 6 | - Video: https://www.youtube.com/watch?v=O-tyUOQccSs 7 | - Repository: https://github.com/InfuseAI/taxi_rides_ny_duckdb 8 | 9 | 10 | ## Homework 11 | 12 | The following questions follow on from the original Week 4 homework, and so use the same data as required by those questions: 13 | 14 | https://github.com/DataTalksClub/data-engineering-zoomcamp/blob/main/cohorts/2023/week_4_analytics_engineering/homework.md 15 | 16 | Yellow taxi data - Years 2019 and 2020 
17 | Green taxi data - Years 2019 and 2020 18 | fhv data - Year 2019. 19 | 20 | ### Question 1: 21 | 22 | What is the distribution between vendor id filtering by years 2019 and 2020 data? 23 | 24 | You will need to run PipeRider and check the report 25 | 26 | * 70.1/29.6/0.5 27 | * 60.1/39.5/0.4 28 | * 90.2/9.5/0.3 29 | * 80.1/19.7/0.2 30 | 31 | ### Question 2: 32 | 33 | What is the composition of total amount (positive/zero/negative) filtering by years 2019 and 2020 data? 34 | 35 | You will need to run PipeRider and check the report 36 | 37 | 38 | * 51.4M/15K/48.6K 39 | * 21.4M/5K/248.6K 40 | * 61.4M/25K/148.6K 41 | * 81.4M/35K/14.6K 42 | 43 | ### Question 3: 44 | 45 | What is the numeric statistics (average/standard deviation/min/max/sum) of trip distances filtering by years 2019 and 2020 data? 46 | 47 | You will need to run PipeRider and check the report 48 | 49 | 50 | * 1.95/35.43/0/16.3K/151.5M 51 | * 3.95/25.43/23.88/267.3K/281.5M 52 | * 5.95/75.43/-63.88/67.3K/81.5M 53 | * 2.95/35.43/-23.88/167.3K/181.5M 54 | 55 | 56 | 57 | ## Submitting the solutions 58 | 59 | * Form for submitting: https://forms.gle/WyLQHBu1DNwNTfqe8 60 | * You can submit your homework multiple times. In this case, only the last submission will be used. 61 | 62 | Deadline: 20 March, 22:00 CET 63 | 64 | 65 | ## Solution 66 | 67 | Video: https://www.youtube.com/watch?v=inNrUys7W8U&list=PL3MmuxUbc_hJjEePXIdE-LVUx_1ZZjYGW 68 | -------------------------------------------------------------------------------- /04-analytics-engineering/taxi_rides_ny/models/staging/stg_yellow_tripdata.sql: -------------------------------------------------------------------------------- 1 | {{ config(materialized='view') }} 2 | 3 | with tripdata as 4 | ( 5 | select *, 6 | row_number() over(partition by vendorid, tpep_pickup_datetime) as rn 7 | from {{ source('staging','yellow_tripdata') }} 8 | where vendorid is not null 9 | ) 10 | select 11 | -- identifiers 12 | {{ dbt_utils.generate_surrogate_key(['vendorid', 'tpep_pickup_datetime']) }} as tripid, 13 | {{ dbt.safe_cast("vendorid", api.Column.translate_type("integer")) }} as vendorid, 14 | {{ dbt.safe_cast("ratecodeid", api.Column.translate_type("integer")) }} as ratecodeid, 15 | {{ dbt.safe_cast("pulocationid", api.Column.translate_type("integer")) }} as pickup_locationid, 16 | {{ dbt.safe_cast("dolocationid", api.Column.translate_type("integer")) }} as dropoff_locationid, 17 | 18 | -- timestamps 19 | cast(tpep_pickup_datetime as timestamp) as pickup_datetime, 20 | cast(tpep_dropoff_datetime as timestamp) as dropoff_datetime, 21 | 22 | -- trip info 23 | store_and_fwd_flag, 24 | {{ dbt.safe_cast("passenger_count", api.Column.translate_type("integer")) }} as passenger_count, 25 | cast(trip_distance as numeric) as trip_distance, 26 | -- yellow cabs are always street-hail 27 | 1 as trip_type, 28 | 29 | -- payment info 30 | cast(fare_amount as numeric) as fare_amount, 31 | cast(extra as numeric) as extra, 32 | cast(mta_tax as numeric) as mta_tax, 33 | cast(tip_amount as numeric) as tip_amount, 34 | cast(tolls_amount as numeric) as tolls_amount, 35 | cast(0 as numeric) as ehail_fee, 36 | cast(improvement_surcharge as numeric) as improvement_surcharge, 37 | cast(total_amount as numeric) as total_amount, 38 | coalesce({{ dbt.safe_cast("payment_type", api.Column.translate_type("integer")) }},0) as payment_type, 39 | {{ get_payment_type_description('payment_type') }} as payment_type_description 40 | from tripdata 41 | where rn = 1 42 | 43 | -- dbt build --select --vars '{'is_test_run: false}' 44 | 
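-- The block below keeps development runs cheap: with the default is_test_run=true the
-- model is limited to 100 rows; passing is_test_run=false via --vars builds the full table.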
{% if var('is_test_run', default=true) %} 45 | 46 | limit 100 47 | 48 | {% endif %} -------------------------------------------------------------------------------- /06-streaming/python/redpanda_example/consumer.py: -------------------------------------------------------------------------------- 1 | import os 2 | from typing import Dict, List 3 | from json import loads 4 | from kafka import KafkaConsumer 5 | 6 | from ride import Ride 7 | from settings import BOOTSTRAP_SERVERS, KAFKA_TOPIC 8 | 9 | 10 | class JsonConsumer: 11 | def __init__(self, props: Dict): 12 | self.consumer = KafkaConsumer(**props) 13 | 14 | def consume_from_kafka(self, topics: List[str]): 15 | self.consumer.subscribe(topics) 16 | print('Consuming from Kafka started') 17 | print('Available topics to consume: ', self.consumer.subscription()) 18 | while True: 19 | try: 20 | # SIGINT can't be handled when polling, limit timeout to 1 second. 21 | message = self.consumer.poll(1.0) 22 | if message is None or message == {}: 23 | continue 24 | for message_key, message_value in message.items(): 25 | for msg_val in message_value: 26 | print(msg_val.key, msg_val.value) 27 | except KeyboardInterrupt: 28 | break 29 | 30 | self.consumer.close() 31 | 32 | 33 | if __name__ == '__main__': 34 | config = { 35 | 'bootstrap_servers': BOOTSTRAP_SERVERS, 36 | 'auto_offset_reset': 'earliest', 37 | 'enable_auto_commit': True, 38 | 'key_deserializer': lambda key: int(key.decode('utf-8')), 39 | 'value_deserializer': lambda x: loads(x.decode('utf-8'), object_hook=lambda d: Ride.from_dict(d)), 40 | 'group_id': 'consumer.group.id.json-example.1', 41 | } 42 | 43 | json_consumer = JsonConsumer(props=config) 44 | json_consumer.consume_from_kafka(topics=[KAFKA_TOPIC]) 45 | 46 | 47 | # There's no schema enforcement with plain JSON: if the schema changes (a column is removed or added, or a data type changes), the Ride class still works and messages keep being produced and consumed without a hitch. 48 | # The problem only shows up downstream, in analytics: the dataset no longer has that column, so the dashboards built on it fail and trust in our data and processes erodes.
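# To make the point above concrete, the sketch below (illustrative only, never called by
# this script; the field names are just examples) shows how a field dropped by a producer
# slips through the schema-less JSON path and only fails once downstream code expects it.
def _schema_drift_demo():
    import json
    full_row = {'vendor_id': '1', 'passenger_count': 1, 'congestion_surcharge': 2.5}
    drifted_row = {'vendor_id': '1', 'passenger_count': 1}  # column silently removed upstream
    for payload in (full_row, drifted_row):
        decoded = json.loads(json.dumps(payload))  # produce/consume still succeeds
        try:
            _ = decoded['congestion_surcharge']    # the downstream access is what breaks
        except KeyError as missing:
            print(f'Downstream failure, missing column: {missing}')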
-------------------------------------------------------------------------------- /04-analytics-engineering/taxi_rides_ny/models/staging/stg_green_tripdata.sql: -------------------------------------------------------------------------------- 1 | {{ 2 | config( 3 | materialized='view' 4 | ) 5 | }} 6 | 7 | with tripdata as 8 | ( 9 | select *, 10 | row_number() over(partition by vendorid, lpep_pickup_datetime) as rn 11 | from {{ source('staging','green_tripdata') }} 12 | where vendorid is not null 13 | ) 14 | select 15 | -- identifiers 16 | {{ dbt_utils.generate_surrogate_key(['vendorid', 'lpep_pickup_datetime']) }} as tripid, 17 | {{ dbt.safe_cast("vendorid", api.Column.translate_type("integer")) }} as vendorid, 18 | {{ dbt.safe_cast("ratecodeid", api.Column.translate_type("integer")) }} as ratecodeid, 19 | {{ dbt.safe_cast("pulocationid", api.Column.translate_type("integer")) }} as pickup_locationid, 20 | {{ dbt.safe_cast("dolocationid", api.Column.translate_type("integer")) }} as dropoff_locationid, 21 | 22 | -- timestamps 23 | cast(lpep_pickup_datetime as timestamp) as pickup_datetime, 24 | cast(lpep_dropoff_datetime as timestamp) as dropoff_datetime, 25 | 26 | -- trip info 27 | store_and_fwd_flag, 28 | {{ dbt.safe_cast("passenger_count", api.Column.translate_type("integer")) }} as passenger_count, 29 | cast(trip_distance as numeric) as trip_distance, 30 | {{ dbt.safe_cast("trip_type", api.Column.translate_type("integer")) }} as trip_type, 31 | 32 | -- payment info 33 | cast(fare_amount as numeric) as fare_amount, 34 | cast(extra as numeric) as extra, 35 | cast(mta_tax as numeric) as mta_tax, 36 | cast(tip_amount as numeric) as tip_amount, 37 | cast(tolls_amount as numeric) as tolls_amount, 38 | cast(ehail_fee as numeric) as ehail_fee, 39 | cast(improvement_surcharge as numeric) as improvement_surcharge, 40 | cast(total_amount as numeric) as total_amount, 41 | coalesce({{ dbt.safe_cast("payment_type", api.Column.translate_type("integer")) }},0) as payment_type, 42 | {{ get_payment_type_description("payment_type") }} as payment_type_description 43 | from tripdata 44 | where rn = 1 45 | 46 | 47 | -- dbt build --select --vars '{'is_test_run': 'false'}' 48 | {% if var('is_test_run', default=true) %} 49 | 50 | limit 100 51 | 52 | {% endif %} -------------------------------------------------------------------------------- /02-workflow-orchestration/flows/03_postgres_dbt.yaml: -------------------------------------------------------------------------------- 1 | id: 03_postgres_dbt 2 | namespace: zoomcamp 3 | inputs: 4 | - id: dbt_command 5 | type: SELECT 6 | allowCustomValue: true 7 | defaults: dbt build 8 | values: 9 | - dbt build 10 | - dbt debug # use when running the first time to validate DB connection 11 | tasks: 12 | - id: sync 13 | type: io.kestra.plugin.git.SyncNamespaceFiles 14 | url: https://github.com/DataTalksClub/data-engineering-zoomcamp 15 | branch: main 16 | namespace: "{{ flow.namespace }}" 17 | gitDirectory: 04-analytics-engineering/taxi_rides_ny 18 | dryRun: false 19 | # disabled: true # this Git Sync is needed only when running it the first time, afterwards the task can be disabled 20 | 21 | - id: dbt-build 22 | type: io.kestra.plugin.dbt.cli.DbtCLI 23 | env: 24 | DBT_DATABASE: postgres-zoomcamp 25 | DBT_SCHEMA: public 26 | namespaceFiles: 27 | enabled: true 28 | containerImage: ghcr.io/kestra-io/dbt-postgres:latest 29 | taskRunner: 30 | type: io.kestra.plugin.scripts.runner.docker.Docker 31 | commands: 32 | - dbt deps 33 | - "{{ inputs.dbt_command }}" 34 | storeManifest: 35 | 
key: manifest.json 36 | namespace: "{{ flow.namespace }}" 37 | profiles: | 38 | default: 39 | outputs: 40 | dev: 41 | type: postgres 42 | host: host.docker.internal 43 | user: kestra 44 | password: k3str4 45 | port: 5432 46 | dbname: postgres-zoomcamp 47 | schema: public 48 | threads: 8 49 | connect_timeout: 10 50 | priority: interactive 51 | target: dev 52 | description: | 53 | Note that you need to adjust the models/staging/schema.yml file to match your database and schema. Select and edit that Namespace File from the UI. Save and run this flow. Once https://github.com/DataTalksClub/data-engineering-zoomcamp/pull/565/files is merged, you can ignore this note as it will be dynamically adjusted based on env variables. 54 | ```yaml 55 | sources: 56 | - name: staging 57 | database: postgres-zoomcamp 58 | schema: public 59 | ``` 60 | -------------------------------------------------------------------------------- /01-docker-terraform/1_terraform_gcp/2_gcp_overview.md: -------------------------------------------------------------------------------- 1 | ## GCP Overview 2 | 3 | [Video](https://www.youtube.com/watch?v=18jIzE41fJ4&list=PL3MmuxUbc_hJed7dXYoJw8DoCuVHhGEQb&index=2) 4 | 5 | 6 | ### Project infrastructure modules in GCP: 7 | * Google Cloud Storage (GCS): Data Lake 8 | * BigQuery: Data Warehouse 9 | 10 | (Concepts explained in Week 2 - Data Ingestion) 11 | 12 | ### Initial Setup 13 | 14 | For this course, we'll use a free version (upto EUR 300 credits). 15 | 16 | 1. Create an account with your Google email ID 17 | 2. Setup your first [project](https://console.cloud.google.com/) if you haven't already 18 | * eg. "DTC DE Course", and note down the "Project ID" (we'll use this later when deploying infra with TF) 19 | 3. Setup [service account & authentication](https://cloud.google.com/docs/authentication/getting-started) for this project 20 | * Grant `Viewer` role to begin with. 21 | * Download service-account-keys (.json) for auth. 22 | 4. Download [SDK](https://cloud.google.com/sdk/docs/quickstart) for local setup 23 | 5. Set environment variable to point to your downloaded GCP keys: 24 | ```shell 25 | export GOOGLE_APPLICATION_CREDENTIALS=".json" 26 | 27 | # Refresh token/session, and verify authentication 28 | gcloud auth application-default login 29 | ``` 30 | 31 | ### Setup for Access 32 | 33 | 1. [IAM Roles](https://cloud.google.com/storage/docs/access-control/iam-roles) for Service account: 34 | * Go to the *IAM* section of *IAM & Admin* https://console.cloud.google.com/iam-admin/iam 35 | * Click the *Edit principal* icon for your service account. 36 | * Add these roles in addition to *Viewer* : **Storage Admin** + **Storage Object Admin** + **BigQuery Admin** 37 | 38 | 2. Enable these APIs for your project: 39 | * https://console.cloud.google.com/apis/library/iam.googleapis.com 40 | * https://console.cloud.google.com/apis/library/iamcredentials.googleapis.com 41 | 42 | 3. Please ensure `GOOGLE_APPLICATION_CREDENTIALS` env-var is set. 
43 | ```shell 44 | export GOOGLE_APPLICATION_CREDENTIALS=".json" 45 | ``` 46 | 47 | ### Terraform Workshop to create GCP Infra 48 | Continue [here](./terraform): `week_1_basics_n_setup/1_terraform_gcp/terraform` 49 | -------------------------------------------------------------------------------- /02-workflow-orchestration/flows/07_gcp_dbt.yaml: -------------------------------------------------------------------------------- 1 | id: 07_gcp_dbt 2 | namespace: zoomcamp 3 | inputs: 4 | - id: dbt_command 5 | type: SELECT 6 | allowCustomValue: true 7 | defaults: dbt build 8 | values: 9 | - dbt build 10 | - dbt debug # use when running the first time to validate DB connection 11 | 12 | tasks: 13 | - id: sync 14 | type: io.kestra.plugin.git.SyncNamespaceFiles 15 | url: https://github.com/DataTalksClub/data-engineering-zoomcamp 16 | branch: main 17 | namespace: "{{flow.namespace}}" 18 | gitDirectory: 04-analytics-engineering/taxi_rides_ny 19 | dryRun: false 20 | # disabled: true # this Git Sync is needed only when running it the first time, afterwards the task can be disabled 21 | 22 | - id: dbt-build 23 | type: io.kestra.plugin.dbt.cli.DbtCLI 24 | env: 25 | DBT_DATABASE: "{{kv('GCP_PROJECT_ID')}}" 26 | DBT_SCHEMA: "{{kv('GCP_DATASET')}}" 27 | namespaceFiles: 28 | enabled: true 29 | containerImage: ghcr.io/kestra-io/dbt-bigquery:latest 30 | taskRunner: 31 | type: io.kestra.plugin.scripts.runner.docker.Docker 32 | inputFiles: 33 | sa.json: "{{kv('GCP_CREDS')}}" 34 | commands: 35 | - dbt deps 36 | - "{{ inputs.dbt_command }}" 37 | storeManifest: 38 | key: manifest.json 39 | namespace: "{{ flow.namespace }}" 40 | profiles: | 41 | default: 42 | outputs: 43 | dev: 44 | type: bigquery 45 | dataset: "{{kv('GCP_DATASET')}}" 46 | project: "{{kv('GCP_PROJECT_ID')}}" 47 | location: "{{kv('GCP_LOCATION')}}" 48 | keyfile: sa.json 49 | method: service-account 50 | priority: interactive 51 | threads: 16 52 | timeout_seconds: 300 53 | fixed_retries: 1 54 | target: dev 55 | description: | 56 | Note that you need to adjust the models/staging/schema.yml file to match your database and schema. Select and edit that Namespace File from the UI. Save and run this flow. Once https://github.com/DataTalksClub/data-engineering-zoomcamp/pull/565/files is merged, you can ignore this note as it will be dynamically adjusted based on env variables. 57 | ```yaml 58 | sources: 59 | - name: staging 60 | database: kestra-sandbox 61 | schema: zoomcamp 62 | ``` 63 | -------------------------------------------------------------------------------- /cohorts/2022/week_5_batch_processing/homework.md: -------------------------------------------------------------------------------- 1 | ## Week 5 Homework 2 | 3 | In this homework we'll put what we learned about Spark 4 | in practice. 5 | 6 | We'll use high volume for-hire vehicles (HVFHV) dataset for that. 7 | 8 | ## Question 1. Install Spark and PySpark 9 | 10 | * Install Spark 11 | * Run PySpark 12 | * Create a local spark session 13 | * Execute `spark.version` 14 | 15 | What's the output? 16 | 17 | 18 | ## Question 2. HVFHW February 2021 19 | 20 | Download the HVFHV data for february 2021: 21 | 22 | ```bash 23 | wget https://nyc-tlc.s3.amazonaws.com/trip+data/fhvhv_tripdata_2021-02.csv 24 | ``` 25 | 26 | Read it with Spark using the same schema as we did 27 | in the lessons. We will use this dataset for all 28 | the remaining questions. 29 | 30 | Repartition it to 24 partitions and save it to 31 | parquet. 32 | 33 | What's the size of the folder with results (in MB)? 
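
If you are unsure where to start, a minimal PySpark sketch for this question could look like the one below. The schema and the output path are assumptions: the columns match the HVFHV layout used in the lessons, so verify them against the header of your download.

```python
from pyspark.sql import SparkSession, types

spark = SparkSession.builder.master("local[*]").appName("hw5-q2").getOrCreate()

# Assumed schema -- same HVFHV layout as in the lessons; check your file's header.
schema = types.StructType([
    types.StructField("hvfhs_license_num", types.StringType(), True),
    types.StructField("dispatching_base_num", types.StringType(), True),
    types.StructField("pickup_datetime", types.TimestampType(), True),
    types.StructField("dropoff_datetime", types.TimestampType(), True),
    types.StructField("PULocationID", types.IntegerType(), True),
    types.StructField("DOLocationID", types.IntegerType(), True),
    types.StructField("SR_Flag", types.StringType(), True),
])

df = spark.read.option("header", "true").schema(schema).csv("fhvhv_tripdata_2021-02.csv")
df.repartition(24).write.parquet("fhvhv/2021/02/", mode="overwrite")
```

You can then check the size of the output folder, for example with `du -h fhvhv/2021/02/`.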
34 | 35 | 36 | ## Question 3. Count records 37 | 38 | How many taxi trips were there on February 15? 39 | 40 | Consider only trips that started on February 15. 41 | 42 | 43 | ## Question 4. Longest trip for each day 44 | 45 | Now calculate the duration for each trip. 46 | 47 | Trip starting on which day was the longest? 48 | 49 | 50 | ## Question 5. Most frequent `dispatching_base_num` 51 | 52 | Now find the most frequently occurring `dispatching_base_num` 53 | in this dataset. 54 | 55 | How many stages this spark job has? 56 | 57 | > Note: the answer may depend on how you write the query, 58 | > so there are multiple correct answers. 59 | > Select the one you have. 60 | 61 | 62 | ## Question 6. Most common locations pair 63 | 64 | Find the most common pickup-dropoff pair. 65 | 66 | For example: 67 | 68 | "Jamaica Bay / Clinton East" 69 | 70 | Enter two zone names separated by a slash 71 | 72 | If any of the zone names are unknown (missing), use "Unknown". For example, "Unknown / Clinton East". 73 | 74 | 75 | ## Bonus question. Join type 76 | 77 | (not graded) 78 | 79 | For finding the answer to Q6, you'll need to perform a join. 80 | 81 | What type of join is it? 82 | 83 | And how many stages your spark job has? 84 | 85 | 86 | ## Submitting the solutions 87 | 88 | * Form for submitting: https://forms.gle/dBkVK9yT8cSMDwuw7 89 | * You can submit your homework multiple times. In this case, only the last submission will be used. 90 | 91 | Deadline: 07 March (Monday), 22:00 CET 92 | -------------------------------------------------------------------------------- /04-analytics-engineering/taxi_rides_ny/README.md: -------------------------------------------------------------------------------- 1 | Welcome to your new dbt project! 2 | 3 | ### How to run this project 4 | ### About the project 5 | This project is based in [dbt starter project](https://github.com/dbt-labs/dbt-starter-project) (generated by running `dbt init`) 6 | Try running the following commands: 7 | - dbt run 8 | - dbt test 9 | 10 | A project includes the following files: 11 | - dbt_project.yml: file used to configure the dbt project. If you are using dbt locally, make sure the profile here matches the one setup during installation in ~/.dbt/profiles.yml 12 | - *.yml files under folders models, data, macros: documentation files 13 | - csv files in the data folder: these will be our sources, files described above 14 | - Files inside folder models: The sql files contain the scripts to run our models, this will cover staging, core and a datamarts models. At the end, these models will follow this structure: 15 | 16 | ![image](https://user-images.githubusercontent.com/4315804/152691312-e71b56a4-53ff-4884-859c-c9090dbd0db8.png) 17 | 18 | 19 | #### Workflow 20 | ![image](https://user-images.githubusercontent.com/4315804/148699280-964c4e0b-e685-4c0f-a266-4f3e097156c9.png) 21 | 22 | #### Execution 23 | After having installed the required tools and cloning this repo, execute the following commnads: 24 | 25 | 1. Change into the project's directory from the command line: `$ cd [..]/taxi_rides_ny` 26 | 2. Load the CSVs into the database. This materializes the CSVs as tables in your target schema: `$ dbt seed` 27 | 3. Run the models: `$ dbt run` 28 | 4. Test your data: `$ dbt test` 29 | _Alternative: use `$ dbt build` to execute with one command the 3 steps above together_ 30 | 5. Generate documentation for the project: `$ dbt docs generate` 31 | 6. 
View the documentation for the project, this step should open the documentation page on a webserver, but it can also be accessed from http://localhost:8080 : `$ dbt docs serve` 32 | 33 | ### dbt resources: 34 | - Learn more about dbt [in the docs](https://docs.getdbt.com/docs/introduction) 35 | - Check out [Discourse](https://discourse.getdbt.com/) for commonly asked questions and answers 36 | - Join the [chat](http://slack.getdbt.com/) on Slack for live discussions and support 37 | - Find [dbt events](https://events.getdbt.com) near you 38 | - Check out [the blog](https://blog.getdbt.com/) for the latest news on dbt's development and best practices -------------------------------------------------------------------------------- /03-data-warehouse/extras/web_to_gcs.py: -------------------------------------------------------------------------------- 1 | import io 2 | import os 3 | import requests 4 | import pandas as pd 5 | from google.cloud import storage 6 | 7 | """ 8 | Pre-reqs: 9 | 1. `pip install pandas pyarrow google-cloud-storage` 10 | 2. Set GOOGLE_APPLICATION_CREDENTIALS to your project/service-account key 11 | 3. Set GCP_GCS_BUCKET as your bucket or change default value of BUCKET 12 | """ 13 | 14 | # services = ['fhv','green','yellow'] 15 | init_url = 'https://github.com/DataTalksClub/nyc-tlc-data/releases/download/' 16 | # switch out the bucketname 17 | BUCKET = os.environ.get("GCP_GCS_BUCKET", "dtc-data-lake-bucketname") 18 | 19 | 20 | def upload_to_gcs(bucket, object_name, local_file): 21 | """ 22 | Ref: https://cloud.google.com/storage/docs/uploading-objects#storage-upload-object-python 23 | """ 24 | # # WORKAROUND to prevent timeout for files > 6 MB on 800 kbps upload speed. 25 | # # (Ref: https://github.com/googleapis/python-storage/issues/74) 26 | # storage.blob._MAX_MULTIPART_SIZE = 5 * 1024 * 1024 # 5 MB 27 | # storage.blob._DEFAULT_CHUNKSIZE = 5 * 1024 * 1024 # 5 MB 28 | 29 | client = storage.Client() 30 | bucket = client.bucket(bucket) 31 | blob = bucket.blob(object_name) 32 | blob.upload_from_filename(local_file) 33 | 34 | 35 | def web_to_gcs(year, service): 36 | for i in range(12): 37 | 38 | # sets the month part of the file_name string 39 | month = '0'+str(i+1) 40 | month = month[-2:] 41 | 42 | # csv file_name 43 | file_name = f"{service}_tripdata_{year}-{month}.csv.gz" 44 | 45 | # download it using requests via a pandas df 46 | request_url = f"{init_url}{service}/{file_name}" 47 | r = requests.get(request_url) 48 | open(file_name, 'wb').write(r.content) 49 | print(f"Local: {file_name}") 50 | 51 | # read it back into a parquet file 52 | df = pd.read_csv(file_name, compression='gzip') 53 | file_name = file_name.replace('.csv.gz', '.parquet') 54 | df.to_parquet(file_name, engine='pyarrow') 55 | print(f"Parquet: {file_name}") 56 | 57 | # upload it to gcs 58 | upload_to_gcs(BUCKET, f"{service}/{file_name}", file_name) 59 | print(f"GCS: {service}/{file_name}") 60 | 61 | 62 | web_to_gcs('2019', 'green') 63 | web_to_gcs('2020', 'green') 64 | # web_to_gcs('2019', 'yellow') 65 | # web_to_gcs('2020', 'yellow') 66 | 67 | -------------------------------------------------------------------------------- /06-streaming/java/kafka_examples/src/main/java/org/example/JsonProducerPickupLocation.java: -------------------------------------------------------------------------------- 1 | package org.example; 2 | 3 | import com.opencsv.exceptions.CsvException; 4 | import org.apache.kafka.clients.producer.KafkaProducer; 5 | import org.apache.kafka.clients.producer.ProducerConfig; 6 | import 
org.apache.kafka.clients.producer.ProducerRecord; 7 | import org.example.data.PickupLocation; 8 | 9 | import java.io.IOException; 10 | import java.time.LocalDateTime; 11 | import java.util.Properties; 12 | import java.util.concurrent.ExecutionException; 13 | 14 | public class JsonProducerPickupLocation { 15 | private Properties props = new Properties(); 16 | 17 | public JsonProducerPickupLocation() { 18 | props.put(ProducerConfig.BOOTSTRAP_SERVERS_CONFIG, "pkc-75m1o.europe-west3.gcp.confluent.cloud:9092"); 19 | props.put("security.protocol", "SASL_SSL"); 20 | props.put("sasl.jaas.config", "org.apache.kafka.common.security.plain.PlainLoginModule required username='"+Secrets.KAFKA_CLUSTER_KEY+"' password='"+Secrets.KAFKA_CLUSTER_SECRET+"';"); 21 | props.put("sasl.mechanism", "PLAIN"); 22 | props.put("client.dns.lookup", "use_all_dns_ips"); 23 | props.put("session.timeout.ms", "45000"); 24 | props.put(ProducerConfig.ACKS_CONFIG, "all"); 25 | props.put(ProducerConfig.KEY_SERIALIZER_CLASS_CONFIG, "org.apache.kafka.common.serialization.StringSerializer"); 26 | props.put(ProducerConfig.VALUE_SERIALIZER_CLASS_CONFIG, "io.confluent.kafka.serializers.KafkaJsonSerializer"); 27 | } 28 | 29 | public void publish(PickupLocation pickupLocation) throws ExecutionException, InterruptedException { 30 | KafkaProducer kafkaProducer = new KafkaProducer(props); 31 | var record = kafkaProducer.send(new ProducerRecord<>("rides_location", String.valueOf(pickupLocation.PULocationID), pickupLocation), (metadata, exception) -> { 32 | if (exception != null) { 33 | System.out.println(exception.getMessage()); 34 | } 35 | }); 36 | System.out.println(record.get().offset()); 37 | } 38 | 39 | 40 | public static void main(String[] args) throws IOException, CsvException, ExecutionException, InterruptedException { 41 | var producer = new JsonProducerPickupLocation(); 42 | producer.publish(new PickupLocation(186, LocalDateTime.now())); 43 | } 44 | } 45 | -------------------------------------------------------------------------------- /cohorts/2023/README.md: -------------------------------------------------------------------------------- 1 | ## Data Engineering Zoomcamp 2023 Cohort 2 | 3 | * [Launch stream with course overview](https://www.youtube.com/watch?v=-zpVha7bw5A) 4 | * [Course Google calendar](https://calendar.google.com/calendar/?cid=ZXIxcjA1M3ZlYjJpcXU0dTFmaG02MzVxMG9AZ3JvdXAuY2FsZW5kYXIuZ29vZ2xlLmNvbQ) 5 | * [FAQ](https://docs.google.com/document/d/19bnYs80DwuUimHM65UV3sylsCn2j1vziPOwzBwQrebw/edit?usp=sharing) 6 | * [Public Leaderboard](leaderboard.md) and [Private Leaderboard](https://docs.google.com/spreadsheets/d/e/2PACX-1vTbL00GcdQp0bJt9wf1ROltMq7s3qyxl-NYF7Pvk79Jfxgwfn9dNWmPD_yJHTDq_Wzvps8EIr6cOKWm/pubhtml) 7 | * [Course Playlist: Only 2023 Live videos & homeworks](https://www.youtube.com/playlist?list=PL3MmuxUbc_hJjEePXIdE-LVUx_1ZZjYGW) 8 | 9 | [**Week 1: Introduction & Prerequisites**](week_1_docker_sql/) 10 | 11 | * [Homework SQL](week_1_docker_sql/homework.md) and [solution](https://www.youtube.com/watch?v=KIh_9tZiroA) 12 | * [Homework Terraform](week_1_terraform/homework.md) 13 | * [Office hours](https://www.youtube.com/watch?v=RVTryVvSyw4&list=PL3MmuxUbc_hJjEePXIdE-LVUx_1ZZjYGW) 14 | 15 | [**Week 2: Workflow Orchestration**](week_2_workflow_orchestration) 16 | 17 | * [Homework](week_2_workflow_orchestration/homework.md) 18 | * [Office hours part 1](https://www.youtube.com/watch?v=a_nmLHb8hzw&list=PL3MmuxUbc_hJjEePXIdE-LVUx_1ZZjYGW) and [part 
2](https://www.youtube.com/watch?v=PK8yyMY54Vk&list=PL3MmuxUbc_hJjEePXIdE-LVUx_1ZZjYGW&index=7) 19 | 20 | [**Week 3: Data Warehouse**](week_3_data_warehouse) 21 | 22 | * [Homework](week_3_data_warehouse/homework.md) 23 | * [Office hours](https://www.youtube.com/watch?v=QXfmtJp3bXE&list=PL3MmuxUbc_hJjEePXIdE-LVUx_1ZZjYGW) 24 | 25 | [**Week 4: Analytics Engineering**](week_4_analytics_engineering/) 26 | 27 | * [Homework](week_4_analytics_engineering/homework.md) 28 | * [PipeRider + dbt Workshop](workshops/piperider.md) 29 | * [Office hours](https://www.youtube.com/watch?v=ODYg_r72qaE&list=PL3MmuxUbc_hJjEePXIdE-LVUx_1ZZjYGW) 30 | 31 | [**Week 5: Batch processing**](week_5_batch_processing/) 32 | 33 | * [Homework](week_5_batch_processing/homework.md) 34 | * [Office hours](https://www.youtube.com/watch?v=5_69yL2PPYI&list=PL3MmuxUbc_hJjEePXIdE-LVUx_1ZZjYGW) 35 | 36 | [**Week 6: Stream Processing**](week_6_stream_processing) 37 | 38 | * [Homework](week_6_stream_processing/homework.md) 39 | 40 | 41 | [**Week 7, 8 & 9: Project**](project.md) 42 | 43 | More information [here](project.md) 44 | -------------------------------------------------------------------------------- /dataset.md: -------------------------------------------------------------------------------- 1 | [Medium article](https://medium.com/@NYCTLC/what-makes-a-city-street-smart-23496d92f60d) 2 | 3 | [Trip record user guide](https://www1.nyc.gov/assets/tlc/downloads/pdf/trip_record_user_guide.pdf) 4 | 5 | The data set is divided into 4 parts: 6 | 7 | - Yellow cabs 8 | - Green cabs 9 | - For Hire Vehicles 10 | - High volume for hire vehicles 11 | 12 | 13 | 14 | Below I am only concentrating on Yellow and green cabs 15 | 16 | ### Yellow and green cabs 17 | 18 | , 19 | 20 | | Columns | Definition | Example | 21 | | --------------------- | ---------- | ------------------- | 22 | | VendorID | | 2 | 23 | | lpep_pickup_datetime | | 2021-01-01 00:15:56 | 24 | | lpep_dropoff_datetime | | 2021-01-01 00:19:52 | 25 | | store_and_fwd_flag | | N, | 26 | | RatecodeID | | 1 | 27 | | PULocationID | | 43 | 28 | | DOLocationID | | 151 | 29 | | passenger_count | | 1 | 30 | | trip_distance | | 1.01 | 31 | | fare_amount | | 5.5 | 32 | | extra | | 0.5 | 33 | | mta_tax | | 0.5 | 34 | | tip_amount | | 0 | 35 | | tolls_amount | | 0 | 36 | | ehail_fee | | | 37 | | improvement_surcharge | | 0.3 | 38 | | total_amount | | 6.8 | 39 | | payment_type | | 2 | 40 | | trip_type | | 1 | 41 | | congestion_surcharge | | 0 | 42 | 43 | 44 | 45 | ### Taxi zone Loopup 46 | 47 | | Columns | Definition | Example | 48 | | ------------ | ---------- | -------------- | 49 | | LocationID | | 1 | 50 | | Borough | | EWR | 51 | | Zone | | Newark Airport | 52 | | service_zone | | EWR | 53 | 54 | [Shapefile from S3](https://s3.amazonaws.com/nyctlc/misc/taxi_zones.zip) 55 | 56 | [Taxi zones](https://data.cityofnewyork.us/Transportation/NYC-Taxi-Zones/d3c5-ddgc) 57 | 58 | -------------------------------------------------------------------------------- /03-data-warehouse/big_query.sql: -------------------------------------------------------------------------------- 1 | -- Query public available table 2 | SELECT station_id, name FROM 3 | bigquery-public-data.new_york_citibike.citibike_stations 4 | LIMIT 100; 5 | 6 | 7 | -- Creating external table referring to gcs path 8 | CREATE OR REPLACE EXTERNAL TABLE `taxi-rides-ny.nytaxi.external_yellow_tripdata` 9 | OPTIONS ( 10 | format = 'CSV', 11 | uris = ['gs://nyc-tl-data/trip data/yellow_tripdata_2019-*.csv', 'gs://nyc-tl-data/trip 
data/yellow_tripdata_2020-*.csv'] 12 | ); 13 | 14 | -- Check yello trip data 15 | SELECT * FROM taxi-rides-ny.nytaxi.external_yellow_tripdata limit 10; 16 | 17 | -- Create a non partitioned table from external table 18 | CREATE OR REPLACE TABLE taxi-rides-ny.nytaxi.yellow_tripdata_non_partitoned AS 19 | SELECT * FROM taxi-rides-ny.nytaxi.external_yellow_tripdata; 20 | 21 | 22 | -- Create a partitioned table from external table 23 | CREATE OR REPLACE TABLE taxi-rides-ny.nytaxi.yellow_tripdata_partitoned 24 | PARTITION BY 25 | DATE(tpep_pickup_datetime) AS 26 | SELECT * FROM taxi-rides-ny.nytaxi.external_yellow_tripdata; 27 | 28 | -- Impact of partition 29 | -- Scanning 1.6GB of data 30 | SELECT DISTINCT(VendorID) 31 | FROM taxi-rides-ny.nytaxi.yellow_tripdata_non_partitoned 32 | WHERE DATE(tpep_pickup_datetime) BETWEEN '2019-06-01' AND '2019-06-30'; 33 | 34 | -- Scanning ~106 MB of DATA 35 | SELECT DISTINCT(VendorID) 36 | FROM taxi-rides-ny.nytaxi.yellow_tripdata_partitoned 37 | WHERE DATE(tpep_pickup_datetime) BETWEEN '2019-06-01' AND '2019-06-30'; 38 | 39 | -- Let's look into the partitons 40 | SELECT table_name, partition_id, total_rows 41 | FROM `nytaxi.INFORMATION_SCHEMA.PARTITIONS` 42 | WHERE table_name = 'yellow_tripdata_partitoned' 43 | ORDER BY total_rows DESC; 44 | 45 | -- Creating a partition and cluster table 46 | CREATE OR REPLACE TABLE taxi-rides-ny.nytaxi.yellow_tripdata_partitoned_clustered 47 | PARTITION BY DATE(tpep_pickup_datetime) 48 | CLUSTER BY VendorID AS 49 | SELECT * FROM taxi-rides-ny.nytaxi.external_yellow_tripdata; 50 | 51 | -- Query scans 1.1 GB 52 | SELECT count(*) as trips 53 | FROM taxi-rides-ny.nytaxi.yellow_tripdata_partitoned 54 | WHERE DATE(tpep_pickup_datetime) BETWEEN '2019-06-01' AND '2020-12-31' 55 | AND VendorID=1; 56 | 57 | -- Query scans 864.5 MB 58 | SELECT count(*) as trips 59 | FROM taxi-rides-ny.nytaxi.yellow_tripdata_partitoned_clustered 60 | WHERE DATE(tpep_pickup_datetime) BETWEEN '2019-06-01' AND '2020-12-31' 61 | AND VendorID=1; 62 | 63 | -------------------------------------------------------------------------------- /06-streaming/python/streams-example/pyspark/producer.py: -------------------------------------------------------------------------------- 1 | import csv 2 | from time import sleep 3 | from typing import Dict 4 | from kafka import KafkaProducer 5 | 6 | from settings import BOOTSTRAP_SERVERS, INPUT_DATA_PATH, PRODUCE_TOPIC_RIDES_CSV 7 | 8 | 9 | def delivery_report(err, msg): 10 | if err is not None: 11 | print("Delivery failed for record {}: {}".format(msg.key(), err)) 12 | return 13 | print('Record {} successfully produced to {} [{}] at offset {}'.format( 14 | msg.key(), msg.topic(), msg.partition(), msg.offset())) 15 | 16 | 17 | class RideCSVProducer: 18 | def __init__(self, props: Dict): 19 | self.producer = KafkaProducer(**props) 20 | # self.producer = Producer(producer_props) 21 | 22 | @staticmethod 23 | def read_records(resource_path: str): 24 | records, ride_keys = [], [] 25 | i = 0 26 | with open(resource_path, 'r') as f: 27 | reader = csv.reader(f) 28 | header = next(reader) # skip the header 29 | for row in reader: 30 | # vendor_id, passenger_count, trip_distance, payment_type, total_amount 31 | records.append(f'{row[0]}, {row[1]}, {row[2]}, {row[3]}, {row[4]}, {row[9]}, {row[16]}') 32 | ride_keys.append(str(row[0])) 33 | i += 1 34 | if i == 5: 35 | break 36 | return zip(ride_keys, records) 37 | 38 | def publish(self, topic: str, records: [str, str]): 39 | for key_value in records: 40 | key, value = key_value 41 | 
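            # KafkaProducer.send() is asynchronous: it only appends the record to an
            # in-memory buffer and returns a future; the flush() call after this loop
            # blocks until the buffered records have actually been delivered (or failed).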
try: 42 | self.producer.send(topic=topic, key=key, value=value) 43 | print(f"Producing record for ") 44 | except KeyboardInterrupt: 45 | break 46 | except Exception as e: 47 | print(f"Exception while producing record - {value}: {e}") 48 | 49 | self.producer.flush() 50 | sleep(1) 51 | 52 | 53 | if __name__ == "__main__": 54 | config = { 55 | 'bootstrap_servers': [BOOTSTRAP_SERVERS], 56 | 'key_serializer': lambda x: x.encode('utf-8'), 57 | 'value_serializer': lambda x: x.encode('utf-8') 58 | } 59 | producer = RideCSVProducer(props=config) 60 | ride_records = producer.read_records(resource_path=INPUT_DATA_PATH) 61 | print(ride_records) 62 | producer.publish(topic=PRODUCE_TOPIC_RIDES_CSV, records=ride_records) 63 | -------------------------------------------------------------------------------- /06-streaming/python/streams-example/redpanda/producer.py: -------------------------------------------------------------------------------- 1 | import csv 2 | from time import sleep 3 | from typing import Dict 4 | from kafka import KafkaProducer 5 | 6 | from settings import BOOTSTRAP_SERVERS, INPUT_DATA_PATH, PRODUCE_TOPIC_RIDES_CSV 7 | 8 | 9 | def delivery_report(err, msg): 10 | if err is not None: 11 | print("Delivery failed for record {}: {}".format(msg.key(), err)) 12 | return 13 | print('Record {} successfully produced to {} [{}] at offset {}'.format( 14 | msg.key(), msg.topic(), msg.partition(), msg.offset())) 15 | 16 | 17 | class RideCSVProducer: 18 | def __init__(self, props: Dict): 19 | self.producer = KafkaProducer(**props) 20 | # self.producer = Producer(producer_props) 21 | 22 | @staticmethod 23 | def read_records(resource_path: str): 24 | records, ride_keys = [], [] 25 | i = 0 26 | with open(resource_path, 'r') as f: 27 | reader = csv.reader(f) 28 | header = next(reader) # skip the header 29 | for row in reader: 30 | # vendor_id, passenger_count, trip_distance, payment_type, total_amount 31 | records.append(f'{row[0]}, {row[1]}, {row[2]}, {row[3]}, {row[4]}, {row[9]}, {row[16]}') 32 | ride_keys.append(str(row[0])) 33 | i += 1 34 | if i == 5: 35 | break 36 | return zip(ride_keys, records) 37 | 38 | def publish(self, topic: str, records: [str, str]): 39 | for key_value in records: 40 | key, value = key_value 41 | try: 42 | self.producer.send(topic=topic, key=key, value=value) 43 | print(f"Producing record for ") 44 | except KeyboardInterrupt: 45 | break 46 | except Exception as e: 47 | print(f"Exception while producing record - {value}: {e}") 48 | 49 | self.producer.flush() 50 | sleep(1) 51 | 52 | 53 | if __name__ == "__main__": 54 | config = { 55 | 'bootstrap_servers': [BOOTSTRAP_SERVERS], 56 | 'key_serializer': lambda x: x.encode('utf-8'), 57 | 'value_serializer': lambda x: x.encode('utf-8') 58 | } 59 | producer = RideCSVProducer(props=config) 60 | ride_records = producer.read_records(resource_path=INPUT_DATA_PATH) 61 | print(ride_records) 62 | producer.publish(topic=PRODUCE_TOPIC_RIDES_CSV, records=ride_records) 63 | -------------------------------------------------------------------------------- /03-data-warehouse/big_query_ml.sql: -------------------------------------------------------------------------------- 1 | -- SELECT THE COLUMNS INTERESTED FOR YOU 2 | SELECT passenger_count, trip_distance, PULocationID, DOLocationID, payment_type, fare_amount, tolls_amount, tip_amount 3 | FROM `taxi-rides-ny.nytaxi.yellow_tripdata_partitoned` WHERE fare_amount != 0; 4 | 5 | -- CREATE A ML TABLE WITH APPROPRIATE TYPE 6 | CREATE OR REPLACE TABLE `taxi-rides-ny.nytaxi.yellow_tripdata_ml` ( 7 | 
`passenger_count` INTEGER, 8 | `trip_distance` FLOAT64, 9 | `PULocationID` STRING, 10 | `DOLocationID` STRING, 11 | `payment_type` STRING, 12 | `fare_amount` FLOAT64, 13 | `tolls_amount` FLOAT64, 14 | `tip_amount` FLOAT64 15 | ) AS ( 16 | SELECT passenger_count, trip_distance, cast(PULocationID AS STRING), CAST(DOLocationID AS STRING), 17 | CAST(payment_type AS STRING), fare_amount, tolls_amount, tip_amount 18 | FROM `taxi-rides-ny.nytaxi.yellow_tripdata_partitoned` WHERE fare_amount != 0 19 | ); 20 | 21 | -- CREATE MODEL WITH DEFAULT SETTING 22 | CREATE OR REPLACE MODEL `taxi-rides-ny.nytaxi.tip_model` 23 | OPTIONS 24 | (model_type='linear_reg', 25 | input_label_cols=['tip_amount'], 26 | DATA_SPLIT_METHOD='AUTO_SPLIT') AS 27 | SELECT 28 | * 29 | FROM 30 | `taxi-rides-ny.nytaxi.yellow_tripdata_ml` 31 | WHERE 32 | tip_amount IS NOT NULL; 33 | 34 | -- CHECK FEATURES 35 | SELECT * FROM ML.FEATURE_INFO(MODEL `taxi-rides-ny.nytaxi.tip_model`); 36 | 37 | -- EVALUATE THE MODEL 38 | SELECT 39 | * 40 | FROM 41 | ML.EVALUATE(MODEL `taxi-rides-ny.nytaxi.tip_model`, 42 | ( 43 | SELECT 44 | * 45 | FROM 46 | `taxi-rides-ny.nytaxi.yellow_tripdata_ml` 47 | WHERE 48 | tip_amount IS NOT NULL 49 | )); 50 | 51 | -- PREDICT THE MODEL 52 | SELECT 53 | * 54 | FROM 55 | ML.PREDICT(MODEL `taxi-rides-ny.nytaxi.tip_model`, 56 | ( 57 | SELECT 58 | * 59 | FROM 60 | `taxi-rides-ny.nytaxi.yellow_tripdata_ml` 61 | WHERE 62 | tip_amount IS NOT NULL 63 | )); 64 | 65 | -- PREDICT AND EXPLAIN 66 | SELECT 67 | * 68 | FROM 69 | ML.EXPLAIN_PREDICT(MODEL `taxi-rides-ny.nytaxi.tip_model`, 70 | ( 71 | SELECT 72 | * 73 | FROM 74 | `taxi-rides-ny.nytaxi.yellow_tripdata_ml` 75 | WHERE 76 | tip_amount IS NOT NULL 77 | ), STRUCT(3 as top_k_features)); 78 | 79 | -- HYPER PARAM TUNNING 80 | CREATE OR REPLACE MODEL `taxi-rides-ny.nytaxi.tip_hyperparam_model` 81 | OPTIONS 82 | (model_type='linear_reg', 83 | input_label_cols=['tip_amount'], 84 | DATA_SPLIT_METHOD='AUTO_SPLIT', 85 | num_trials=5, 86 | max_parallel_trials=2, 87 | l1_reg=hparam_range(0, 20), 88 | l2_reg=hparam_candidates([0, 0.1, 1, 10])) AS 89 | SELECT 90 | * 91 | FROM 92 | `taxi-rides-ny.nytaxi.yellow_tripdata_ml` 93 | WHERE 94 | tip_amount IS NOT NULL; 95 | 96 | -------------------------------------------------------------------------------- /cohorts/2024/05-batch/homework.md: -------------------------------------------------------------------------------- 1 | ## Module 5 Homework 2 | 3 | Solution: https://www.youtube.com/watch?v=YtddC7vJOgQ 4 | 5 | In this homework we'll put what we learned about Spark in practice. 6 | 7 | For this homework we will be using the FHV 2019-10 data found here. [FHV Data](https://github.com/DataTalksClub/nyc-tlc-data/releases/download/fhv/fhv_tripdata_2019-10.csv.gz) 8 | 9 | ### Question 1: 10 | 11 | **Install Spark and PySpark** 12 | 13 | - Install Spark 14 | - Run PySpark 15 | - Create a local spark session 16 | - Execute spark.version. 17 | 18 | What's the output? 19 | 20 | > [!NOTE] 21 | > To install PySpark follow this [guide](https://github.com/DataTalksClub/data-engineering-zoomcamp/blob/main/05-batch/setup/pyspark.md) 22 | 23 | ### Question 2: 24 | 25 | **FHV October 2019** 26 | 27 | Read the October 2019 FHV into a Spark Dataframe with a schema as we did in the lessons. 28 | 29 | Repartition the Dataframe to 6 partitions and save it to parquet. 30 | 31 | What is the average size of the Parquet (ending with .parquet extension) Files that were created (in MB)? Select the answer which most closely matches. 
32 | 33 | - 1MB 34 | - 6MB 35 | - 25MB 36 | - 87MB 37 | 38 | 39 | 40 | ### Question 3: 41 | 42 | **Count records** 43 | 44 | How many taxi trips were there on the 15th of October? 45 | 46 | Consider only trips that started on the 15th of October. 47 | 48 | - 108,164 49 | - 12,856 50 | - 452,470 51 | - 62,610 52 | 53 | > [!IMPORTANT] 54 | > Be aware of columns order when defining schema 55 | 56 | ### Question 4: 57 | 58 | **Longest trip for each day** 59 | 60 | What is the length of the longest trip in the dataset in hours? 61 | 62 | - 631,152.50 Hours 63 | - 243.44 Hours 64 | - 7.68 Hours 65 | - 3.32 Hours 66 | 67 | 68 | 69 | ### Question 5: 70 | 71 | **User Interface** 72 | 73 | Spark’s User Interface which shows the application's dashboard runs on which local port? 74 | 75 | - 80 76 | - 443 77 | - 4040 78 | - 8080 79 | 80 | 81 | 82 | ### Question 6: 83 | 84 | **Least frequent pickup location zone** 85 | 86 | Load the zone lookup data into a temp view in Spark
87 | [Zone Data](https://github.com/DataTalksClub/nyc-tlc-data/releases/download/misc/taxi_zone_lookup.csv) 88 | 89 | Using the zone lookup data and the FHV October 2019 data, what is the name of the LEAST frequent pickup location Zone?
90 | 91 | - East Chelsea 92 | - Jamaica Bay 93 | - Union Sq 94 | - Crown Heights North 95 | 96 | 97 | ## Submitting the solutions 98 | 99 | - Form for submitting: https://courses.datatalks.club/de-zoomcamp-2024/homework/hw5 100 | - Deadline: See the website 101 | -------------------------------------------------------------------------------- /cohorts/2025/05-batch/homework.md: -------------------------------------------------------------------------------- 1 | ## Module 5 Homework (DRAFT) 2 | 3 | Solution: https://www.youtube.com/watch?v=YtddC7vJOgQ 4 | 5 | In this homework we'll put what we learned about Spark in practice. 6 | 7 | For this homework we will be using the FHV 2019-10 data found here. [FHV Data](https://github.com/DataTalksClub/nyc-tlc-data/releases/download/fhv/fhv_tripdata_2019-10.csv.gz) 8 | 9 | ### Question 1: 10 | 11 | **Install Spark and PySpark** 12 | 13 | - Install Spark 14 | - Run PySpark 15 | - Create a local spark session 16 | - Execute spark.version. 17 | 18 | What's the output? 19 | 20 | > [!NOTE] 21 | > To install PySpark follow this [guide](https://github.com/DataTalksClub/data-engineering-zoomcamp/blob/main/05-batch/setup/pyspark.md) 22 | 23 | ### Question 2: 24 | 25 | **FHV October 2019** 26 | 27 | Read the October 2019 FHV into a Spark Dataframe with a schema as we did in the lessons. 28 | 29 | Repartition the Dataframe to 6 partitions and save it to parquet. 30 | 31 | What is the average size of the Parquet (ending with .parquet extension) Files that were created (in MB)? Select the answer which most closely matches. 32 | 33 | - 1MB 34 | - 6MB 35 | - 25MB 36 | - 87MB 37 | 38 | 39 | 40 | ### Question 3: 41 | 42 | **Count records** 43 | 44 | How many taxi trips were there on the 15th of October? 45 | 46 | Consider only trips that started on the 15th of October. 47 | 48 | - 108,164 49 | - 12,856 50 | - 452,470 51 | - 62,610 52 | 53 | > [!IMPORTANT] 54 | > Be aware of columns order when defining schema 55 | 56 | ### Question 4: 57 | 58 | **Longest trip for each day** 59 | 60 | What is the length of the longest trip in the dataset in hours? 61 | 62 | - 631,152.50 Hours 63 | - 243.44 Hours 64 | - 7.68 Hours 65 | - 3.32 Hours 66 | 67 | 68 | 69 | ### Question 5: 70 | 71 | **User Interface** 72 | 73 | Spark’s User Interface which shows the application's dashboard runs on which local port? 74 | 75 | - 80 76 | - 443 77 | - 4040 78 | - 8080 79 | 80 | 81 | 82 | ### Question 6: 83 | 84 | **Least frequent pickup location zone** 85 | 86 | Load the zone lookup data into a temp view in Spark
87 | [Zone Data](https://github.com/DataTalksClub/nyc-tlc-data/releases/download/misc/taxi_zone_lookup.csv) 88 | 89 | Using the zone lookup data and the FHV October 2019 data, what is the name of the LEAST frequent pickup location Zone?
90 | 91 | - East Chelsea 92 | - Jamaica Bay 93 | - Union Sq 94 | - Crown Heights North 95 | 96 | 97 | ## Submitting the solutions 98 | 99 | - Form for submitting: https://courses.datatalks.club/de-zoomcamp-2024/homework/hw5 100 | - Deadline: See the website 101 | -------------------------------------------------------------------------------- /06-streaming/java/kafka_examples/src/main/java/org/example/JsonKStream.java: -------------------------------------------------------------------------------- 1 | package org.example; 2 | 3 | import org.apache.kafka.clients.consumer.ConsumerConfig; 4 | import org.apache.kafka.common.serialization.Serdes; 5 | import org.apache.kafka.streams.KafkaStreams; 6 | import org.apache.kafka.streams.StreamsBuilder; 7 | import org.apache.kafka.streams.StreamsConfig; 8 | import org.apache.kafka.streams.Topology; 9 | import org.apache.kafka.streams.kstream.Consumed; 10 | import org.apache.kafka.streams.kstream.Produced; 11 | import org.example.customserdes.CustomSerdes; 12 | import org.example.data.Ride; 13 | 14 | import java.util.Properties; 15 | 16 | public class JsonKStream { 17 | private Properties props = new Properties(); 18 | 19 | public JsonKStream() { 20 | props.put(StreamsConfig.BOOTSTRAP_SERVERS_CONFIG, "pkc-75m1o.europe-west3.gcp.confluent.cloud:9092"); 21 | props.put("security.protocol", "SASL_SSL"); 22 | props.put("sasl.jaas.config", "org.apache.kafka.common.security.plain.PlainLoginModule required username='"+Secrets.KAFKA_CLUSTER_KEY+"' password='"+Secrets.KAFKA_CLUSTER_SECRET+"';"); 23 | props.put("sasl.mechanism", "PLAIN"); 24 | props.put("client.dns.lookup", "use_all_dns_ips"); 25 | props.put("session.timeout.ms", "45000"); 26 | props.put(StreamsConfig.APPLICATION_ID_CONFIG, "kafka_tutorial.kstream.count.plocation.v1"); 27 | props.put(ConsumerConfig.AUTO_OFFSET_RESET_CONFIG, "latest"); 28 | props.put(StreamsConfig.CACHE_MAX_BYTES_BUFFERING_CONFIG, 0); 29 | 30 | } 31 | 32 | public Topology createTopology() { 33 | StreamsBuilder streamsBuilder = new StreamsBuilder(); 34 | var ridesStream = streamsBuilder.stream("rides", Consumed.with(Serdes.String(), CustomSerdes.getSerde(Ride.class))); 35 | var puLocationCount = ridesStream.groupByKey().count().toStream(); 36 | puLocationCount.to("rides-pulocation-count", Produced.with(Serdes.String(), Serdes.Long())); 37 | return streamsBuilder.build(); 38 | } 39 | 40 | public void countPLocation() throws InterruptedException { 41 | var topology = createTopology(); 42 | var kStreams = new KafkaStreams(topology, props); 43 | kStreams.start(); 44 | while (kStreams.state() != KafkaStreams.State.RUNNING) { 45 | System.out.println(kStreams.state()); 46 | Thread.sleep(1000); 47 | } 48 | System.out.println(kStreams.state()); 49 | Runtime.getRuntime().addShutdownHook(new Thread(kStreams::close)); 50 | } 51 | 52 | public static void main(String[] args) throws InterruptedException { 53 | var object = new JsonKStream(); 54 | object.countPLocation(); 55 | } 56 | } 57 | -------------------------------------------------------------------------------- /01-docker-terraform/2_docker_sql/ingest_data.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # coding: utf-8 3 | 4 | import os 5 | import argparse 6 | 7 | from time import time 8 | 9 | import pandas as pd 10 | from sqlalchemy import create_engine 11 | 12 | 13 | def main(params): 14 | user = params.user 15 | password = params.password 16 | host = params.host 17 | port = params.port 18 | db = params.db 19 | 
table_name = params.table_name 20 | url = params.url 21 | 22 | # the backup files are gzipped, and it's important to keep the correct extension 23 | # for pandas to be able to open the file 24 | if url.endswith('.csv.gz'): 25 | csv_name = 'output.csv.gz' 26 | else: 27 | csv_name = 'output.csv' 28 | 29 | os.system(f"wget {url} -O {csv_name}") 30 | 31 | engine = create_engine(f'postgresql://{user}:{password}@{host}:{port}/{db}') 32 | 33 | df_iter = pd.read_csv(csv_name, iterator=True, chunksize=100000) 34 | 35 | df = next(df_iter) 36 | 37 | df.tpep_pickup_datetime = pd.to_datetime(df.tpep_pickup_datetime) 38 | df.tpep_dropoff_datetime = pd.to_datetime(df.tpep_dropoff_datetime) 39 | 40 | df.head(n=0).to_sql(name=table_name, con=engine, if_exists='replace') 41 | 42 | df.to_sql(name=table_name, con=engine, if_exists='append') 43 | 44 | 45 | while True: 46 | 47 | try: 48 | t_start = time() 49 | 50 | df = next(df_iter) 51 | 52 | df.tpep_pickup_datetime = pd.to_datetime(df.tpep_pickup_datetime) 53 | df.tpep_dropoff_datetime = pd.to_datetime(df.tpep_dropoff_datetime) 54 | 55 | df.to_sql(name=table_name, con=engine, if_exists='append') 56 | 57 | t_end = time() 58 | 59 | print('inserted another chunk, took %.3f second' % (t_end - t_start)) 60 | 61 | except StopIteration: 62 | print("Finished ingesting data into the postgres database") 63 | break 64 | 65 | if __name__ == '__main__': 66 | parser = argparse.ArgumentParser(description='Ingest CSV data to Postgres') 67 | 68 | parser.add_argument('--user', required=True, help='user name for postgres') 69 | parser.add_argument('--password', required=True, help='password for postgres') 70 | parser.add_argument('--host', required=True, help='host for postgres') 71 | parser.add_argument('--port', required=True, help='port for postgres') 72 | parser.add_argument('--db', required=True, help='database name for postgres') 73 | parser.add_argument('--table_name', required=True, help='name of the table where we will write the results to') 74 | parser.add_argument('--url', required=True, help='url of the csv file') 75 | 76 | args = parser.parse_args() 77 | 78 | main(args) 79 | -------------------------------------------------------------------------------- /06-streaming/java/kafka_examples/src/main/java/org/example/JsonConsumer.java: -------------------------------------------------------------------------------- 1 | package org.example; 2 | 3 | import org.apache.kafka.clients.consumer.ConsumerConfig; 4 | import org.apache.kafka.clients.consumer.ConsumerRecord; 5 | import org.apache.kafka.clients.consumer.KafkaConsumer; 6 | import org.apache.kafka.clients.producer.ProducerConfig; 7 | import org.example.data.Ride; 8 | 9 | import java.time.Duration; 10 | import java.time.temporal.ChronoUnit; 11 | import java.time.temporal.TemporalUnit; 12 | import java.util.List; 13 | import java.util.Properties; 14 | import io.confluent.kafka.serializers.KafkaJsonDeserializerConfig; 15 | public class JsonConsumer { 16 | 17 | private Properties props = new Properties(); 18 | private KafkaConsumer consumer; 19 | public JsonConsumer() { 20 | props.put(ConsumerConfig.BOOTSTRAP_SERVERS_CONFIG, "pkc-75m1o.europe-west3.gcp.confluent.cloud:9092"); 21 | props.put("security.protocol", "SASL_SSL"); 22 | props.put("sasl.jaas.config", "org.apache.kafka.common.security.plain.PlainLoginModule required username='"+Secrets.KAFKA_CLUSTER_KEY+"' password='"+Secrets.KAFKA_CLUSTER_SECRET+"';"); 23 | props.put("sasl.mechanism", "PLAIN"); 24 | props.put("client.dns.lookup", "use_all_dns_ips"); 25 | 
props.put("session.timeout.ms", "45000"); 26 | props.put(ConsumerConfig.KEY_DESERIALIZER_CLASS_CONFIG, "org.apache.kafka.common.serialization.StringDeserializer"); 27 | props.put(ConsumerConfig.VALUE_DESERIALIZER_CLASS_CONFIG, "io.confluent.kafka.serializers.KafkaJsonDeserializer"); 28 | props.put(ConsumerConfig.GROUP_ID_CONFIG, "kafka_tutorial_example.jsonconsumer.v2"); 29 | props.put(ConsumerConfig.AUTO_OFFSET_RESET_CONFIG, "earliest"); 30 | props.put(KafkaJsonDeserializerConfig.JSON_VALUE_TYPE, Ride.class); 31 | consumer = new KafkaConsumer(props); 32 | consumer.subscribe(List.of("rides")); 33 | 34 | } 35 | 36 | public void consumeFromKafka() { 37 | System.out.println("Consuming form kafka started"); 38 | var results = consumer.poll(Duration.of(1, ChronoUnit.SECONDS)); 39 | var i = 0; 40 | do { 41 | 42 | for(ConsumerRecord result: results) { 43 | System.out.println(result.value().DOLocationID); 44 | } 45 | results = consumer.poll(Duration.of(1, ChronoUnit.SECONDS)); 46 | System.out.println("RESULTS:::" + results.count()); 47 | i++; 48 | } 49 | while(!results.isEmpty() || i < 10); 50 | } 51 | 52 | public static void main(String[] args) { 53 | JsonConsumer jsonConsumer = new JsonConsumer(); 54 | jsonConsumer.consumeFromKafka(); 55 | } 56 | } 57 | -------------------------------------------------------------------------------- /cohorts/2023/week_6_stream_processing/producer_confluent.py: -------------------------------------------------------------------------------- 1 | from confluent_kafka import Producer 2 | 3 | import argparse 4 | import csv 5 | from typing import Dict 6 | from time import sleep 7 | 8 | from settings import CONFLUENT_CLOUD_CONFIG, \ 9 | GREEN_TAXI_TOPIC, FHV_TAXI_TOPIC, \ 10 | GREEN_TRIP_DATA_PATH, FHV_TRIP_DATA_PATH 11 | 12 | 13 | class RideCSVProducer: 14 | def __init__(self, probs: Dict, ride_type: str): 15 | 16 | self.producer = Producer(**probs) 17 | self.ride_type = ride_type 18 | 19 | def parse_row(self, row): 20 | if self.ride_type == 'green': 21 | record = f'{row[5]}, {row[6]}' # PULocationID, DOLocationID 22 | key = str(row[0]) # vendor_id 23 | elif self.ride_type == 'fhv': 24 | record = f'{row[3]}, {row[4]}' # PULocationID, DOLocationID, 25 | key = str(row[0]) # dispatching_base_num 26 | return key, record 27 | 28 | def read_records(self, resource_path: str): 29 | records, ride_keys = [], [] 30 | with open(resource_path, 'r') as f: 31 | reader = csv.reader(f) 32 | header = next(reader) # skip the header 33 | for row in reader: 34 | key, record = self.parse_row(row) 35 | ride_keys.append(key) 36 | records.append(record) 37 | return zip(ride_keys, records) 38 | 39 | def publish(self, records: [str, str], topic: str): 40 | for key_value in records: 41 | key, value = key_value 42 | try: 43 | self.producer.poll(0) 44 | self.producer.produce(topic=topic, key=key, value=value) 45 | print(f"Producing record for ") 46 | except KeyboardInterrupt: 47 | break 48 | except BufferError as bfer: 49 | self.producer.poll(0.1) 50 | except Exception as e: 51 | print(f"Exception while producing record - {value}: {e}") 52 | 53 | self.producer.flush() 54 | sleep(10) 55 | 56 | 57 | if __name__ == "__main__": 58 | parser = argparse.ArgumentParser(description='Kafka Consumer') 59 | parser.add_argument('--type', type=str, default='green') 60 | args = parser.parse_args() 61 | 62 | if args.type == 'green': 63 | kafka_topic = GREEN_TAXI_TOPIC 64 | data_path = GREEN_TRIP_DATA_PATH 65 | elif args.type == 'fhv': 66 | kafka_topic = FHV_TAXI_TOPIC 67 | data_path = FHV_TRIP_DATA_PATH 68 | 69 | 
producer = RideCSVProducer(ride_type=args.type, probs=CONFLUENT_CLOUD_CONFIG) 70 | ride_records = producer.read_records(resource_path=data_path) 71 | producer.publish(records=ride_records, topic=kafka_topic) 72 | -------------------------------------------------------------------------------- /cohorts/2023/project.md: -------------------------------------------------------------------------------- 1 | ## Course Project 2 | 3 | The goal of this project is to apply everything we learned 4 | in this course and build an end-to-end data pipeline. 5 | 6 | You will have two attempts to submit your project. If you don't have 7 | time to submit your project by the end of attempt #1 (you started the 8 | course late, you have vacation plans, life/work got in the way, etc.) 9 | or you fail your first attempt, 10 | then you will have a second chance to submit your project as attempt 11 | #2. 12 | 13 | There are only two attempts. 14 | 15 | Remember that to pass the project, you must evaluate 3 peers. If you don't do that, 16 | your project can't be considered complete. 17 | 18 | To find the projects assigned to you, use the peer review assignments link 19 | and find your hash in the first column. You will see three rows: you need to evaluate 20 | each of these projects. For each project, you need to submit the form once, 21 | so in total, you will make three submissions. 22 | 23 | 24 | ### Submitting 25 | 26 | #### Project Attempt #1 27 | 28 | Project: 29 | 30 | * Form: https://forms.gle/zTJiVYSmCgsENj6y8 31 | * Deadline: 10 April, 22:00 CET 32 | 33 | Peer reviewing: 34 | 35 | * Peer review assignments: [link](https://docs.google.com/spreadsheets/d/e/2PACX-1vRYQ0A9C7AkRK-YPSFhqaRMmuPR97QPfl2PjI8n11l5jntc6YMHIJXVVS0GQNqAYIGwzyevyManDB08/pubhtml?gid=0&single=true) ("project-01" sheet) 36 | * Form: https://forms.gle/1bxmgR8yPwV359zb7 37 | * Deadline: 17 April, 22:00 CET 38 | 39 | Project feedback: [link](https://docs.google.com/spreadsheets/d/e/2PACX-1vQuMt9m1XlPrCACqnsFTXTV_KGiSnsl9UjL7kdTMsLJ8DLu3jNJlPzoUKG6baxc8APeEQ8RaSP1U2VX/pubhtml?gid=27207346&single=true) ("project-01" sheet) 40 | 41 | #### Project Attempt #2 42 | 43 | Project: 44 | 45 | * Form: https://forms.gle/gCXUSYBm1KgMKXVm8 46 | * Deadline: 4 May, 22:00 CET 47 | 48 | Peer reviewing: 49 | 50 | * Peer review assignments: [link](https://docs.google.com/spreadsheets/d/e/2PACX-1vRYQ0A9C7AkRK-YPSFhqaRMmuPR97QPfl2PjI8n11l5jntc6YMHIJXVVS0GQNqAYIGwzyevyManDB08/pubhtml?gid=303437788&single=true) ("project-02" sheet) 51 | * Form: https://forms.gle/2x5MT4xxczR8isy37 52 | * Deadline: 11 May, 22:00 CET 53 | 54 | Project feedback: [link](https://docs.google.com/spreadsheets/d/e/2PACX-1vQuMt9m1XlPrCACqnsFTXTV_KGiSnsl9UjL7kdTMsLJ8DLu3jNJlPzoUKG6baxc8APeEQ8RaSP1U2VX/pubhtml?gid=246029638&single=true) 55 | 56 | ### Evaluation criteria 57 | 58 | See [here](../../week_7_project/README.md) 59 | 60 | 61 | ### Misc 62 | 63 | To get the hash for your project, use this function to hash your email: 64 | 65 | ```python 66 | from hashlib import sha1 67 | 68 | def compute_hash(email): 69 | return sha1(email.lower().encode('utf-8')).hexdigest() 70 | ``` 71 | 72 | Or use [this website](http://www.sha1-online.com/). 
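For reference, a minimal usage sketch of the hashing helper above. The email address below is a made-up placeholder — substitute the address you registered with:

```python
from hashlib import sha1

def compute_hash(email):
    return sha1(email.lower().encode('utf-8')).hexdigest()

# Placeholder address for illustration only.
print(compute_hash('Student.Name@example.com'))
# Prints a 40-character hex string; look it up in the first column
# of the peer review assignments sheet to find your assigned projects.
```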
73 | -------------------------------------------------------------------------------- /06-streaming/python/redpanda_example/docker-compose.yaml: -------------------------------------------------------------------------------- 1 | version: '3.7' 2 | services: 3 | # Redpanda cluster 4 | redpanda-1: 5 | image: docker.redpanda.com/redpandadata/redpanda:v23.2.26 6 | container_name: redpanda-1 7 | command: 8 | - redpanda 9 | - start 10 | - --smp 11 | - '1' 12 | - --reserve-memory 13 | - 0M 14 | - --overprovisioned 15 | - --node-id 16 | - '1' 17 | - --kafka-addr 18 | - PLAINTEXT://0.0.0.0:29092,OUTSIDE://0.0.0.0:9092 19 | - --advertise-kafka-addr 20 | - PLAINTEXT://redpanda-1:29092,OUTSIDE://localhost:9092 21 | - --pandaproxy-addr 22 | - PLAINTEXT://0.0.0.0:28082,OUTSIDE://0.0.0.0:8082 23 | - --advertise-pandaproxy-addr 24 | - PLAINTEXT://redpanda-1:28082,OUTSIDE://localhost:8082 25 | - --rpc-addr 26 | - 0.0.0.0:33145 27 | - --advertise-rpc-addr 28 | - redpanda-1:33145 29 | ports: 30 | # - 8081:8081 31 | - 8082:8082 32 | - 9092:9092 33 | - 9644:9644 34 | - 28082:28082 35 | - 29092:29092 36 | 37 | # Want a two node Redpanda cluster? Uncomment this block :) 38 | # redpanda-2: 39 | # image: docker.redpanda.com/redpandadata/redpanda:v23.1.1 40 | # container_name: redpanda-2 41 | # command: 42 | # - redpanda 43 | # - start 44 | # - --smp 45 | # - '1' 46 | # - --reserve-memory 47 | # - 0M 48 | # - --overprovisioned 49 | # - --node-id 50 | # - '2' 51 | # - --seeds 52 | # - redpanda-1:33145 53 | # - --kafka-addr 54 | # - PLAINTEXT://0.0.0.0:29093,OUTSIDE://0.0.0.0:9093 55 | # - --advertise-kafka-addr 56 | # - PLAINTEXT://redpanda-2:29093,OUTSIDE://localhost:9093 57 | # - --pandaproxy-addr 58 | # - PLAINTEXT://0.0.0.0:28083,OUTSIDE://0.0.0.0:8083 59 | # - --advertise-pandaproxy-addr 60 | # - PLAINTEXT://redpanda-2:28083,OUTSIDE://localhost:8083 61 | # - --rpc-addr 62 | # - 0.0.0.0:33146 63 | # - --advertise-rpc-addr 64 | # - redpanda-2:33146 65 | # ports: 66 | # - 8083:8083 67 | # - 9093:9093 68 | 69 | redpanda-console: 70 | image: docker.redpanda.com/redpandadata/console:v2.2.2 71 | container_name: redpanda-console 72 | entrypoint: /bin/sh 73 | command: -c "echo \"$$CONSOLE_CONFIG_FILE\" > /tmp/config.yml; /app/console" 74 | environment: 75 | CONFIG_FILEPATH: /tmp/config.yml 76 | CONSOLE_CONFIG_FILE: | 77 | kafka: 78 | brokers: ["redpanda-1:29092"] 79 | schemaRegistry: 80 | enabled: false 81 | redpanda: 82 | adminApi: 83 | enabled: true 84 | urls: ["http://redpanda-1:9644"] 85 | connect: 86 | enabled: false 87 | ports: 88 | - 8080:8080 89 | depends_on: 90 | - redpanda-1 91 | -------------------------------------------------------------------------------- /cohorts/2022/week_2_data_ingestion/airflow/docs/1_concepts.md: -------------------------------------------------------------------------------- 1 | ## Airflow concepts 2 | 3 | 4 | ### Airflow architecture 5 | ![](arch-diag-airflow.png) 6 | 7 | Ref: https://airflow.apache.org/docs/apache-airflow/stable/concepts/overview.html 8 | 9 | * **Web server**: 10 | GUI to inspect, trigger and debug the behaviour of DAGs and tasks. 11 | Available at http://localhost:8080. 12 | 13 | * **Scheduler**: 14 | Responsible for scheduling jobs. Handles both triggering & scheduled workflows, submits Tasks to the executor to run, monitors all tasks and DAGs, and 15 | then triggers the task instances once their dependencies are complete. 16 | 17 | * **Worker**: 18 | This component executes the tasks given by the scheduler. 
19 | 20 | * **Metadata database (postgres)**: 21 | Backend to the Airflow environment. Used by the scheduler, executor and webserver to store state. 22 | 23 | * **Other components** (seen in docker-compose services): 24 | * `redis`: Message broker that forwards messages from scheduler to worker. 25 | * `flower`: The flower app for monitoring the environment. It is available at http://localhost:5555. 26 | * `airflow-init`: initialization service (customized as per this design) 27 | 28 | All these services allow you to run Airflow with CeleryExecutor. 29 | For more information, see [Architecture Overview](https://airflow.apache.org/docs/apache-airflow/stable/concepts/overview.html). 30 | 31 | 32 | ### Project Structure: 33 | 34 | * `./dags` - `DAG_FOLDER` for DAG files (use `./dags_local` for the local ingestion DAG) 35 | * `./logs` - contains logs from task execution and scheduler. 36 | * `./plugins` - for custom plugins 37 | 38 | 39 | ### Workflow components 40 | 41 | * `DAG`: Directed acyclic graph, specifies the dependencies between a set of tasks with explicit execution order, and has a beginning as well as an end. (Hence, “acyclic”) 42 | * `DAG Structure`: DAG Definition, Tasks (eg. Operators), Task Dependencies (control flow: `>>` or `<<` ) 43 | 44 | * `Task`: a defined unit of work (aka, operators in Airflow). The Tasks themselves describe what to do, be it fetching data, running analysis, triggering other systems, or more. 45 | * Common Types: Operators (used in this workshop), Sensors, TaskFlow decorators 46 | * Sub-classes of Airflow's BaseOperator 47 | 48 | * `DAG Run`: individual execution/run of a DAG 49 | * scheduled or triggered 50 | 51 | * `Task Instance`: an individual run of a single task. Task instances also have an indicative state, which could be “running”, “success”, “failed”, “skipped”, “up for retry”, etc. 52 | * Ideally, a task should flow from `none`, to `scheduled`, to `queued`, to `running`, and finally to `success`. 53 | 54 | 55 | ### References 56 | 57 | https://airflow.apache.org/docs/apache-airflow/stable/concepts/dags.html 58 | 59 | https://airflow.apache.org/docs/apache-airflow/stable/concepts/tasks.html 60 | 61 | -------------------------------------------------------------------------------- /02-workflow-orchestration/homework.md: -------------------------------------------------------------------------------- 1 | ## Module 2 Homework 2 | 3 | ### Assignment 4 | 5 | So far in the course, we processed data for the year 2019 and 2020. Your task is to extend the existing flows to include data for the year 2021. 6 | 7 | ![homework datasets](images/homework.png) 8 | 9 | As a hint, Kestra makes that process really easy: 10 | 1. You can leverage the backfill functionality in the [scheduled flow](../flows/07_gcp_taxi_scheduled.yaml) to backfill the data for the year 2021. Just make sure to select the time period for which data exists i.e. from `2021-01-01` to `2021-07-31`. Also, make sure to do the same for both `yellow` and `green` taxi data (select the right service in the `taxi` input). 11 | 2. Alternatively, run the flow manually for each of the seven months of 2021 for both `yellow` and `green` taxi data. Challenge for you: find out how to loop over the combination of Year-Month and `taxi`-type using `ForEach` task which triggers the flow for each combination using a `Subflow` task. 12 | 13 | ### Quiz Questions 14 | 15 | Complete the Quiz shown below. 
It’s a set of 6 multiple-choice questions to test your understanding of workflow orchestration, Kestra and ETL pipelines for data lakes and warehouses. 16 | 17 | 1) Within the execution for `Yellow` Taxi data for the year `2020` and month `12`: what is the uncompressed file size (i.e. the output file `yellow_tripdata_2020-12.csv` of the `extract` task)? 18 | - 128.3 MB 19 | - 134.5 MB 20 | - 364.7 MB 21 | - 692.6 MB 22 | 23 | 2) What is the value of the variable `file` when the inputs `taxi` is set to `green`, `year` is set to `2020`, and `month` is set to `04` during execution? 24 | - `{{inputs.taxi}}_tripdata_{{inputs.year}}-{{inputs.month}}.csv` 25 | - `green_tripdata_2020-04.csv` 26 | - `green_tripdata_04_2020.csv` 27 | - `green_tripdata_2020.csv` 28 | 29 | 3) How many rows are there for the `Yellow` Taxi data for the year 2020? 30 | - 13,537.299 31 | - 24,648,499 32 | - 18,324,219 33 | - 29,430,127 34 | 35 | 4) How many rows are there for the `Green` Taxi data for the year 2020? 36 | - 5,327,301 37 | - 936,199 38 | - 1,734,051 39 | - 1,342,034 40 | 41 | 5) Using dbt on the `Green` and `Yellow` Taxi data for the year 2020, how many rows are there in the `fact_trips` table? 42 | - 198 43 | - 165 44 | - 151 45 | - 203 46 | 47 | 6) How would you configure the timezone to New York in a Schedule trigger? 48 | - Add a `timezone` property set to `EST` in the `Schedule` trigger configuration 49 | - Add a `timezone` property set to `America/New_York` in the `Schedule` trigger configuration 50 | - Add a `timezone` property set to `UTC-5` in the `Schedule` trigger configuration 51 | - Add a `location` property set to `New_York` in the `Schedule` trigger configuration 52 | 53 | 54 | ## Submitting the solutions 55 | 56 | * Form for submitting: https://courses.datatalks.club/de-zoomcamp-2025/homework/hw2 57 | * Check the link above to see the due date -------------------------------------------------------------------------------- /06-streaming/java/kafka_examples/src/main/java/org/example/JsonKStreamWindow.java: -------------------------------------------------------------------------------- 1 | package org.example; 2 | 3 | import org.apache.kafka.clients.consumer.ConsumerConfig; 4 | import org.apache.kafka.common.serialization.Serdes; 5 | import org.apache.kafka.streams.KafkaStreams; 6 | import org.apache.kafka.streams.StreamsBuilder; 7 | import org.apache.kafka.streams.StreamsConfig; 8 | import org.apache.kafka.streams.Topology; 9 | import org.apache.kafka.streams.kstream.Consumed; 10 | import org.apache.kafka.streams.kstream.Produced; 11 | import org.apache.kafka.streams.kstream.TimeWindows; 12 | import org.apache.kafka.streams.kstream.WindowedSerdes; 13 | import org.example.customserdes.CustomSerdes; 14 | import org.example.data.Ride; 15 | 16 | import java.time.Duration; 17 | import java.time.temporal.ChronoUnit; 18 | import java.util.Properties; 19 | 20 | public class JsonKStreamWindow { 21 | private Properties props = new Properties(); 22 | 23 | public JsonKStreamWindow() { 24 | props.put(StreamsConfig.BOOTSTRAP_SERVERS_CONFIG, "pkc-75m1o.europe-west3.gcp.confluent.cloud:9092"); 25 | props.put("security.protocol", "SASL_SSL"); 26 | props.put("sasl.jaas.config", "org.apache.kafka.common.security.plain.PlainLoginModule required username='"+Secrets.KAFKA_CLUSTER_KEY+"' password='"+Secrets.KAFKA_CLUSTER_SECRET+"';"); 27 | props.put("sasl.mechanism", "PLAIN"); 28 | props.put("client.dns.lookup", "use_all_dns_ips"); 29 | props.put("session.timeout.ms", "45000"); 30 | 
props.put(StreamsConfig.APPLICATION_ID_CONFIG, "kafka_tutorial.kstream.count.plocation.v1"); 31 | props.put(ConsumerConfig.AUTO_OFFSET_RESET_CONFIG, "latest"); 32 | props.put(StreamsConfig.CACHE_MAX_BYTES_BUFFERING_CONFIG, 0); 33 | 34 | } 35 | 36 | public Topology createTopology() { 37 | StreamsBuilder streamsBuilder = new StreamsBuilder(); 38 | var ridesStream = streamsBuilder.stream("rides", Consumed.with(Serdes.String(), CustomSerdes.getSerde(Ride.class))); 39 | var puLocationCount = ridesStream.groupByKey() 40 | .windowedBy(TimeWindows.ofSizeAndGrace(Duration.ofSeconds(10), Duration.ofSeconds(5))) 41 | .count().toStream(); 42 | var windowSerde = WindowedSerdes.timeWindowedSerdeFrom(String.class, 10*1000); 43 | 44 | puLocationCount.to("rides-pulocation-window-count", Produced.with(windowSerde, Serdes.Long())); 45 | return streamsBuilder.build(); 46 | } 47 | 48 | public void countPLocationWindowed() { 49 | var topology = createTopology(); 50 | var kStreams = new KafkaStreams(topology, props); 51 | kStreams.start(); 52 | 53 | Runtime.getRuntime().addShutdownHook(new Thread(kStreams::close)); 54 | } 55 | 56 | public static void main(String[] args) { 57 | var object = new JsonKStreamWindow(); 58 | object.countPLocationWindowed(); 59 | } 60 | } 61 | -------------------------------------------------------------------------------- /04-analytics-engineering/taxi_rides_ny/analyses/hack-load-data.sql: -------------------------------------------------------------------------------- 1 | -- MAKE SURE YOU REPLACE taxi-rides-ny-339813-412521 WITH THE NAME OF YOUR DATASET! 2 | -- When you run the query, only run 5 of the ALTER TABLE statements at one time (by highlighting only 5). 3 | -- Otherwise BigQuery will say too many alterations to the table are being made. 
4 | 5 | CREATE TABLE `taxi-rides-ny-339813-412521.trips_data_all.green_tripdata` as 6 | SELECT * FROM `bigquery-public-data.new_york_taxi_trips.tlc_green_trips_2019`; 7 | 8 | 9 | CREATE TABLE `taxi-rides-ny-339813-412521.trips_data_all.yellow_tripdata` as 10 | SELECT * FROM `bigquery-public-data.new_york_taxi_trips.tlc_yellow_trips_2019`; 11 | 12 | insert into `taxi-rides-ny-339813-412521.trips_data_all.green_tripdata` 13 | SELECT * FROM `bigquery-public-data.new_york_taxi_trips.tlc_green_trips_2020` ; 14 | 15 | 16 | insert into `taxi-rides-ny-339813-412521.trips_data_all.yellow_tripdata` 17 | SELECT * FROM `bigquery-public-data.new_york_taxi_trips.tlc_yellow_trips_2020`; 18 | 19 | -- Fixes yellow table schema 20 | ALTER TABLE `taxi-rides-ny-339813-412521.trips_data_all.yellow_tripdata` 21 | RENAME COLUMN vendor_id TO VendorID; 22 | ALTER TABLE `taxi-rides-ny-339813-412521.trips_data_all.yellow_tripdata` 23 | RENAME COLUMN pickup_datetime TO tpep_pickup_datetime; 24 | ALTER TABLE `taxi-rides-ny-339813-412521.trips_data_all.yellow_tripdata` 25 | RENAME COLUMN dropoff_datetime TO tpep_dropoff_datetime; 26 | ALTER TABLE `taxi-rides-ny-339813-412521.trips_data_all.yellow_tripdata` 27 | RENAME COLUMN rate_code TO RatecodeID; 28 | ALTER TABLE `taxi-rides-ny-339813-412521.trips_data_all.yellow_tripdata` 29 | RENAME COLUMN imp_surcharge TO improvement_surcharge; 30 | ALTER TABLE `taxi-rides-ny-339813-412521.trips_data_all.yellow_tripdata` 31 | RENAME COLUMN pickup_location_id TO PULocationID; 32 | ALTER TABLE `taxi-rides-ny-339813-412521.trips_data_all.yellow_tripdata` 33 | RENAME COLUMN dropoff_location_id TO DOLocationID; 34 | 35 | -- Fixes green table schema 36 | ALTER TABLE `taxi-rides-ny-339813-412521.trips_data_all.green_tripdata` 37 | RENAME COLUMN vendor_id TO VendorID; 38 | ALTER TABLE `taxi-rides-ny-339813-412521.trips_data_all.green_tripdata` 39 | RENAME COLUMN pickup_datetime TO lpep_pickup_datetime; 40 | ALTER TABLE `taxi-rides-ny-339813-412521.trips_data_all.green_tripdata` 41 | RENAME COLUMN dropoff_datetime TO lpep_dropoff_datetime; 42 | ALTER TABLE `taxi-rides-ny-339813-412521.trips_data_all.green_tripdata` 43 | RENAME COLUMN rate_code TO RatecodeID; 44 | ALTER TABLE `taxi-rides-ny-339813-412521.trips_data_all.green_tripdata` 45 | RENAME COLUMN imp_surcharge TO improvement_surcharge; 46 | ALTER TABLE `taxi-rides-ny-339813-412521.trips_data_all.green_tripdata` 47 | RENAME COLUMN pickup_location_id TO PULocationID; 48 | ALTER TABLE `taxi-rides-ny-339813-412521.trips_data_all.green_tripdata` 49 | RENAME COLUMN dropoff_location_id TO DOLocationID; 50 | -------------------------------------------------------------------------------- /01-docker-terraform/1_terraform_gcp/1_terraform_overview.md: -------------------------------------------------------------------------------- 1 | ## Terraform Overview 2 | 3 | [Video](https://www.youtube.com/watch?v=18jIzE41fJ4&list=PL3MmuxUbc_hJed7dXYoJw8DoCuVHhGEQb&index=2) 4 | 5 | ### Concepts 6 | 7 | #### Introduction 8 | 9 | 1. What is [Terraform](https://www.terraform.io)? 10 | * open-source tool by [HashiCorp](https://www.hashicorp.com), used for provisioning infrastructure resources 11 | * supports DevOps best practices for change management 12 | * Managing configuration files in source control to maintain an ideal provisioning state 13 | for testing and production environments 14 | 2. What is IaC? 
15 | * Infrastructure-as-Code 16 | * build, change, and manage your infrastructure in a safe, consistent, and repeatable way 17 | by defining resource configurations that you can version, reuse, and share. 18 | 3. Some advantages 19 | * Infrastructure lifecycle management 20 | * Version control commits 21 | * Very useful for stack-based deployments, and with cloud providers such as AWS, GCP, Azure, K8S… 22 | * State-based approach to track resource changes throughout deployments 23 | 24 | 25 | #### Files 26 | 27 | * `main.tf` 28 | * `variables.tf` 29 | * Optional: `resources.tf`, `output.tf` 30 | * `.tfstate` 31 | 32 | #### Declarations 33 | * `terraform`: configure basic Terraform settings to provision your infrastructure 34 | * `required_version`: minimum Terraform version to apply to your configuration 35 | * `backend`: stores Terraform's "state" snapshots, to map real-world resources to your configuration. 36 | * `local`: stores state file locally as `terraform.tfstate` 37 | * `required_providers`: specifies the providers required by the current module 38 | * `provider`: 39 | * adds a set of resource types and/or data sources that Terraform can manage 40 | * The Terraform Registry is the main directory of publicly available providers from most major infrastructure platforms. 41 | * `resource` 42 | * blocks to define components of your infrastructure 43 | * Project modules/resources: google_storage_bucket, google_bigquery_dataset, google_bigquery_table 44 | * `variable` & `locals` 45 | * runtime arguments and constants 46 | 47 | 48 | #### Execution steps 49 | 1. `terraform init`: 50 | * Initializes & configures the backend, installs plugins/providers, & checks out an existing configuration from a version control 51 | 2. `terraform plan`: 52 | * Matches/previews local changes against a remote state, and proposes an Execution Plan. 53 | 3. `terraform apply`: 54 | * Asks for approval to the proposed plan, and applies changes to cloud 55 | 4. 
`terraform destroy` 56 | * Removes your stack from the Cloud 57 | 58 | 59 | ### Terraform Workshop to create GCP Infra 60 | Continue [here](./terraform): `week_1_basics_n_setup/1_terraform_gcp/terraform` 61 | 62 | 63 | ### References 64 | https://learn.hashicorp.com/collections/terraform/gcp-get-started 65 | -------------------------------------------------------------------------------- /01-docker-terraform/2_docker_sql/data-loading-parquet.py: -------------------------------------------------------------------------------- 1 | #Cleaned up version of data-loading.ipynb 2 | import argparse, os, sys 3 | from time import time 4 | import pandas as pd 5 | import pyarrow.parquet as pq 6 | from sqlalchemy import create_engine 7 | 8 | 9 | def main(params): 10 | user = params.user 11 | password = params.password 12 | host = params.host 13 | port = params.port 14 | db = params.db 15 | tb = params.tb 16 | url = params.url 17 | 18 | # Get the name of the file from url 19 | file_name = url.rsplit('/', 1)[-1].strip() 20 | print(f'Downloading {file_name} ...') 21 | # Download file from url 22 | os.system(f'curl {url.strip()} -o {file_name}') 23 | print('\n') 24 | 25 | # Create SQL engine 26 | engine = create_engine(f'postgresql://{user}:{password}@{host}:{port}/{db}') 27 | 28 | # Read file based on csv or parquet 29 | if '.csv' in file_name: 30 | df = pd.read_csv(file_name, nrows=10) 31 | df_iter = pd.read_csv(file_name, iterator=True, chunksize=100000) 32 | elif '.parquet' in file_name: 33 | file = pq.ParquetFile(file_name) 34 | df = next(file.iter_batches(batch_size=10)).to_pandas() 35 | df_iter = file.iter_batches(batch_size=100000) 36 | else: 37 | print('Error. Only .csv or .parquet files allowed.') 38 | sys.exit() 39 | 40 | 41 | # Create the table 42 | df.head(0).to_sql(name=tb, con=engine, if_exists='replace') 43 | 44 | 45 | # Insert values 46 | t_start = time() 47 | count = 0 48 | for batch in df_iter: 49 | count+=1 50 | 51 | if '.parquet' in file_name: 52 | batch_df = batch.to_pandas() 53 | else: 54 | batch_df = batch 55 | 56 | print(f'inserting batch {count}...') 57 | 58 | b_start = time() 59 | batch_df.to_sql(name=tb, con=engine, if_exists='append') 60 | b_end = time() 61 | 62 | print(f'inserted! time taken {b_end-b_start:10.3f} seconds.\n') 63 | 64 | t_end = time() 65 | print(f'Completed! Total time taken was {t_end-t_start:10.3f} seconds for {count} batches.') 66 | 67 | 68 | 69 | if __name__ == '__main__': 70 | #Parsing arguments 71 | parser = argparse.ArgumentParser(description='Loading data from .paraquet file link to a Postgres datebase.') 72 | 73 | parser.add_argument('--user', help='Username for Postgres.') 74 | parser.add_argument('--password', help='Password to the username for Postgres.') 75 | parser.add_argument('--host', help='Hostname for Postgres.') 76 | parser.add_argument('--port', help='Port for Postgres connection.') 77 | parser.add_argument('--db', help='Databse name for Postgres') 78 | parser.add_argument('--tb', help='Destination table name for Postgres.') 79 | parser.add_argument('--url', help='URL for .paraquet file.') 80 | 81 | args = parser.parse_args() 82 | main(args) 83 | 84 | 85 | 86 | 87 | -------------------------------------------------------------------------------- /cohorts/2023/week_1_docker_sql/homework.md: -------------------------------------------------------------------------------- 1 | ## Week 1 Homework 2 | 3 | In this homework we'll prepare the environment 4 | and practice with Docker and SQL 5 | 6 | 7 | ## Question 1. 
Knowing docker tags 8 | 9 | Run the command to get information on Docker 10 | 11 | ```docker --help``` 12 | 13 | Now run the command to get help on the "docker build" command 14 | 15 | Which tag has the following text? - *Write the image ID to the file* 16 | 17 | - `--imageid string` 18 | - `--iidfile string` 19 | - `--idimage string` 20 | - `--idfile string` 21 | 22 | 23 | ## Question 2. Understanding docker first run 24 | 25 | Run docker with the python:3.9 image in an interactive mode and the entrypoint of bash. 26 | Now check the python modules that are installed ( use pip list). 27 | How many python packages/modules are installed? 28 | 29 | - 1 30 | - 6 31 | - 3 32 | - 7 33 | 34 | # Prepare Postgres 35 | 36 | Run Postgres and load data as shown in the videos 37 | We'll use the green taxi trips from January 2019: 38 | 39 | ```wget https://github.com/DataTalksClub/nyc-tlc-data/releases/download/green/green_tripdata_2019-01.csv.gz``` 40 | 41 | You will also need the dataset with zones: 42 | 43 | ```wget https://s3.amazonaws.com/nyc-tlc/misc/taxi+_zone_lookup.csv``` 44 | 45 | Download this data and put it into Postgres (with jupyter notebooks or with a pipeline) 46 | 47 | 48 | ## Question 3. Count records 49 | 50 | How many taxi trips were totally made on January 15? 51 | 52 | Tip: started and finished on 2019-01-15. 53 | 54 | Remember that `lpep_pickup_datetime` and `lpep_dropoff_datetime` columns are in the format timestamp (date and hour+min+sec) and not in date. 55 | 56 | - 20689 57 | - 20530 58 | - 17630 59 | - 21090 60 | 61 | ## Question 4. Largest trip for each day 62 | 63 | Which was the day with the largest trip distance 64 | Use the pick up time for your calculations. 65 | 66 | - 2019-01-18 67 | - 2019-01-28 68 | - 2019-01-15 69 | - 2019-01-10 70 | 71 | ## Question 5. The number of passengers 72 | 73 | In 2019-01-01 how many trips had 2 and 3 passengers? 74 | 75 | - 2: 1282 ; 3: 266 76 | - 2: 1532 ; 3: 126 77 | - 2: 1282 ; 3: 254 78 | - 2: 1282 ; 3: 274 79 | 80 | 81 | ## Question 6. Largest tip 82 | 83 | For the passengers picked up in the Astoria Zone which was the drop off zone that had the largest tip? 84 | We want the name of the zone, not the id. 85 | 86 | Note: it's not a typo, it's `tip` , not `trip` 87 | 88 | - Central Park 89 | - Jamaica 90 | - South Ozone Park 91 | - Long Island City/Queens Plaza 92 | 93 | 94 | ## Submitting the solutions 95 | 96 | * Form for submitting: [form](https://forms.gle/EjphSkR1b3nsdojv7) 97 | * You can submit your homework multiple times. In this case, only the last submission will be used. 98 | 99 | Deadline: 30 January (Monday), 22:00 CET 100 | 101 | 102 | ## Solution 103 | 104 | See here: https://www.youtube.com/watch?v=KIh_9tZiroA 105 | -------------------------------------------------------------------------------- /cohorts/2022/week_1_basics_n_setup/homework.md: -------------------------------------------------------------------------------- 1 | ## Week 1 Homework 2 | 3 | In this homework we'll prepare the environment 4 | and practice with terraform and SQL 5 | 6 | 7 | ## Question 1. Google Cloud SDK 8 | 9 | Install Google Cloud SDK. What's the version you have? 10 | 11 | To get the version, run `gcloud --version` 12 | 13 | ## Google Cloud account 14 | 15 | Create an account in Google Cloud and create a project. 16 | 17 | 18 | ## Question 2. 
Terraform 19 | 20 | Now install terraform and go to the terraform directory (`week_1_basics_n_setup/1_terraform_gcp/terraform`) 21 | 22 | After that, run 23 | 24 | * `terraform init` 25 | * `terraform plan` 26 | * `terraform apply` 27 | 28 | Apply the plan and copy the output (after running `apply`) to the form. 29 | 30 | It should be the entire output - from the moment you typed `terraform init` to the very end. 31 | 32 | ## Prepare Postgres 33 | 34 | Run Postgres and load data as shown in the videos 35 | 36 | We'll use the yellow taxi trips from January 2021: 37 | 38 | ```bash 39 | wget https://s3.amazonaws.com/nyc-tlc/trip+data/yellow_tripdata_2021-01.csv 40 | ``` 41 | 42 | You will also need the dataset with zones: 43 | 44 | ```bash 45 | wget https://s3.amazonaws.com/nyc-tlc/misc/taxi+_zone_lookup.csv 46 | ``` 47 | 48 | Download this data and put it to Postgres 49 | 50 | ## Question 3. Count records 51 | 52 | How many taxi trips were there on January 15? 53 | 54 | Consider only trips that started on January 15. 55 | 56 | 57 | ## Question 4. Largest tip for each day 58 | 59 | Find the largest tip for each day. 60 | On which day it was the largest tip in January? 61 | 62 | Use the pick up time for your calculations. 63 | 64 | (note: it's not a typo, it's "tip", not "trip") 65 | 66 | 67 | ## Question 5. Most popular destination 68 | 69 | What was the most popular destination for passengers picked up 70 | in central park on January 14? 71 | 72 | Use the pick up time for your calculations. 73 | 74 | Enter the zone name (not id). If the zone name is unknown (missing), write "Unknown" 75 | 76 | 77 | ## Question 6. Most expensive locations 78 | 79 | What's the pickup-dropoff pair with the largest 80 | average price for a ride (calculated based on `total_amount`)? 81 | 82 | Enter two zone names separated by a slash 83 | 84 | For example: 85 | 86 | "Jamaica Bay / Clinton East" 87 | 88 | If any of the zone names are unknown (missing), write "Unknown". For example, "Unknown / Clinton East". 89 | 90 | 91 | ## Submitting the solutions 92 | 93 | * Form for submitting: https://forms.gle/yGQrkgRdVbiFs8Vd7 94 | * You can submit your homework multiple times. In this case, only the last submission will be used. 95 | 96 | Deadline: 26 January (Wednesday), 22:00 CET 97 | 98 | 99 | ## Solution 100 | 101 | Here is the solution to questions 3-6: [video](https://www.youtube.com/watch?v=HxHqH2ARfxM&list=PL3MmuxUbc_hJed7dXYoJw8DoCuVHhGEQb) 102 | 103 | -------------------------------------------------------------------------------- /cohorts/2023/week_5_batch_processing/homework.md: -------------------------------------------------------------------------------- 1 | ## Week 5 Homework 2 | 3 | In this homework we'll put what we learned about Spark in practice. 4 | 5 | For this homework we will be using the FHVHV 2021-06 data found here. [FHVHV Data](https://github.com/DataTalksClub/nyc-tlc-data/releases/download/fhvhv/fhvhv_tripdata_2021-06.csv.gz ) 6 | 7 | 8 | ### Question 1: 9 | 10 | **Install Spark and PySpark** 11 | 12 | - Install Spark 13 | - Run PySpark 14 | - Create a local spark session 15 | - Execute spark.version. 16 | 17 | What's the output? 18 | - 3.3.2 19 | - 2.1.4 20 | - 1.2.3 21 | - 5.4 22 |
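For reference, a minimal PySpark sketch for this question — creating a local session and printing its version. The app name and the `local[*]` master are arbitrary choices for the sketch, not requirements of the homework:

```python
from pyspark.sql import SparkSession

# Build (or reuse) a local Spark session.
spark = (
    SparkSession.builder
    .master("local[*]")
    .appName("de_zoomcamp_week5_homework")
    .getOrCreate()
)

# Prints the Spark version string asked about in Question 1.
print(spark.version)
```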

23 | 24 | 25 | ### Question 2: 26 | 27 | **FHVHV June 2021** 28 | 29 | Read it with Spark using the same schema as we did in the lessons.
30 | We will use this dataset for all the remaining questions.
31 | Repartition it to 12 partitions and save it to parquet.
32 | What is the average size of the Parquet files (ending with the .parquet extension) that were created, in MB? Select the answer that most closely matches.
33 | 34 | 35 | - 2MB 36 | - 24MB 37 | - 100MB 38 | - 250MB 39 |
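For reference, a minimal PySpark sketch of this step. The local file name, the output path, and the exact field list are assumptions — the homework expects the `StructType` defined in the lessons:

```python
from pyspark.sql import SparkSession, types

spark = SparkSession.builder.master("local[*]").appName("hw5_q2").getOrCreate()

# Illustrative schema -- reuse the StructType from the lessons; the field list
# below is our assumption of what the FHVHV CSV contains.
schema = types.StructType([
    types.StructField("hvfhs_license_num", types.StringType(), True),
    types.StructField("dispatching_base_num", types.StringType(), True),
    types.StructField("pickup_datetime", types.TimestampType(), True),
    types.StructField("dropoff_datetime", types.TimestampType(), True),
    types.StructField("PULocationID", types.IntegerType(), True),
    types.StructField("DOLocationID", types.IntegerType(), True),
    types.StructField("SR_Flag", types.StringType(), True),
])

df = (
    spark.read
    .option("header", "true")
    .schema(schema)
    .csv("fhvhv_tripdata_2021-06.csv.gz")  # assumed download location
)

# Repartition to 12 and write as Parquet; the answer comes from inspecting
# the sizes of the resulting part-*.parquet files on disk.
df.repartition(12).write.parquet("data/pq/fhvhv/2021/06/", mode="overwrite")
```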

40 | 41 | 42 | ### Question 3: 43 | 44 | **Count records** 45 | 46 | How many taxi trips were there on June 15?

47 | Consider only trips that started on June 15.
48 | 49 | - 308,164 50 | - 12,856 51 | - 452,470 52 | - 50,982 53 |
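A sketch of one way to get this count, assuming the Parquet output path used in the Question 2 sketch above:

```python
from pyspark.sql import SparkSession, functions as F

spark = SparkSession.builder.master("local[*]").appName("hw5_q3").getOrCreate()

# Assumed path -- wherever Question 2's Parquet output was written.
df = spark.read.parquet("data/pq/fhvhv/2021/06/")

# Keep only trips whose pickup date is June 15 and count them.
count_june_15 = (
    df
    .withColumn("pickup_date", F.to_date(F.col("pickup_datetime")))
    .filter(F.col("pickup_date") == "2021-06-15")
    .count()
)
print(count_june_15)
```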

54 | 55 | 56 | ### Question 4: 57 | 58 | **Longest trip for each day** 59 | 60 | Now calculate the duration for each trip.
61 | How long was the longest trip in Hours?
62 | 63 | - 66.87 Hours 64 | - 243.44 Hours 65 | - 7.68 Hours 66 | - 3.32 Hours 67 |
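A sketch of the duration calculation, again assuming the Parquet path from the Question 2 sketch. Casting a timestamp to `long` yields epoch seconds, so dividing the difference by 3600 gives hours:

```python
from pyspark.sql import SparkSession, functions as F

spark = SparkSession.builder.master("local[*]").appName("hw5_q4").getOrCreate()
df = spark.read.parquet("data/pq/fhvhv/2021/06/")  # assumed path

# Compute each trip's duration in hours and take the maximum.
longest = (
    df
    .withColumn(
        "duration_hours",
        (F.col("dropoff_datetime").cast("long") - F.col("pickup_datetime").cast("long")) / 3600
    )
    .agg(F.max("duration_hours").alias("max_duration_hours"))
)
longest.show()
```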

68 | 69 | ### Question 5: 70 | 71 | **User Interface** 72 | 73 | Spark’s User Interface, which shows the application's dashboard, runs on which local port?
74 | 75 | - 80 76 | - 443 77 | - 4040 78 | - 8080 79 |

80 | 81 | 82 | ### Question 6: 83 | 84 | **Most frequent pickup location zone** 85 | 86 | Load the zone lookup data into a temp view in Spark:
87 | [Zone Data](https://github.com/DataTalksClub/nyc-tlc-data/releases/download/misc/taxi_zone_lookup.csv)
88 | 89 | Using the zone lookup data and the FHVHV June 2021 data, what is the name of the most frequent pickup location zone?
90 | 91 | - East Chelsea 92 | - Astoria 93 | - Union Sq 94 | - Crown Heights North 95 |
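A sketch using temp views and Spark SQL, assuming the zone lookup CSV was downloaded to `taxi_zone_lookup.csv` and reusing the assumed FHVHV Parquet path from the Question 2 sketch; `LocationID` and `Zone` are column names from the zone lookup file:

```python
from pyspark.sql import SparkSession

spark = SparkSession.builder.master("local[*]").appName("hw5_q6").getOrCreate()

trips = spark.read.parquet("data/pq/fhvhv/2021/06/")                     # assumed path
zones = spark.read.option("header", "true").csv("taxi_zone_lookup.csv")  # assumed path

trips.createOrReplaceTempView("fhvhv_trips")
zones.createOrReplaceTempView("zones")

# Join pickups to zone names and rank by frequency; the top row answers Question 6.
spark.sql("""
    SELECT z.Zone AS pickup_zone, COUNT(1) AS num_trips
    FROM fhvhv_trips t
    JOIN zones z
      ON t.PULocationID = CAST(z.LocationID AS INT)
    GROUP BY z.Zone
    ORDER BY num_trips DESC
    LIMIT 5
""").show(truncate=False)
```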

96 | 97 | 98 | 99 | 100 | ## Submitting the solutions 101 | 102 | * Form for submitting: https://forms.gle/EcSvDs6vp64gcGuD8 103 | * You can submit your homework multiple times. In this case, only the last submission will be used. 104 | 105 | Deadline: 06 March (Monday), 22:00 CET 106 | 107 | 108 | ## Solution 109 | 110 | * Video: https://www.youtube.com/watch?v=ldoDIT32pJs 111 | * Answers: 112 | * Question 1: 3.3.2 113 | * Question 2: 24MB 114 | * Question 3: 452,470 115 | * Question 4: 66.87 Hours 116 | * Question 5: 4040 117 | * Question 6: Crown Heights North 118 | -------------------------------------------------------------------------------- /06-streaming/java/kafka_examples/src/main/java/org/example/JsonProducer.java: -------------------------------------------------------------------------------- 1 | package org.example; 2 | 3 | import com.opencsv.CSVReader; 4 | import com.opencsv.exceptions.CsvException; 5 | import org.apache.kafka.clients.producer.*; 6 | import org.apache.kafka.streams.StreamsConfig; 7 | import org.example.data.Ride; 8 | 9 | import java.io.FileReader; 10 | import java.io.IOException; 11 | import java.time.LocalDateTime; 12 | import java.util.List; 13 | import java.util.Properties; 14 | import java.util.concurrent.ExecutionException; 15 | import java.util.stream.Collectors; 16 | 17 | public class JsonProducer { 18 | private Properties props = new Properties(); 19 | public JsonProducer() { 20 | props.put(StreamsConfig.BOOTSTRAP_SERVERS_CONFIG, "pkc-75m1o.europe-west3.gcp.confluent.cloud:9092"); 21 | props.put("security.protocol", "SASL_SSL"); 22 | props.put("sasl.jaas.config", "org.apache.kafka.common.security.plain.PlainLoginModule required username='"+Secrets.KAFKA_CLUSTER_KEY+"' password='"+Secrets.KAFKA_CLUSTER_SECRET+"';"); 23 | props.put("sasl.mechanism", "PLAIN"); 24 | props.put("client.dns.lookup", "use_all_dns_ips"); 25 | props.put("session.timeout.ms", "45000"); 26 | props.put(ProducerConfig.ACKS_CONFIG, "all"); 27 | props.put(ProducerConfig.KEY_SERIALIZER_CLASS_CONFIG, "org.apache.kafka.common.serialization.StringSerializer"); 28 | props.put(ProducerConfig.VALUE_SERIALIZER_CLASS_CONFIG, "io.confluent.kafka.serializers.KafkaJsonSerializer"); 29 | } 30 | 31 | public List getRides() throws IOException, CsvException { 32 | var ridesStream = this.getClass().getResource("/rides.csv"); 33 | var reader = new CSVReader(new FileReader(ridesStream.getFile())); 34 | reader.skip(1); 35 | return reader.readAll().stream().map(arr -> new Ride(arr)) 36 | .collect(Collectors.toList()); 37 | 38 | } 39 | 40 | public void publishRides(List rides) throws ExecutionException, InterruptedException { 41 | KafkaProducer kafkaProducer = new KafkaProducer(props); 42 | for(Ride ride: rides) { 43 | ride.tpep_pickup_datetime = LocalDateTime.now().minusMinutes(20); 44 | ride.tpep_dropoff_datetime = LocalDateTime.now(); 45 | var record = kafkaProducer.send(new ProducerRecord<>("rides", String.valueOf(ride.DOLocationID), ride), (metadata, exception) -> { 46 | if(exception != null) { 47 | System.out.println(exception.getMessage()); 48 | } 49 | }); 50 | System.out.println(record.get().offset()); 51 | System.out.println(ride.DOLocationID); 52 | Thread.sleep(500); 53 | } 54 | } 55 | 56 | public static void main(String[] args) throws IOException, CsvException, ExecutionException, InterruptedException { 57 | var producer = new JsonProducer(); 58 | var rides = producer.getRides(); 59 | producer.publishRides(rides); 60 | } 61 | } 
-------------------------------------------------------------------------------- /06-streaming/python/docker/kafka/docker-compose.yml: -------------------------------------------------------------------------------- 1 | version: '3.6' 2 | networks: 3 | default: 4 | name: kafka-spark-network 5 | external: true 6 | services: 7 | broker: 8 | image: confluentinc/cp-kafka:7.2.0 9 | hostname: broker 10 | container_name: broker 11 | depends_on: 12 | - zookeeper 13 | ports: 14 | - '9092:9092' 15 | environment: 16 | KAFKA_BROKER_ID: 1 17 | KAFKA_ZOOKEEPER_CONNECT: 'zookeeper:2181' 18 | KAFKA_LISTENER_SECURITY_PROTOCOL_MAP: PLAINTEXT:PLAINTEXT,PLAINTEXT_HOST:PLAINTEXT 19 | KAFKA_LISTENERS: PLAINTEXT://broker:29092,PLAINTEXT_HOST://broker:9092 20 | KAFKA_ADVERTISED_LISTENERS: PLAINTEXT://broker:29092,PLAINTEXT_HOST://localhost:9092 21 | KAFKA_INTER_BROKER_LISTENER_NAME: PLAINTEXT 22 | KAFKA_OFFSETS_TOPIC_REPLICATION_FACTOR: 1 23 | KAFKA_GROUP_INITIAL_REBALANCE_DELAY_MS: 0 24 | KAFKA_TRANSACTION_STATE_LOG_MIN_ISR: 1 25 | KAFKA_TRANSACTION_STATE_LOG_REPLICATION_FACTOR: 1 26 | schema-registry: 27 | image: confluentinc/cp-schema-registry:7.2.0 28 | hostname: schema-registry 29 | container_name: schema-registry 30 | depends_on: 31 | - zookeeper 32 | - broker 33 | ports: 34 | - "8081:8081" 35 | environment: 36 | # SCHEMA_REGISTRY_KAFKASTORE_CONNECTION_URL: "zookeeper:2181" #(depreciated) 37 | SCHEMA_REGISTRY_KAFKASTORE_BOOTSTRAP_SERVERS: "broker:29092" 38 | SCHEMA_REGISTRY_HOST_NAME: "localhost" 39 | SCHEMA_REGISTRY_LISTENERS: "http://0.0.0.0:8081" #(default: http://0.0.0.0:8081) 40 | zookeeper: 41 | image: confluentinc/cp-zookeeper:7.2.0 42 | hostname: zookeeper 43 | container_name: zookeeper 44 | ports: 45 | - '2181:2181' 46 | environment: 47 | ZOOKEEPER_CLIENT_PORT: 2181 48 | ZOOKEEPER_TICK_TIME: 2000 49 | control-center: 50 | image: confluentinc/cp-enterprise-control-center:7.2.0 51 | hostname: control-center 52 | container_name: control-center 53 | depends_on: 54 | - zookeeper 55 | - broker 56 | - schema-registry 57 | ports: 58 | - "9021:9021" 59 | environment: 60 | CONTROL_CENTER_BOOTSTRAP_SERVERS: 'broker:29092' 61 | CONTROL_CENTER_ZOOKEEPER_CONNECT: 'zookeeper:2181' 62 | CONTROL_CENTER_SCHEMA_REGISTRY_URL: "http://localhost:8081" 63 | CONTROL_CENTER_REPLICATION_FACTOR: 1 64 | CONTROL_CENTER_INTERNAL_TOPICS_PARTITIONS: 1 65 | CONTROL_CENTER_MONITORING_INTERCEPTOR_TOPIC_PARTITIONS: 1 66 | CONFLUENT_METRICS_TOPIC_REPLICATION: 1 67 | PORT: 9021 68 | 69 | kafka-rest: 70 | image: confluentinc/cp-kafka-rest:7.2.0 71 | hostname: kafka-rest 72 | ports: 73 | - "8082:8082" 74 | depends_on: 75 | - schema-registry 76 | - broker 77 | environment: 78 | KAFKA_REST_BOOTSTRAP_SERVERS: 'broker:29092' 79 | KAFKA_REST_ZOOKEEPER_CONNECT: 'zookeeper:2181' 80 | KAFKA_REST_SCHEMA_REGISTRY_URL: 'http://localhost:8081' 81 | KAFKA_REST_HOST_NAME: localhost 82 | KAFKA_REST_LISTENERS: 'http://0.0.0.0:8082' -------------------------------------------------------------------------------- /05-batch/code/06_spark_sql.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # coding: utf-8 3 | 4 | import argparse 5 | 6 | import pyspark 7 | from pyspark.sql import SparkSession 8 | from pyspark.sql import functions as F 9 | 10 | 11 | parser = argparse.ArgumentParser() 12 | 13 | parser.add_argument('--input_green', required=True) 14 | parser.add_argument('--input_yellow', required=True) 15 | parser.add_argument('--output', required=True) 16 | 17 | args = parser.parse_args() 18 | 
19 | input_green = args.input_green 20 | input_yellow = args.input_yellow 21 | output = args.output 22 | 23 | 24 | spark = SparkSession.builder \ 25 | .appName('test') \ 26 | .getOrCreate() 27 | 28 | df_green = spark.read.parquet(input_green) 29 | 30 | df_green = df_green \ 31 | .withColumnRenamed('lpep_pickup_datetime', 'pickup_datetime') \ 32 | .withColumnRenamed('lpep_dropoff_datetime', 'dropoff_datetime') 33 | 34 | df_yellow = spark.read.parquet(input_yellow) 35 | 36 | 37 | df_yellow = df_yellow \ 38 | .withColumnRenamed('tpep_pickup_datetime', 'pickup_datetime') \ 39 | .withColumnRenamed('tpep_dropoff_datetime', 'dropoff_datetime') 40 | 41 | 42 | common_colums = [ 43 | 'VendorID', 44 | 'pickup_datetime', 45 | 'dropoff_datetime', 46 | 'store_and_fwd_flag', 47 | 'RatecodeID', 48 | 'PULocationID', 49 | 'DOLocationID', 50 | 'passenger_count', 51 | 'trip_distance', 52 | 'fare_amount', 53 | 'extra', 54 | 'mta_tax', 55 | 'tip_amount', 56 | 'tolls_amount', 57 | 'improvement_surcharge', 58 | 'total_amount', 59 | 'payment_type', 60 | 'congestion_surcharge' 61 | ] 62 | 63 | 64 | 65 | df_green_sel = df_green \ 66 | .select(common_colums) \ 67 | .withColumn('service_type', F.lit('green')) 68 | 69 | df_yellow_sel = df_yellow \ 70 | .select(common_colums) \ 71 | .withColumn('service_type', F.lit('yellow')) 72 | 73 | 74 | df_trips_data = df_green_sel.unionAll(df_yellow_sel) 75 | 76 | df_trips_data.registerTempTable('trips_data') 77 | 78 | 79 | df_result = spark.sql(""" 80 | SELECT 81 | -- Reveneue grouping 82 | PULocationID AS revenue_zone, 83 | date_trunc('month', pickup_datetime) AS revenue_month, 84 | service_type, 85 | 86 | -- Revenue calculation 87 | SUM(fare_amount) AS revenue_monthly_fare, 88 | SUM(extra) AS revenue_monthly_extra, 89 | SUM(mta_tax) AS revenue_monthly_mta_tax, 90 | SUM(tip_amount) AS revenue_monthly_tip_amount, 91 | SUM(tolls_amount) AS revenue_monthly_tolls_amount, 92 | SUM(improvement_surcharge) AS revenue_monthly_improvement_surcharge, 93 | SUM(total_amount) AS revenue_monthly_total_amount, 94 | SUM(congestion_surcharge) AS revenue_monthly_congestion_surcharge, 95 | 96 | -- Additional calculations 97 | AVG(passenger_count) AS avg_montly_passenger_count, 98 | AVG(trip_distance) AS avg_montly_trip_distance 99 | FROM 100 | trips_data 101 | GROUP BY 102 | 1, 2, 3 103 | """) 104 | 105 | 106 | df_result.coalesce(1) \ 107 | .write.parquet(output, mode='overwrite') 108 | 109 | 110 | 111 | 112 | -------------------------------------------------------------------------------- /06-streaming/java/kafka_examples/src/test/java/org/example/JsonKStreamJoinsTest.java: -------------------------------------------------------------------------------- 1 | package org.example; 2 | 3 | import org.apache.kafka.clients.consumer.ConsumerConfig; 4 | import org.apache.kafka.common.internals.Topic; 5 | import org.apache.kafka.common.serialization.Serdes; 6 | import org.apache.kafka.streams.*; 7 | import org.example.customserdes.CustomSerdes; 8 | import org.example.data.PickupLocation; 9 | import org.example.data.Ride; 10 | import org.example.data.VendorInfo; 11 | import org.example.helper.DataGeneratorHelper; 12 | import org.junit.jupiter.api.AfterAll; 13 | import org.junit.jupiter.api.BeforeEach; 14 | import org.junit.jupiter.api.Test; 15 | 16 | import javax.xml.crypto.Data; 17 | import java.util.Properties; 18 | 19 | import static org.junit.jupiter.api.Assertions.*; 20 | 21 | class JsonKStreamJoinsTest { 22 | private Properties props = new Properties(); 23 | private static TopologyTestDriver 
testDriver; 24 | private TestInputTopic ridesTopic; 25 | private TestInputTopic pickLocationTopic; 26 | private TestOutputTopic outputTopic; 27 | 28 | private Topology topology = new JsonKStreamJoins().createTopology(); 29 | @BeforeEach 30 | public void setup() { 31 | props = new Properties(); 32 | props.setProperty(StreamsConfig.APPLICATION_ID_CONFIG, "testing_count_application"); 33 | props.setProperty(StreamsConfig.BOOTSTRAP_SERVERS_CONFIG, "dummy:1234"); 34 | if (testDriver != null) { 35 | testDriver.close(); 36 | } 37 | testDriver = new TopologyTestDriver(topology, props); 38 | ridesTopic = testDriver.createInputTopic(Topics.INPUT_RIDE_TOPIC, Serdes.String().serializer(), CustomSerdes.getSerde(Ride.class).serializer()); 39 | pickLocationTopic = testDriver.createInputTopic(Topics.INPUT_RIDE_LOCATION_TOPIC, Serdes.String().serializer(), CustomSerdes.getSerde(PickupLocation.class).serializer()); 40 | outputTopic = testDriver.createOutputTopic(Topics.OUTPUT_TOPIC, Serdes.String().deserializer(), CustomSerdes.getSerde(VendorInfo.class).deserializer()); 41 | } 42 | 43 | @Test 44 | public void testIfJoinWorksOnSameDropOffPickupLocationId() { 45 | Ride ride = DataGeneratorHelper.generateRide(); 46 | PickupLocation pickupLocation = DataGeneratorHelper.generatePickUpLocation(ride.DOLocationID); 47 | ridesTopic.pipeInput(String.valueOf(ride.DOLocationID), ride); 48 | pickLocationTopic.pipeInput(String.valueOf(pickupLocation.PULocationID), pickupLocation); 49 | 50 | assertEquals(outputTopic.getQueueSize(), 1); 51 | var expected = new VendorInfo(ride.VendorID, pickupLocation.PULocationID, pickupLocation.tpep_pickup_datetime, ride.tpep_dropoff_datetime); 52 | var result = outputTopic.readKeyValue(); 53 | assertEquals(result.key, String.valueOf(ride.DOLocationID)); 54 | assertEquals(result.value.VendorID, expected.VendorID); 55 | assertEquals(result.value.pickupTime, expected.pickupTime); 56 | } 57 | 58 | 59 | @AfterAll 60 | public static void shutdown() { 61 | testDriver.close(); 62 | } 63 | } --------------------------------------------------------------------------------